In [1]:
# Change the kernel's working directory to the project folder; later cells read
# files through relative paths (e.g. "data/final/master_crop.csv").
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google Drive
# mount — confirm the drive is mounted before running this cell.
%cd /content/drive/MyDrive/Agriculture App/agriculture-predictor-planner

/content/drive/MyDrive/Agriculture App/agriculture-predictor-planner


In [16]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Lift pandas' display truncation: None means "no limit", so every column and
# every row of a displayed frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None


In [4]:
# Read the master crop table into a DataFrame (path is relative to the current
# working directory).
master_csv_path = "data/final/master_crop.csv"
df = pd.read_csv(master_csv_path)

In [None]:
# Column names, dtypes and non-null counts. DataFrame.info() writes its report
# to stdout itself and returns None, so it must not be wrapped in print()
# (that would append a stray "None" line to the output).
df.info(verbose=True)

In [None]:
# Preview the first 20 rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of a wrapped plain-text dump.
df.head(20)

In [19]:
# NOTE(review): IGNORECASE is not referenced anywhere in this cell; it looks like
# an accidental auto-import. Kept in case another cell relies on it — remove if not.
from re import IGNORECASE

# Separate the feature columns from the regression target ("yield").
X = df.drop(columns="yield")
y = df["yield"]

# 3. Train/test split (80/20; fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Build preprocessing pipeline
cat_cols = ["district", "crop", "month"]
num_cols = ["tmax", "tmin", "precip", "wind"]

# Only the columns listed above reach the model: ColumnTransformer's default
# remainder="drop" discards any other column present in X.
preprocessor = ColumnTransformer([
    # handle_unknown="ignore": a category that appears only in the test split
    # (or only in a CV validation fold) is encoded as all zeros instead of
    # raising at transform/predict time. Output is unchanged whenever every
    # category was seen during fit. With drop="first", scikit-learn emits a
    # warning when this happens because the unknown category becomes
    # indistinguishable from the dropped one.
    ("cat", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"), cat_cols),
    ("num", StandardScaler(), num_cols),
])

# 5. Full pipeline with XGBoost
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        tree_method="hist",       # fast on larger data
    )),
])

# 6. Fit & evaluate
# The preprocessor is fitted on the training split only (inside the pipeline),
# then applied to the held-out split at predict time.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)    # returns MSE
rmse = np.sqrt(mse)                         # take square root for RMSE
print("XGB RMSE:", rmse)
print("XGB R²:", r2_score(y_test, y_pred))

XGB RMSE: 611.7942043797642
XGB R²: 0.596051389312682


In [None]:
# Search space for the randomized search. The "model__" prefix routes each key
# to the pipeline step named "model" (the XGBRegressor).
param_dist = {
  "model__n_estimators": [100, 200, 400],
  "model__max_depth": [4, 6, 8],
  "model__learning_rate": [0.01, 0.05, 0.1],
  "model__subsample": [0.6, 0.8, 1.0],
  "model__colsample_bytree": [0.6, 0.8, 1.0],
}

# 20 random draws from the space, each scored with 3-fold CV on negated RMSE
# (scikit-learn maximizes scores, hence the "neg_" scorer).
search = RandomizedSearchCV(
  pipeline, param_dist,
  n_iter=20, cv=3,
  scoring="neg_root_mean_squared_error",
  n_jobs=-1, random_state=42,
  verbose=1
)

# Fit on the training split only, with no extra fit parameters:
#  * Pipeline forwards "model__*" fit params to the final step untouched, so an
#    eval_set built from raw X_test would bypass the preprocessor and hand
#    XGBoost un-encoded columns that do not match the training matrix.
#  * Early stopping on (X_test, y_test) would leak the hold-out set into model
#    selection and make any later test score optimistic.
#  * early_stopping_rounds is no longer accepted by XGBRegressor.fit() in
#    recent XGBoost releases (it is a constructor argument there).
# The number of trees is tuned through "model__n_estimators" instead.
search.fit(X_train, y_train)

print("Best params:", search.best_params_)
# best_score_ holds the negated RMSE; flip the sign to report a positive RMSE.
print("Best CV RMSE:", -search.best_score_)