In [1]:
%cd /content/drive/MyDrive/Agriculture App/agriculture-predictor-planner

/content/drive/MyDrive/Agriculture App/agriculture-predictor-planner


In [16]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, train_test_split
from xgboost.callback import EarlyStopping
from re import IGNORECASE


In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [4]:
#Load the data
df = pd.read_csv("data/final/master_crop.csv")

In [None]:
print(df.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415573 entries, 0 to 415572
Data columns (total 9 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   district  415573 non-null  object 
 1   year      415573 non-null  int64  
 2   yield     415573 non-null  float64
 3   crop      415573 non-null  object 
 4   month     399891 non-null  float64
 5   tmax      399891 non-null  float64
 6   tmin      399891 non-null  float64
 7   precip    399891 non-null  float64
 8   wind      399891 non-null  float64
dtypes: float64(6), int64(1), object(2)
memory usage: 28.5+ MB
None


In [None]:
print(df.head(20))

In [None]:
print(df['yield'].mean())

853.6796118652677


1. **Model training method #1 (XGB RMSE: 595.6751212599659,
XGB R²: 0.617056832224721)**

In [None]:
# 1. Define the features and target
X = df.drop(columns="yield")
y = df["yield"]

# 2. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 3. Define preprocessor (one-hot for categories, scale numeric)
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", sparse_output=False), ["district", "crop", "month"]),
    ("num", StandardScaler(), ["tmax", "tmin", "precip", "wind", "year"])])


# 4. Build pipeline with XGBRegressor
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        tree_method="hist",
        enable_categorical=True
    ))
])


# 5. Fit & evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)    # returns MSE
rmse = np.sqrt(mse)                         # take square root for RMSE
print("XGB RMSE:", rmse)
print("XGB R²:", r2_score(y_test, y_pred))

XGB RMSE: 571.1065163177335
XGB R²: 0.6479943513291873


2**. 🎯🎯🎯🎯🎯Model training method #2(more accurate): (XGB RMSE: 315.43857550884024,
XGB R²: 0.8926146573109333)**

The jump from RMSE ≈ 595 → 315 and R² ≈ 0.62 → 0.89 comes almost entirely from letting XGBoost handle categorical features natively (instead of One-Hot) and from adding your Year column as a continuous predictor.

*Native categorical support*

When you convert District and Crop to pandas category dtype and run

model = XGBRegressor(..., enable_categorical=True)
model.fit(X_train, y_train)

XGBoost learns optimal splits on each category directly—no sparse, high-dimensional one-hot vectors. This usually gives a big boost in both speed and accuracy, especially with moderately high-cardinality features.

*Year as a numeric trend feature*

By keeping Year as a real-valued input (instead of dropping it or one-hot encoding it), the model can pick up on secular trends (e.g. gradual improvements in yields over time). That temporal signal often explains a large chunk of variance.

In [6]:
# 1. Define the features and target
X = df.drop(columns="yield")
y = df["yield"]

# 2. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 3. Convert District/Crop into categories
for df in (X_train, X_test):
    df["district"] = df["district"].astype("category")
    df["crop"]     = df["crop"].astype("category")


# 4. Setup XGBRegressor
model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    tree_method="hist",
    enable_categorical=True
)

# Add early‐stopping via callback
model.set_params(callbacks=[EarlyStopping(rounds=20, save_best=True)])

# 5. Fit directly on the DataFrame with categories
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

# 6. Evaluate
y_pred = model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)    # returns MSE
rmse = np.sqrt(mse)                         # take square root for RMSE
print("XGB RMSE:", rmse)
print("XGB R²:", r2_score(y_test, y_pred))

XGB RMSE: 315.43857550884024
XGB R²: 0.8926146573109333


In [None]:
# 1. Split features & target
X = df.drop(columns="yield")
y = df["yield"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Define preprocessor (one-hot for categories, scale numeric)
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", sparse_output=False), ["district", "crop", "month"]),
    ("num", StandardScaler(), ["tmax", "tmin", "precip", "wind", "year"])
])

# 3. Build pipeline with XGBRegressor (no categorical flag needed)
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", XGBRegressor(
        tree_method="hist",
        random_state=42
    ))
])

param_dist = {
    "model__n_estimators": [50, 100],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [3, 5],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.8],
    "model__gamma": [0, 1],
    "model__reg_alpha": [0, 0.1],
    "model__reg_lambda": [1, 5],
}

# 5. Setup and run RandomizedSearchCV
rs = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring="neg_root_mean_squared_error",
    cv=3,
    error_score="raise",
    verbose=2,
    random_state=42,
    n_jobs=-1
)
rs.fit(X_train, y_train)

# 6. Evaluate on the hold-out test set
best = rs.best_estimator_
y_pred = best.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)
print(f"Best CV RMSE: {(-rs.best_score_):.2f}")
print("Test   RMSE:", rmse)
print("Test   R²:  ", r2)
print("Best Hyperparameters:", rs.best_params_)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best CV RMSE: 656.02
Test   RMSE: 647.6145260923869
Test   R²:   0.5473645498032093
Best Hyperparameters: {'model__subsample': 0.8, 'model__reg_lambda': 5, 'model__reg_alpha': 0, 'model__n_estimators': 100, 'model__max_depth': 5, 'model__learning_rate': 0.1, 'model__gamma': 1, 'model__colsample_bytree': 0.8}


In [None]:
# 1. Split features & target
X = df.drop(columns="yield")
y = df["yield"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Define preprocessor (one-hot for categories, scale numeric)
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", sparse_output=False), ["district", "crop", "month"]),
    ("num", StandardScaler(), ["tmax", "tmin", "precip", "wind", "year"])
])

# 3. Build pipeline with XGBRegressor (no categorical flag needed)
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", XGBRegressor(
        tree_method="hist",
        random_state=42
    ))
])

param_dist = {
    "model__n_estimators":      [50, 100, 200, 300],
    "model__learning_rate":     [0.01, 0.05, 0.1, 0.2],
    "model__max_depth":         [3, 5, 7, 9],
    "model__subsample":         [0.6, 0.8, 1.0],
    "model__colsample_bytree":  [0.6, 0.8, 1.0],
    "model__gamma":             [0, 1, 5],
    "model__reg_alpha":         [0, 0.1, 1],
    "model__reg_lambda":        [1, 5, 10],
}

# 5. Setup and run RandomizedSearchCV
rs = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=3,
    error_score="raise",
    verbose=2,
    random_state=42,
    n_jobs=-1
)
rs.fit(X_train, y_train)

# 6. Evaluate on the hold-out test set
best = rs.best_estimator_
y_pred = best.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)
print(f"Best CV RMSE: {(-rs.best_score_):.2f}")
print("Test   RMSE:", rmse)
print("Test   R²:  ", r2)
print("Best Hyperparameters:", rs.best_params_)


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best CV RMSE: 421.04
Test   RMSE: 399.60420128048327
Test   R²:   0.8276642046240366
Best Hyperparameters: {'model__subsample': 0.6, 'model__reg_lambda': 10, 'model__reg_alpha': 0, 'model__n_estimators': 300, 'model__max_depth': 9, 'model__learning_rate': 0.2, 'model__gamma': 1, 'model__colsample_bytree': 1.0}


In [12]:
def predict_top5_crops(model, X_train, district, month, tmax, tmin, precip, wind, year=2022):
    """
    Predict top 5 crops to sow in a given district and month based on weather.

    Parameters:
        model: Trained XGBRegressor
        X_train: Training DataFrame (to extract category mappings + column order)
        district (str): District name
        month (int): Sowing month (1–12)
        tmax (float): Max temperature
        tmin (float): Min temperature
        precip (float): Precipitation
        wind (float): Wind speed
        year (int): Year value (must be same type used during training)

    Returns:
        DataFrame: Top 5 crops and their predicted yields
    """
    import pandas as pd

    # 1. Get the trained categories and feature column order
    crop_cats = X_train["crop"].cat.categories
    district_cats = X_train["district"].cat.categories
    feature_order = X_train.columns.tolist()

    # 2. Build prediction DataFrame (one row per crop)
    all_crops = crop_cats.tolist()
    predict_df = pd.DataFrame([
        {
            "district": district,
            "year": year,
            "crop": crop,
            "month": month,
            "tmax": tmax,
            "tmin": tmin,
            "precip": precip,
            "wind": wind
        }
        for crop in all_crops
    ])

    # 3. Convert to categorical using training mappings
    predict_df["district"] = pd.Categorical(predict_df["district"], categories=district_cats)
    predict_df["crop"] = pd.Categorical(predict_df["crop"], categories=crop_cats)

    # 4. Reorder columns to match training
    predict_df = predict_df[feature_order]

    # 5. Predict
    predict_df["predicted_yield"] = model.predict(predict_df)

    # 6. Return top 5 crops
    top5 = (
        predict_df
        .sort_values("predicted_yield", ascending=False)
        .head(5)
        .reset_index(drop=True)
    )

    return top5[["crop", "predicted_yield"]]


In [None]:
top5 = predict_top5_crops(
    model=model,
    X_train=X_train,
    district="Dehradun",
    month=6,
    tmax=34.0,
    tmin=22.0,
    precip=200.0,
    wind=2.5,
    year=2022
)

print(top5)


In [17]:
# Save the trained model
joblib.dump(model, "models/xgb_crop_yield_model.pkl")

# Save the crop/district categories
joblib.dump(X_train["crop"].cat.categories.tolist(), "models/crop_categories.pkl")
joblib.dump(X_train["district"].cat.categories.tolist(), "models/district_categories.pkl")
joblib.dump(X_train.columns.tolist(), "models/feature_order.pkl")  # To preserve column order


['models/feature_order.pkl']