# CELL 1: INSTALL LIBRARIES

In [1]:
# We need to install XGBoost and the library for TabNet.
# The 'q' flag makes the output less verbose.

!pip install -q xgboost
!pip install -q pytorch-tabnet

print("✅ Libraries installed.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Libraries installed.


# CELL 2: IMPORTS, DRIVE MOUNT, AND CONFIGURATION

In [5]:
import pandas as pd
import numpy as np
import os
import joblib
from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import xgboost as xgb
from pytorch_tabnet.tab_model import TabNetRegressor

import torch

# --- Mount Google Drive so the project folder below is reachable ---
drive.mount('/content/drive')

# --- Project paths: everything lives under PROJECT_DIR ---
PROJECT_DIR = "/content/drive/My Drive/Pungda"
DATASET_PATH, SCALERS_PATH, MODEL_SAVE_DIR = (
    os.path.join(PROJECT_DIR, leaf)
    for leaf in ("training_dataset_final.csv", "scalers.joblib", "models")
)
# Create the model output folder up front; no error if it already exists.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# --- Compute device: 'cuda' when PyTorch reports a GPU, otherwise 'cpu' ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Using device: {device}")
print(f"Model save directory: {MODEL_SAVE_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cpu
Model save directory: /content/drive/My Drive/Pungda/models


# CELL 3: DATA LOADING AND PREPARATION

In [6]:
# Read the final training table and the scaler bundle stored at SCALERS_PATH,
# so the features are preprocessed with already-fitted scalers rather than
# ones fitted again in this notebook.

print("Loading data and scalers...")
df = pd.read_csv(DATASET_PATH)
scalers = joblib.load(SCALERS_PATH)
# The bundle is indexed by the keys 'req', 'emb' and 'yield'.
req_scaler, emb_scaler, yield_scaler = (
    scalers[key] for key in ('req', 'emb', 'yield')
)
print("Data and scalers loaded successfully.")

# --- Feature and target columns ---
# IMPORTANT: these names must match the columns of the training dataset.
requirement_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
embedding_cols = [f'A{idx:02d}' for idx in range(64)]  # 'A00' ... 'A63'
feature_cols = [*requirement_cols, *embedding_cols]
target_col = 'yield'

# --- Scale the features with the loaded scalers ---
# Only .transform() is called (never .fit_transform()), so the scalers are
# not re-fitted on this data.
df[requirement_cols] = req_scaler.transform(df[requirement_cols])
df[embedding_cols] = emb_scaler.transform(df[embedding_cols])
# The target column is left unscaled at this point.

# --- Hold out 20% of the rows for validation (fixed seed) ---
X, y = df[feature_cols], df[target_col]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

print("\nData shapes:")
print(
    f"X_train: {X_train.shape}",
    f"X_val:   {X_val.shape}",
    f"y_train: {y_train.shape}",
    f"y_val:   {y_val.shape}",
    sep="\n",
)

Loading data and scalers...
Data and scalers loaded successfully.

Data shapes:
X_train: (43938, 71)
X_val:   (10985, 71)
y_train: (43938,)
y_val:   (10985,)


# CELL 4: MODEL 1 - XGBOOST (THE REIGNING CHAMPION)

In [7]:
print("--- Training XGBoost Model ---")

# Hyperparameters for the XGBoost regressor.
#   - n_estimators: upper bound on the number of boosting rounds (trees).
#   - learning_rate: how much each tree's contribution is shrunk.
#   - tree_method='hist': selected as a faster training method.
#   - device: the value chosen in the configuration cell
#     ('cuda' if a GPU was detected, otherwise 'cpu').
#   - early_stopping_rounds: stop once the validation score has not
#     improved for this many rounds.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist',
    device=device,
    early_stopping_rounds=50,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; the validation split is the evaluation set
# and progress is printed every 100 rounds.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

# --- Score the fitted model on the validation split ---
print("\n--- Evaluating XGBoost ---")
preds = xgb_model.predict(X_val)
r2 = r2_score(y_val, preds)
mae = mean_absolute_error(y_val, preds)
rmse = np.sqrt(mean_squared_error(y_val, preds))

print(
    f"Validation R² Score: {r2:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    sep="\n",
)
print("\nMAE means the model's predictions are, on average, off by this many tons/hectare.")
print("R² of 1.0 is a perfect prediction.")

# --- Persist the trained model as JSON in the model directory ---
XGB_MODEL_PATH = os.path.join(MODEL_SAVE_DIR, "xgboost_yield_model.json")
xgb_model.save_model(XGB_MODEL_PATH)
print(f"\nXGBoost model saved to: {XGB_MODEL_PATH}")

--- Training XGBoost Model ---
[0]	validation_0-rmse:34.23145
[100]	validation_0-rmse:7.80612
[200]	validation_0-rmse:7.74773
[300]	validation_0-rmse:7.72282
[400]	validation_0-rmse:7.70418
[500]	validation_0-rmse:7.68774
[600]	validation_0-rmse:7.67909
[700]	validation_0-rmse:7.67393
[800]	validation_0-rmse:7.66484
[900]	validation_0-rmse:7.65798
[999]	validation_0-rmse:7.65609

--- Evaluating XGBoost ---
Validation R² Score: 0.9546
Validation RMSE: 7.6550
Validation MAE: 3.2032

MAE means the model's predictions are, on average, off by this many tons/hectare.
R² of 1.0 is a perfect prediction.

XGBoost model saved to: /content/drive/My Drive/Pungda/models/xgboost_yield_model.json


In [8]:
# -----------------------------------------------------------------
# CELL 5: MODEL 2 - TABNET (THE DEEP LEARNING CHALLENGER)
# -----------------------------------------------------------------
# TabNet is given NumPy arrays rather than pandas objects.

print("\n\n--- Training TabNet Model ---")

# Features as plain arrays; targets reshaped into (n_samples, 1) columns.
X_train_np, X_val_np = X_train.values, X_val.values
y_train_np = y_train.values.reshape(-1, 1)
y_val_np = y_val.values.reshape(-1, 1)

# Build the regressor on the device chosen in the configuration cell.
tabnet_model = TabNetRegressor(device_name=device, seed=42, verbose=10)

# Training options: validate on the held-out split with MAE, train for at
# most 100 epochs, and use an early-stopping patience of 15 epochs.
tabnet_fit_options = dict(
    eval_set=[(X_val_np, y_val_np)],
    eval_name=['val'],
    eval_metric=['mae'],
    max_epochs=100,
    patience=15,
    batch_size=1024,
)
tabnet_model.fit(X_train=X_train_np, y_train=y_train_np, **tabnet_fit_options)

# --- Score the fitted model on the validation split ---
print("\n--- Evaluating TabNet ---")
# Flatten the predictions to 1-D before scoring them against y_val.
preds = tabnet_model.predict(X_val_np).flatten()
r2 = r2_score(y_val, preds)
mae = mean_absolute_error(y_val, preds)
rmse = np.sqrt(mean_squared_error(y_val, preds))

print(
    f"Validation R² Score: {r2:.4f}",
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE: {mae:.4f}",
    sep="\n",
)

# --- Persist the trained model; the message below appends ".zip" to the path ---
TABNET_MODEL_PATH = os.path.join(MODEL_SAVE_DIR, "tabnet_yield_model")
tabnet_model.save_model(TABNET_MODEL_PATH)
print(f"\nTabNet model saved to: {TABNET_MODEL_PATH}.zip")



--- Training TabNet Model ---




epoch 0  | loss: 1227.56252| val_mae: 14.48003|  0:00:03s
epoch 10 | loss: 76.84739| val_mae: 3.93636 |  0:00:41s
epoch 20 | loss: 66.27465| val_mae: 3.82626 |  0:01:17s
epoch 30 | loss: 59.42333| val_mae: 3.62971 |  0:01:55s
epoch 40 | loss: 54.07036| val_mae: 3.8592  |  0:02:32s
epoch 50 | loss: 56.90972| val_mae: 3.72595 |  0:03:08s

Early stopping occurred at epoch 58 with best_epoch = 43 and best_val_mae = 3.4972





--- Evaluating TabNet ---
Validation R² Score: 0.9492
Validation RMSE: 8.0979
Validation MAE: 3.4972
Successfully saved model at /content/drive/My Drive/Pungda/models/tabnet_yield_model.zip

TabNet model saved to: /content/drive/My Drive/Pungda/models/tabnet_yield_model.zip
