In [16]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

import os
import torch

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
if torch.cuda.is_available():
    print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
else:
    print("‚ö†Ô∏è No GPU detected. Training on CPU (RandomForest doesn't use GPU anyway)")

GPU Detected: NVIDIA GeForce RTX 4060 Laptop GPU


In [3]:
# Step 1: Load CSV
df = pd.read_csv("crop_yield_dataset.csv")

In [4]:
# Step 3: One-hot encode categorical features
cat_cols = ["Crop_Type", "Soil_Type"]
target_col = "Crop_Yield"
numerical_cols = df.drop(columns=cat_cols + ["Date", target_col]).columns.tolist()

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat = encoder.fit_transform(df[cat_cols])
X_num = df[numerical_cols].values
X = np.hstack([X_cat, X_num])
y = df[target_col]

In [5]:
# Step 4: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Step 5: Train Random Forest model

# model = RandomForestRegressor(n_estimators=100, random_state=42)

# model = LinearRegression()

# model = DecisionTreeRegressor(random_state=42)

# model = SVR()

# model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

model = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

In [25]:
rmse_scorer = make_scorer(root_mean_squared_error)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [34]:
scores = cross_val_score(model, X, y, cv=kf, scoring=rmse_scorer)

print(f"Average RMSE: {np.mean(scores):.2f} ¬± {np.std(scores):.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 29216, number of used features: 23
[LightGBM] [Info] Start training from score 26.843632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 29216, number of used features: 23
[LightGBM] [Info] Start training from score 27.004647
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [35]:
print(f"üß™ Training model: {model.__class__.__name__}")
model.fit(X_train, y_train)

üß™ Training model: LGBMRegressor
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 29216, number of used features: 23
[LightGBM] [Info] Start training from score 26.843632


In [37]:
# Step 6: Evaluate
print("Evaluation:")
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
rmse_test = root_mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Train RMSE: {rmse_train:.2f} | R¬≤: {r2_train:.3f}")
print(f"Test  RMSE: {rmse_test:.2f} | R¬≤: {r2_test:.3f}")

Evaluation:
Train RMSE: 3.63 | R¬≤: 0.980
Test  RMSE: 3.88 | R¬≤: 0.977


In [41]:
# Step 7: Save model and encoder
os.makedirs("backend/model", exist_ok=True)
joblib.dump(model, "trained_model.pkl")
joblib.dump(encoder, "encoder.pkl")
print("The model and encoder have been saved to backend/model/")

The model and encoder have been saved to backend/model/
