In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)

In [11]:
data = pd.read_csv('../Datasets/rice_data_outlier_removed.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Year,State Name,Dist Name,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (Kg per ha),State_en
0,0,2010,Chhattisgarh,Durg,777.92,1319.18,1695.77,3
1,1,2011,Chhattisgarh,Durg,806.55,1416.48,1756.23,3
2,2,2012,Chhattisgarh,Durg,807.07,1534.22,1900.97,3
3,3,2013,Chhattisgarh,Durg,819.44,1296.24,1581.86,3
4,4,2014,Chhattisgarh,Durg,834.87,1444.24,1729.89,3


In [None]:
X = data[['Year', 'RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)', 'State Code']]
y = data['RICE YIELD (Kg per ha)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training XGBoost model

In [13]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
print(f'XGBoost Test MSE: {xgb_mse:.2f}')
print(f'XGBoost Test R^2: {xgb_r2:.2f}')

XGBoost Test MSE: 46348.71
XGBoost Test R^2: 0.96


# Training LightGBM model

In [14]:
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(X_train, y_train)
lgb_predictions = lgb_model.predict(X_test)

lgb_mse = mean_squared_error(y_test, lgb_predictions)
lgb_r2 = r2_score(y_test, lgb_predictions)
print(f'LightGBM Test MSE: {lgb_mse:.2f}')
print(f'LightGBM Test R^2: {lgb_r2:.2f}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 539
[LightGBM] [Info] Number of data points in the train set: 1975, number of used features: 4
[LightGBM] [Info] Start training from score 2063.416015
LightGBM Test MSE: 43439.42
LightGBM Test R^2: 0.96


# Hyperparameter tuning of xgb model

In [15]:
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

xgb_grid_search = GridSearchCV(xgb.XGBRegressor(), xgb_param_grid, cv=3, scoring='neg_mean_squared_error')
xgb_grid_search.fit(X_train, y_train)

print(f'Best XGBoost Parameters: {xgb_grid_search.best_params_}')
print(f'Best XGBoost MSE: {-xgb_grid_search.best_score_:.2f}')

Best XGBoost Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best XGBoost MSE: 38371.78


# Hyperparameter tuning lightgbm model

In [16]:
lgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

lgb_grid_search = GridSearchCV(lgb.LGBMRegressor(), lgb_param_grid, cv=3, scoring='neg_mean_squared_error')
lgb_grid_search.fit(X_train, y_train)

print(f'Best LightGBM Parameters: {lgb_grid_search.best_params_}')
print(f'Best LightGBM MSE: {-lgb_grid_search.best_score_:.2f}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 539
[LightGBM] [Info] Number of data points in the train set: 1316, number of used features: 4
[LightGBM] [Info] Start training from score 2067.700472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 539
[LightGBM] [Info] Number of data points in the train set: 1317, number of used features: 4
[LightGBM] [Info] Start training from score 2068.279559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 539
[LightGBM] [Info] Number of data points in the train set: 1317, number of used features: 4
[LightGBM] [Info] Start traini

In [17]:
print(f'XGBoost Test MSE: {xgb_mse:.2f}')
print(f'LightGBM Test MSE: {lgb_mse:.2f}')

XGBoost Test MSE: 46348.71
LightGBM Test MSE: 43439.42


# Comparing metrics

In [18]:
original_accuracy = 0.95  
xgb_accuracy = xgb_model.score(X_test, y_test) 
lgb_accuracy = lgb_model.score(X_test, y_test)  

print(f'Original Model Test Accuracy: {original_accuracy:.2f}')
print(f'XGBoost Test Accuracy: {xgb_accuracy:.2f}')
print(f'LightGBM Test Accuracy: {lgb_accuracy:.2f}')

if xgb_accuracy > original_accuracy:
    print("XGBoost improved the testing accuracy.")
else:
    print("XGBoost did not improve the testing accuracy.")

if lgb_accuracy > original_accuracy:
    print("LightGBM improved the testing accuracy.")
else:
    print("LightGBM did not improve the testing accuracy.")

Original Model Test Accuracy: 0.95
XGBoost Test Accuracy: 0.96
LightGBM Test Accuracy: 0.96
XGBoost improved the testing accuracy.
LightGBM improved the testing accuracy.
