In [1]:
# Install required packages (run only once, then comment out)
# !pip install pandas matplotlib seaborn numpy scikit-learn joblib

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Update the path to your actual CSV location
df = pd.read_csv(r'C:\Users\YourUsername\Path\rice_data_outlier_removed.csv')

In [None]:
# Feature selection
x = df[['Year', 'RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)', 'State Code']]
y = df['RICE YIELD (Kg per ha)']

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_train_scaled = ss.fit_transform(x_train)
x_test_scaled = ss.transform(x_test)

In [6]:
# Define and fit Random Forest model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train_scaled, y_train)

In [7]:
# Cross-validation score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, x_train_scaled, y_train, cv=5)
print('Cross-validated train score: ', scores.mean() * 100)

In [8]:
# Model score on train data
rf.score(x_train_scaled, y_train) * 100

In [9]:
import joblib
import os
os.makedirs('Models', exist_ok=True)
joblib.dump(rf, 'Models/rf_model.pkl')
joblib.dump(ss, 'Models/scaler.pkl')

In [10]:
# Predict and save to CSV
y_pred = rf.predict(x_test_scaled)
pred_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
pred_df.to_csv('Models/test_predictions.csv', index=False)

In [12]:
## Notebook Usage Tips
- Run all cells in order from top to bottom to avoid NameError and missing variable issues.
- If you restart the kernel, re-run all cells to redefine variables and models.
- If you see a ModuleNotFoundError, install the required package using pip in a code cell, e.g., `!pip install pandas`.
- If you see a FileNotFoundError, check that the file path matches your project structure.


SyntaxError: invalid syntax (1972316463.py, line 2)

In [13]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for RandomForestRegressor
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='r2',
    verbose=2
)

grid_search.fit(x_train_scaled, y_train)

print('Best Score:', grid_search.best_score_)
print('Best Params:', grid_search.best_params_)

# Save best parameters to a JSON file
import json
with open('Models/best_params.json', 'w') as f:
    json.dump(grid_search.best_params_, f, indent=4)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Score: 0.9661651111878362
Best Params: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 150}
Best Score: 0.9661651111878362
Best Params: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 150}


In [15]:
!pip install xgboost
# Try other models: GradientBoostingRegressor and XGBoost (if installed)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# GradientBoostingRegressor tuning
gb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
gb_grid = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=gb_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='r2',
    verbose=2
)
gb_grid.fit(x_train_scaled, y_train)
print('GBR Best Score:', gb_grid.best_score_)
print('GBR Best Params:', gb_grid.best_params_)

# Try XGBoost if available
try:
    from xgboost import XGBRegressor
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }
    xgb_grid = GridSearchCV(
        estimator=XGBRegressor(random_state=42, verbosity=0),
        param_grid=xgb_param_grid,
        cv=5,
        n_jobs=-1,
        scoring='r2',
        verbose=2
    )
    xgb_grid.fit(x_train_scaled, y_train)
    print('XGB Best Score:', xgb_grid.best_score_)
    print('XGB Best Params:', xgb_grid.best_params_)
except ImportError:
    print('XGBoost is not installed. To try it, run: !pip install xgboost')


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.3/150.0 MB 7.0 MB/s eta 0:00:22
    --------------------------------------- 2.6/150.0 MB 6.6 MB/s eta 0:00:23
   - -------------------------------------- 3.9/150.0 MB 6.5 MB/s eta 0:00:23
   - -------------------------------------- 5.2/150.0 MB 6.3 MB/s eta 0:00:24
   - -------------------------------------- 6.6/150.0 MB 6.3 MB/s eta 0:00:23
   -- ------------------------------------- 7.9/150.0 MB 6.3 MB/s eta 0:00:23
   -- ------------------------------------- 9.2/150.0 MB 6.3 MB/s eta 0:00:23
   -- ------------------------------------- 10.2/150.0 MB 6.1 MB/s eta 0:00:23
   --- ------------------------------------ 11.3/150.0 MB 5.9 MB/s eta 0:00:24
   --- ------------------------------------ 12.6/150.0 MB 5.9 MB/s eta 0:00:2

In [16]:
# Save the best XGBoost model and its parameters
try:
    best_xgb = xgb_grid.best_estimator_
    import joblib, json, os
    os.makedirs('Models', exist_ok=True)
    joblib.dump(best_xgb, 'Models/xgb_model.pkl')
    with open('Models/xgb_best_params.json', 'w') as f:
        json.dump(xgb_grid.best_params_, f, indent=4)
    print('Best XGBoost model and parameters saved to Models/.')
except Exception as e:
    print('Could not save XGBoost model:', e)


Best XGBoost model and parameters saved to Models/.
