In [6]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score  # Import r2_score

# Load processed data
df = pd.read_csv('../data/processed/maize_yield_kenya_processed.csv')

# Split data into features and target
X = df[['average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp', 'rainfall_to_temp_ratio']]
y = df['hg/ha_yield']

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize Grid Search
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X, y)

# Best parameters
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X)
r2 = r2_score(y, y_pred)  # Now r2_score is defined
print(f"R² after tuning: {r2}")

Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
R² after tuning: 0.3905922499196658
