# 3_Modeling (fixed)

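This is the fixed modeling notebook. It adds a robust `compute_rmse` helper that works across scikit-learn versions, including older ones. It reads `soil_ml_ready.csv` (produced by the feature-engineering step) from the current working directory and writes its results to `model_outputs/`. Run the cells top to bottom.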

In [None]:
# Imports and setup
from pathlib import Path
import pandas as pd
import numpy as np
# All paths are resolved against the directory the kernel was started in.
ROOT = Path.cwd()
IN = ROOT.joinpath('soil_ml_ready.csv')

# The output folder is only created once the input file is known to exist.
if IN.exists():
    OUT = ROOT.joinpath('model_outputs')
    OUT.mkdir(exist_ok=True)
else:
    raise FileNotFoundError('soil_ml_ready.csv not found. Run feature engineering first.')

# low_memory=False makes pandas read the file in one pass instead of chunks.
df = pd.read_csv(IN, low_memory=False)
print(f'Loaded {IN} shape {df.shape}')


In [None]:
# Helper: robust RMSE across sklearn versions
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The value is derived from ``mean_squared_error`` instead of probing for
    its ``squared=False`` keyword. That keyword is missing from older
    scikit-learn releases and deprecated in newer ones, so relying on it
    either needs a ``TypeError`` fallback or emits warnings. Computing the
    square root here gives one code path for every version.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values, as accepted by
        ``mean_squared_error``.

    Returns
    -------
    float
        The RMSE. For 2-D (multi-output) targets the RMSE is computed per
        output and then averaged, which is what ``squared=False`` returned;
        the previous fallback took the square root of the averaged MSE and
        so could disagree with the primary path.
    """
    # 'raw_values' yields one MSE per output column (a length-1 array for
    # ordinary 1-D targets), so the square root is applied before averaging.
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    return float(np.mean(np.sqrt(per_output_mse)))


In [None]:
# Prepare features and a temporal split: fit on 2018 rows, evaluate on 2020 rows.

# The target is the first column whose name contains the soil-moisture marker
# (case-insensitive).
target_candidates = [c for c in df.columns if 'average_soilmoisture_level' in c.lower()]
if not target_candidates:
    raise KeyError('Target column not found')
target = target_candidates[0]

if 'Year' not in df.columns:
    raise KeyError("'Year' column not found; it is required for the temporal split")

# `train` / `test` keep every row of their year; later cells read them directly.
train = df[df['Year']==2018].copy()
test = df[df['Year']==2020].copy()
print('Train rows:', len(train), 'Test rows:', len(test))

# Keep only the engineered features that are actually present in the file.
features = ['Month_num','month_sin','month_cos','Season_num','state_freq','district_id','lag_1','lag_7','rolling_3','rolling_6']
features = [f for f in features if f in df.columns]
if not features:
    raise ValueError('None of the expected feature columns are present in the data')
print('Using features:', features)

# Rows without a target value cannot be used for fitting or scoring; they would
# make the estimators and metrics raise on NaN, so exclude them from X/y only.
train_labeled = train.dropna(subset=[target])
test_labeled = test.dropna(subset=[target])
n_dropped_train = len(train) - len(train_labeled)
n_dropped_test = len(test) - len(test_labeled)
if n_dropped_train or n_dropped_test:
    print('Dropped rows with missing target - train:', n_dropped_train, 'test:', n_dropped_test)

# Fail here with a clear message rather than deep inside the imputer/estimator.
if train_labeled.empty or test_labeled.empty:
    raise ValueError(
        f'Temporal split has no usable rows (train={len(train_labeled)}, test={len(test_labeled)}); '
        'check the Year values and the target column'
    )

X_train = train_labeled[features]
y_train = train_labeled[target]
X_test = test_labeled[features]
y_test = test_labeled[target]


In [None]:
# impute and scale
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Both preprocessors learn their statistics from the training split only and
# are then applied unchanged to the test split.

# Step 1: fill missing values with the per-column training median.
imp = SimpleImputer(strategy='median').fit(X_train)
X_train_imp = imp.transform(X_train)
X_test_imp = imp.transform(X_test)

# Step 2: standardise the imputed features using training mean / variance.
scaler = StandardScaler().fit(X_train_imp)
X_train_scaled = scaler.transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)


In [None]:
# Train models: Linear, RandomForest, optional XGBoost/LightGBM
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import joblib

import json

# Candidate regressors, keyed by the name used in the results table and in the
# saved artifact file names.
models = {}
models['Linear'] = LinearRegression()
models['RandomForest'] = RandomForestRegressor(n_estimators=200, random_state=42)

# optional XGBoost / LightGBM: each is skipped (with a message) when the
# package cannot be imported or the estimator cannot be constructed.
try:
    from xgboost import XGBRegressor
    models['XGBoost'] = XGBRegressor(objective='reg:squarederror', n_estimators=200, random_state=42, verbosity=0)
except Exception as e:
    print('XGBoost not available:', e)
try:
    from lightgbm import LGBMRegressor
    models['LightGBM'] = LGBMRegressor(n_estimators=200, random_state=42)
except Exception as e:
    print('LightGBM not available:', e)

# The models below are fitted on imputed + scaled arrays, so a saved model is
# only usable together with the fitted imputer, scaler and feature order.
# Persist those alongside the models (best-effort, like the model dumps).
try:
    joblib.dump({'imputer': imp, 'scaler': scaler, 'features': features}, OUT/'preprocessing.joblib')
except Exception as e:
    print('Could not save preprocessing', e)

# Fit every candidate on the training split and score it on the test split.
results = {}
for name, model in models.items():
    print('Training', name)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    # Cast to built-in float so the metrics are JSON-serialisable.
    results[name] = {
        'MAE': float(mean_absolute_error(y_test, preds)),
        'RMSE': float(compute_rmse(y_test, preds)),
        'R2': float(r2_score(y_test, preds))
    }
    # A failed dump must not discard the metrics already computed.
    try:
        joblib.dump(model, OUT/f'{name}_reg.joblib')
    except Exception as e:
        print('Could not save model', name, e)

(OUT/'regression_results.json').write_text(json.dumps(results, indent=2))
print('Saved results to', OUT/'regression_results.json')
print(results)


In [None]:
# Classification baseline if moisture_class exists
if 'moisture_class' in df.columns:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score, f1_score, classification_report

    # Rows without a class label can be neither encoded nor scored.
    train_clf = train.dropna(subset=['moisture_class'])
    test_clf = test.dropna(subset=['moisture_class'])

    # Fit the encoder on the labels of both splits: a class that appears only
    # in the test year would otherwise make `le.transform` raise. When every
    # test class also occurs in train, the encoding is the same as fitting on
    # train alone.
    le = LabelEncoder()
    le.fit(pd.concat([train_clf['moisture_class'], test_clf['moisture_class']]))
    y_train_clf = le.transform(train_clf['moisture_class'])
    y_test_clf = le.transform(test_clf['moisture_class'])

    # Use a dedicated imputer so the shared `imp` fitted for the regression
    # pipeline is not refitted as a side effect of running this cell.
    clf_imp = SimpleImputer(strategy='median')
    X_train_clf = clf_imp.fit_transform(train_clf[features])
    X_test_clf = clf_imp.transform(test_clf[features])

    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train_clf, y_train_clf)
    preds = clf.predict(X_test_clf)

    # Report the original class names instead of the encoded integers; passing
    # `labels` keeps the names aligned even if a class is absent from the test
    # labels and predictions.
    class_ids = list(range(len(le.classes_)))
    class_names = [str(c) for c in le.classes_]
    report = classification_report(y_test_clf, preds, labels=class_ids, target_names=class_names)
    (OUT/'classification_report.txt').write_text(report)
    print('Classification acc:', accuracy_score(y_test_clf, preds))
    # Macro F1 weights every class equally, complementing plain accuracy.
    print('Classification macro F1:', f1_score(y_test_clf, preds, average='macro'))
else:
    print('No moisture_class column; skipping classification')
