In [1]:
from pathlib import Path
import sys

# Locate the project root so that `from src... import ...` works whether the
# notebook is started from the repository root or from a folder one level
# below it.
PROJECT_DIR = Path.cwd().resolve()
if not (PROJECT_DIR / "src").exists():
    # No src/ next to the working directory: fall back to its parent.
    PROJECT_DIR = Path("..").resolve()

# Put the root at the front of sys.path exactly once. Earlier copies are
# filtered out first so that re-running this cell does not stack duplicate
# entries, while the root still takes priority when resolving imports.
root_entry = str(PROJECT_DIR)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

# Echo the outcome so a wrong working directory is visible immediately.
print("PROJECT_DIR:", PROJECT_DIR)
print("src exists:", (PROJECT_DIR / "src").exists())

PROJECT_DIR: /Users/nicolepretini/PycharmProjects/wheat-yield-climate-ml
src exists: True


In [2]:
import pandas as pd

from src.data import load_clean_data
from src.features import build_features
from src.modeling import cross_validate_models
from src.config import TARGET, TABLE_DIR

In [3]:
# Load the cleaned dataset through the project's loader (imported from src.data).
# The result is kept in the notebook-level name `df`, which the feature-building
# cell reads to derive X and y.
df = load_clean_data()
# Preview the first rows as a quick sanity check of what was loaded.
df.head()

Unnamed: 0,year,yield_kg_ha,tas_gs_mean,pr_gs_sum
0,2000,2650,16.2,480
1,2001,2410,15.8,430
2,2002,2780,16.5,510
3,2003,2520,16.0,455
4,2004,2900,16.7,525


In [4]:
# Derive the model inputs from the cleaned frame: X comes from the project's
# build_features helper, y is the TARGET column of the same frame.
# NOTE(review): X and y are presumably row-aligned with `df`; build_features
# lives in src.features, so confirm it neither drops nor reorders rows.
X = build_features(df)
y = df[TARGET]
# The cell's value is a 2-tuple, so the notebook shows it as a plain-text repr
# (feature preview first, target summary second) rather than as rich tables.
X.head(), y.describe()

(   tas_gs_mean  pr_gs_sum  tas_sq   pr_sq  tas_x_pr
 0         16.2        480  262.44  230400    7776.0
 1         15.8        430  249.64  184900    6794.0
 2         16.5        510  272.25  260100    8415.0
 3         16.0        455  256.00  207025    7280.0
 4         16.7        525  278.89  275625    8767.5,
 count      15.000000
 mean     4147.333333
 std      2086.714666
 min      2410.000000
 25%      2675.000000
 50%      2900.000000
 75%      6650.000000
 max      7350.000000
 Name: yield_kg_ha, dtype: float64)

In [5]:
# Cross-validate the candidate models on (X, y) with 5 folds using the project's
# helper from src.modeling; the returned table is kept as `results` and is
# written to CSV by the following cell.
# NOTE(review): the recorded y.describe() output reports 15 rows, so 5 folds
# would presumably leave about 3 rows per held-out fold (the splitter is
# defined in src.modeling — TODO confirm). In the recorded run r2_std is far
# larger than |r2_mean| for both models, so read the R² columns with caution.
results = cross_validate_models(X, y, n_splits=5)
results

Unnamed: 0,model,cv_folds,rmse_mean,rmse_std,r2_mean,r2_std
1,Ridge,5,712.553704,344.836965,-1.614671,5.532743
0,DummyMean,5,2097.959825,312.918309,-10.720533,23.614521


In [6]:
# Destination file for the cross-validation table, inside the project's table directory.
out_path = TABLE_DIR / "cv_results.csv"
# Create the target folder (and any missing parents) before writing;
# exist_ok=True keeps the cell safe to re-run.
out_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: do not write the row index as an extra column in the CSV.
results.to_csv(out_path, index=False)
# Show where the file was written.
out_path

PosixPath('/Users/nicolepretini/PycharmProjects/wheat-yield-climate-ml/outputs/tables/cv_results.csv')