In [1]:
import numpy as np
import pandas as pd
import scipy
import xarray as xr
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [2]:
# Location of the processed .nc file for this analysis.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
era5_path = "C:/Users/mwood/1projects/rainfall_pred/data/processed/era5_conus_20240101.nc"

# Open the file as an xarray Dataset using the netCDF4 backend.
ds = xr.open_dataset(era5_path, engine="netcdf4")

In [7]:
# Baseline model: ridge regression from a handful of per-grid-cell predictor
# fields to the target field, treating every (time, lat, lon) cell as an
# independent sample.
input_vars = ['t2m', 'sp', 'u10', 'v10', 'cape']
target_var = 'tp_hourly'
RANDOM_SEED = 42  # fixed so the split, and therefore the reported score, is reproducible

# Standardize each predictor to zero mean / unit variance and store it on the
# dataset as "<var>_norm".
# NOTE(review): the statistics are computed over the whole dataset, including
# samples that later land in the test split; consider fitting them on the
# training split only.
for var in input_vars:
    mean = ds[var].mean()
    std = ds[var].std()
    # A constant field has std == 0, which would turn every value into 0/0 = NaN;
    # dividing by 1 instead leaves such a feature at exactly zero.
    std = std.where(std > 0, 1.0)
    ds[f"{var}_norm"] = (ds[var] - mean) / std

# Stack the normalized predictors along a new trailing feature axis.
X = np.stack([ds[f"{var}_norm"].values for var in input_vars], axis=-1)  # presumably (time, lat, lon, features)
Y = ds[target_var].values  # presumably (time, lat, lon)

# Rows of X and Y are paired purely by position after flattening, so both must
# live on exactly the same grid with the same dimension order.
assert X.shape[:-1] == Y.shape, (
    f"predictor grid {X.shape[:-1]} does not match target grid {Y.shape}"
)

# Flatten every grid cell into one sample: (samples, features) and (samples,).
n_time, n_lat, n_lon, n_feat = X.shape
X_flat = X.reshape(n_time * n_lat * n_lon, n_feat)
Y_flat = Y.reshape(n_time * n_lat * n_lon)

# Ridge cannot fit NaN/inf values, so keep only fully finite samples for the
# model. X_flat / Y_flat themselves stay unfiltered and grid-aligned.
finite_rows = np.isfinite(X_flat).all(axis=1) & np.isfinite(Y_flat)
X_model = X_flat[finite_rows]
y_model = Y_flat[finite_rows]

# Random 80/20 hold-out split, seeded for reproducibility.
# NOTE(review): neighbouring cells and consecutive time steps are probably
# strongly correlated, so a random split may overstate skill; a split by time
# or region would be a stricter test.
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y_model, test_size=0.2, random_state=RANDOM_SEED
)

model = Ridge()
model.fit(X_train, y_train)

# Coefficient of determination on the held-out samples.
score = model.score(X_test, y_test)
print(f"R² score: {score:.3f}")

R² score: 0.0022
