# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import optuna
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the two long-format inputs (one row per NHDPlusID/variable[/Date] with a `value` column).
static_vars_df = pd.read_parquet('static_vars.parquet')
obs_df = pd.read_parquet('obs.parquet')

# Preview both frames; the second header keeps its leading newline to separate the two previews.
for header, frame in (('static_vars_df head:', static_vars_df), ('\nobs_df head:', obs_df)):
    print(header)
    print(frame.head())

static_vars_df head:
        NHDPlusID       variable          value
0  55000900028341  aspect_ne_pct       0.074821
1  55000900028341  aspect_sw_pct       0.007807
2  55000900028341  aspect_nw_pct       0.917372
3  55000900028341  aspect_se_pct       0.000000
4  55000900028341    elev_min_cm  103297.000000

obs_df head:
        NHDPlusID SiteIDCode        Date       variable     value
0  55000900130309     GSWS01  1980-01-01  Discharge_CMS  0.045505
1  55000900130309     GSWS01  1980-01-02  Discharge_CMS  0.043410
2  55000900130309     GSWS01  1980-01-03  Discharge_CMS  0.048592
3  55000900130309     GSWS01  1980-01-04  Discharge_CMS  0.061872
4  55000900130309     GSWS01  1980-01-05  Discharge_CMS  0.276599


In [None]:
# Pivot static vars (each variable as a column)
static_wide = static_vars_df.pivot(index='NHDPlusID', columns='variable', values='value').reset_index()

# Pivot obs_df (one row per NHDPlusID/Date, each observed variable as a column).
# pivot_table averages duplicate (NHDPlusID, Date, variable) entries by default.
obs_wide = obs_df.pivot_table(index=['NHDPlusID', 'Date'], columns='variable', values='value').reset_index()

# Merge static features into each site’s obs
merged_df = obs_wide.merge(static_wide, on='NHDPlusID', how='left')

# Sort chronologically within each site, then forward-fill *per site*.
# A plain DataFrame.ffill() on the sorted frame also copies the last row of one site into the
# leading gaps of the next site, which fabricates observations (including target values) for
# sites that never recorded that variable.
# Columns a site never observed stay NaN after this step and must be imputed or dropped downstream.
merged_df = merged_df.sort_values(['NHDPlusID', 'Date'])
fill_cols = [c for c in merged_df.columns if c != 'NHDPlusID']
merged_df[fill_cols] = merged_df.groupby('NHDPlusID')[fill_cols].ffill()
merged_df.head()

variable,NHDPlusID,Date,Discharge_CMS,Flow_Status,HoboWetDry0.05,MaxDepth_Censor,MaxDepth_Threshold,MaxDepth_cm,ArbolateSu,AreaSqKm,...,aspect_se_pct,aspect_sw_pct,curv_mean,curv_median,elev_max_cm,elev_mean_cm,elev_median_cm,elev_min_cm,slp_mean_pct,slp_median_pct
0,55000900027171,2020-08-12,,0.0,,0.0,0.0,0.0,0.100818,0.0017,...,0.0,1.0,63.946892,63.048618,66223.0,58209.588235,57898.0,51722.0,63.946892,63.048618
1,55000900027173,2020-08-12,,0.75,,1.0,0.0,7.0,0.297666,0.1235,...,0.0,0.565992,42.080586,41.399857,103022.0,88853.689879,88575.0,74487.0,42.080586,41.399857
2,55000900027174,2020-08-12,,1.0,,1.0,0.0,5.5,0.410513,0.0923,...,0.0,0.0,55.87293,56.605061,96559.0,79958.39545,80669.0,63278.0,55.87293,56.605061
3,55000900027177,2020-09-03,,0.5,,1.0,0.0,1.0,0.366996,0.1216,...,0.6875,0.07977,63.409,65.848724,72273.0,60448.368421,60738.5,46037.0,63.409,65.848724
4,55000900027180,2020-09-02,,0.5,,1.0,0.0,11.0,1.082607,0.0592,...,0.0,0.0,21.379627,22.186161,55631.0,49135.785473,48683.0,46107.0,21.379627,22.186161


In [None]:
target_col = 'HoboWetDry0.05'
# Every column other than the two keys and the label is used as a model input.
feature_cols = [
    name for name in merged_df.columns
    if name not in ('Date', 'NHDPlusID', target_col)
]

# Standardize the inputs and write them back over the originals.
# NOTE(review): this overwrites merged_df in place, so any later read of a feature column
# (e.g. merged_df["Discharge_CMS"]) sees z-scores rather than raw values.
scaler = StandardScaler().fit(merged_df[feature_cols])
merged_df[feature_cols] = scaler.transform(merged_df[feature_cols])

# Create sequences (e.g., 30-day window)
def create_sequences(df, seq_len=30, target_col=None, feature_cols=None):
    """Slice a frame into overlapping windows for next-step prediction.

    Window ``i`` holds rows ``i .. i+seq_len-1`` of the feature columns and is paired
    with the target value of row ``i+seq_len``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order the windows should follow (the caller is responsible for sorting).
    seq_len : int
        Number of consecutive rows per window.
    target_col : str, optional
        Label column. Defaults to the notebook-level ``target_col``.
    feature_cols : list of str, optional
        Input columns. Defaults to the notebook-level ``feature_cols``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` has shape ``(n_windows, seq_len, n_features)`` and ``y`` has shape
        ``(n_windows,)``, where ``n_windows = max(len(df) - seq_len, 0)``. When the frame is
        too short, ``X`` is empty but keeps its three axes so ``X.shape[2]`` still works.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if feature_cols is None:
        feature_cols = globals()["feature_cols"]
    if target_col is None:
        target_col = globals()["target_col"]

    # Convert once up front; indexing the DataFrame with iloc inside the loop is far slower.
    features = df[list(feature_cols)].to_numpy()
    labels = df[target_col].to_numpy()

    n_windows = max(len(df) - seq_len, 0)
    if n_windows == 0:
        empty_X = np.empty((0, seq_len, features.shape[1]), dtype=features.dtype)
        return empty_X, np.empty((0,), dtype=labels.dtype)

    X = np.stack([features[start:start + seq_len] for start in range(n_windows)])
    y = labels[seq_len:seq_len + n_windows].copy()
    return X, y

# Smoke test on a single reach: whichever NHDPlusID appears first, keeping only fully populated rows.
first_site_mask = merged_df['NHDPlusID'].eq(merged_df['NHDPlusID'].unique()[0])
site_df = merged_df.loc[first_site_mask].dropna()
X, y = create_sequences(site_df, seq_len=30)

# float32 tensors; the label gets a trailing axis so it lines up with a (batch, 1) model output.
X_tensor, y_tensor = (torch.tensor(arr, dtype=torch.float32) for arr in (X, y))
y_tensor = y_tensor.unsqueeze(1)
dataset = TensorDataset(X_tensor, y_tensor)

In [None]:
# pick the site with most valid HoboWetDry0.05 values
# (count() tallies the non-missing entries per group)
site_counts = merged_df.groupby("NHDPlusID")["HoboWetDry0.05"].count()
best_site = site_counts.idxmax()
print(f"Using site {best_site} with {site_counts[best_site]} valid samples")

# subset data for that site; copy so the imputation below never writes into merged_df
site_df = (
    merged_df[merged_df["NHDPlusID"] == best_site]
    .dropna(subset=["HoboWetDry0.05"])
    .sort_values("Date")
    .copy()
)

# Only the label was checked for NaN above. A single NaN feature makes every LSTM output for
# that window NaN, so impute what is left. The feature columns were standardized earlier in the
# notebook, which makes 0.0 the column mean.
n_missing = int(site_df[feature_cols].isna().sum().sum())
if n_missing:
    print(f"Imputing {n_missing} missing feature values with the column mean (0 after scaling)")
    site_df[feature_cols] = site_df[feature_cols].fillna(0.0)

# shorter sequence length to ensure dense training data
seq_len = 30  # you can raise later (e.g., 60 or 90)

X, y = create_sequences(site_df, seq_len=seq_len)
print(f"Generated {len(X)} sequences with seq_len={seq_len}")

# convert to tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
print("Tensor shapes:", X_tensor.shape, y_tensor.shape)

Using site 55000900061097 with 15883 valid samples
Generated 15853 sequences with seq_len=30
Tensor shapes: torch.Size([15853, 30, 24]) torch.Size([15853, 1])


In [None]:
#OPTUNA SECTION

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
# --- Split dataset ---
# Chronological hold-out: the windows are built with stride 1, so neighbouring windows share
# all but one time step. Shuffling before the split puts near-duplicates of training windows
# into the validation set and inflates every validation metric. Keeping the order means the
# last 20% of windows are held out.
X_train, X_val, y_train, y_val = train_test_split(
    X_tensor, y_tensor, test_size=0.2, shuffle=False
)

# --- Define LSTM model class ---
class LSTMModel(nn.Module):
    """LSTM binary classifier that scores a window from its last time step.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Width of the LSTM hidden state.
    num_layers : int
        Number of stacked LSTM layers.
    dropout : float
        Dropout rate between stacked LSTM layers (ignored when ``num_layers == 1``).

    ``forward`` takes a ``(batch, seq_len, input_size)`` tensor and returns probabilities
    of shape ``(batch, 1)``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers; a non-zero rate with a single
        # layer does nothing except emit a UserWarning, so pass 0 in that case.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.lstm(x)
        # Classify from the hidden state of the final time step only.
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out)

# --- Define Optuna objective ---
def objective(trial):
    """Train an LSTM with trial-suggested hyperparameters and return ``1 - AUC`` on the validation split.

    Reads ``X_train``, ``y_train``, ``X_val``, ``y_val``, ``X_tensor`` and ``LSTMModel`` from
    the notebook namespace. Returns ``1.0`` when the validation labels contain a single
    class, because ROC AUC is undefined in that case.
    """
    import warnings

    hidden_size = trial.suggest_int("hidden_size", 32, 128)
    num_layers  = trial.suggest_int("num_layers", 1, 3)
    dropout     = trial.suggest_float("dropout", 0.0, 0.4)
    lr          = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size  = trial.suggest_categorical("batch_size", [16, 32, 64])
    epochs      = 7 # short tuning run

    # The validation labels do not change during training, so detect an unscoreable split
    # before spending the epochs on it (the parameters above are still recorded on the trial).
    if torch.unique(y_val).numel() < 2:
        warnings.warn("Validation labels contain a single class; AUC is undefined, returning 1.0.")
        return 1.0  # meaningless trial

    model = LSTMModel(
        input_size=X_tensor.shape[2],
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout
    )
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(TensorDataset(X_val,   y_val),   batch_size=batch_size, shuffle=False)

    # --- training ---
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

    # --- validation ---
    model.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            # reshape(-1) keeps a 1-D array even when the final batch holds one sample;
            # squeeze() would collapse it to 0-d and np.concatenate would then fail.
            preds.append(model(xb).reshape(-1).cpu().numpy())
            actuals.append(yb.reshape(-1).cpu().numpy())

    preds = np.concatenate(preds)
    actuals = np.concatenate(actuals)

    auc = roc_auc_score(actuals, preds)
    return 1 - auc  # minimize (1 - AUC)

# --- run Optuna study ---
N_TRIALS = 7  # the previous header said 8 trials, but the call has always run 7
# Seed the sampler so the sequence of suggested hyperparameters is repeatable between runs
# (model weight initialisation is still unseeded).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=N_TRIALS)

print("\n✅ Best Hyperparameters:")
print(study.best_params)
# The objective returns 1 - AUC, so convert back for reporting. A best value of 1.0 means no
# trial could be scored (single-class validation labels), not an AUC of zero.
print(f"Best Validation AUC: {1 - study.best_value:.4f}")



[I 2026-01-03 02:03:33,072] A new study created in memory with name: no-name-3d94b82d-7690-4eae-b362-f0139b93ca95
[I 2026-01-03 02:05:41,832] Trial 0 finished with value: 1.0 and parameters: {'hidden_size': 110, 'num_layers': 2, 'dropout': 0.1621773443438842, 'lr': 0.0024719059411162374, 'batch_size': 32}. Best is trial 0 with value: 1.0.
[I 2026-01-03 02:06:56,454] Trial 1 finished with value: 1.0 and parameters: {'hidden_size': 80, 'num_layers': 2, 'dropout': 0.36362996979722584, 'lr': 0.004156881972742796, 'batch_size': 16}. Best is trial 0 with value: 1.0.
[I 2026-01-03 02:07:51,729] Trial 2 finished with value: 1.0 and parameters: {'hidden_size': 113, 'num_layers': 1, 'dropout': 0.14722933912641886, 'lr': 0.008587481210770737, 'batch_size': 16}. Best is trial 0 with value: 1.0.
[I 2026-01-03 02:08:30,304] Trial 3 finished with value: 1.0 and parameters: {'hidden_size': 101, 'num_layers': 1, 'dropout': 0.2925769106904386, 'lr': 0.002244037003505455, 'batch_size': 64}. Best is trial


✅ Best Hyperparameters:
{'hidden_size': 110, 'num_layers': 2, 'dropout': 0.1621773443438842, 'lr': 0.0024719059411162374, 'batch_size': 32}
Best Validation AUC: 0.0000


In [None]:
#optuna visualizations (to see)

In [None]:
import optuna
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_slice,
    plot_parallel_coordinate,
    plot_contour
)

In [None]:
from optuna.visualization.matplotlib import plot_contour
import matplotlib.pyplot as plt

# This import rebinds plot_contour to the matplotlib backend, replacing the one imported
# from optuna.visualization in the previous cell.
contour_params = ["lr", "dropout"]   # you can swap in any two/three
fig = plot_contour(
    study,
    params=contour_params
)
# Build the title from the plotted parameters so it cannot drift from the figure
# (it used to read "hidden_size vs lr" while plotting lr and dropout).
plt.title("Contour Plot: " + " vs ".join(contour_params))
plt.tight_layout()
plt.show()

In [None]:
# Chart the estimated importance of each hyperparameter for the study's objective value.
# NOTE(review): plot_param_importances is still the optuna.visualization import; only
# plot_contour was rebound to the matplotlib backend, so this figure has its own .show().
fig = plot_param_importances(study)
fig.show()

In [None]:
#new model with optuna hyperparameters

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Train/val split ---
# Chronological hold-out: the stride-1 windows overlap almost entirely with their neighbours,
# so a shuffled split would place near-copies of training windows in the validation set.
# Keeping the order holds out the last 20% of windows instead.
X_train, X_val, y_train, y_val = train_test_split(
    X_tensor, y_tensor, test_size=0.2, shuffle=False
)
print("Train:", X_train.shape, "Val:", X_val.shape)

# --- best params from Optuna ---
# NOTE(review): these values are pasted by hand and differ from the study.best_params printed
# in the tuning cell's saved output (hidden_size 110, num_layers 2, batch_size 32).
# Presumably they come from an earlier run; confirm, or read study.best_params directly.
best_params = {
    'hidden_size': 69,
    'num_layers': 1,
    'dropout': 0.25944456350220574,
    'lr': 0.003440467271425508,
    'batch_size': 64
}

# --- define model ---
class LSTMModel(nn.Module):
    """LSTM binary classifier that scores a window from its last time step.

    Takes ``(batch, seq_len, input_size)`` input and returns probabilities of shape
    ``(batch, 1)``. ``dropout`` is applied between stacked LSTM layers and is ignored when
    ``num_layers == 1``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers. The hand-set best_params in
        # this cell use num_layers=1 with a non-zero dropout, which would only emit a UserWarning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.lstm(x)
        # Classify from the hidden state of the final time step only.
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out)

# Build the final network from the chosen hyperparameters; the input width is the feature axis of X_tensor.
model = LSTMModel(
    X_tensor.shape[2],
    best_params['hidden_size'],
    best_params['num_layers'],
    best_params['dropout'],
)

# Probabilities come out of the model, so plain BCELoss is the matching criterion.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=best_params['lr'])

# Only the training loader is shuffled; validation keeps its order.
batch_size = best_params['batch_size']
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

# --- train for 10 epochs ---
epochs = 10
for epoch in range(epochs):
    model.train()
    batch_losses = []
    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Report the mean per-batch loss for the epoch.
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {total_loss/len(train_loader):.6f}")

# --- get predictions ---
def get_preds(loader, net=None):
    """Run a model over every batch of ``loader`` and collect outputs and labels.

    Parameters
    ----------
    loader : iterable of (inputs, labels)
        Batches to score, e.g. a DataLoader.
    net : torch.nn.Module, optional
        Model to evaluate. Defaults to the notebook-level ``model``.

    Returns
    -------
    (preds, actuals) : tuple of 1-D numpy.ndarray
        Model outputs and labels, concatenated in loader order.
    """
    if net is None:
        net = model
    preds, actuals = [], []
    net.eval()
    with torch.no_grad():
        for xb, yb in loader:
            # reshape(-1) keeps a 1-D array even for a final batch of one sample;
            # squeeze() would return a 0-d array and break np.concatenate.
            preds.append(net(xb).reshape(-1).cpu().numpy())
            actuals.append(yb.reshape(-1).cpu().numpy())
    return np.concatenate(preds), np.concatenate(actuals)

# Score both splits (train first, then validation), then cut the probabilities at a fixed threshold.
train_p, train_y = get_preds(train_loader)
val_p, val_y = get_preds(val_loader)

thr = 0.5
train_c, val_c = ((scores > thr).astype(int) for scores in (train_p, val_p))

# --- metrics ---
def evaluate(y_true, y_prob, y_pred):
    """Compute the classification metrics reported for one split.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_prob : array-like of predicted probabilities (used for AUC only)
    y_pred : array-like of thresholded 0/1 predictions

    Returns
    -------
    tuple
        ``(confusion_matrix, accuracy, precision, recall, f1, auc)``. The confusion matrix
        is always 2x2 (rows/columns ordered 0 then 1). ``auc`` is NaN when ``y_true``
        holds a single class, matching the guard in the later ``compute_metrics`` cell.
    """
    # Fix the label order so the matrix stays 2x2 even if a class is absent from this split.
    cm   = confusion_matrix(y_true, y_pred, labels=[0, 1])
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    # zero_division=0 mirrors precision/recall and avoids the undefined-metric warning.
    f1   = f1_score(y_true, y_pred, zero_division=0)
    # ROC AUC is undefined with a single class in y_true.
    auc  = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else float('nan')
    return cm, acc, prec, rec, f1, auc

# Evaluate both splits with the same helper; each call returns (cm, acc, prec, rec, f1, auc).
train_cm, train_acc, train_prec, train_rec, train_f1, train_auc = evaluate(train_y, train_p, train_c)
val_cm,   val_acc,   val_prec,   val_rec,   val_f1,   val_auc   = evaluate(val_y,   val_p,   val_c)

# Training-split summary (scalar metrics to three decimals, then the raw confusion matrix).
print("\nTRAIN METRICS")
print(f"Acc {train_acc:.3f} | Prec {train_prec:.3f} | Rec {train_rec:.3f} | F1 {train_f1:.3f} | AUC {train_auc:.3f}")
print("Confusion Matrix:\n", train_cm)

# Validation-split summary in the same layout.
print("\nVAL METRICS")
print(f"Acc {val_acc:.3f} | Prec {val_prec:.3f} | Rec {val_rec:.3f} | F1 {val_f1:.3f} | AUC {val_auc:.3f}")
print("Confusion Matrix:\n", val_cm)

# --- visualize confusion matrices ---
# One panel per split, drawn with identical styling and axis labels.
fig, ax = plt.subplots(1, 2, figsize=(10,4))
panels = (
    (ax[0], train_cm, 'Train Confusion Matrix'),
    (ax[1], val_cm, 'Validation Confusion Matrix'),
)
for panel, matrix, heading in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel)
    panel.set_title(heading)
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')
plt.tight_layout()
plt.show()


In [None]:
#Discretization

In [None]:
# --- DISCRETIZATION: Convert continuous discharge into wet/dry ---
DRY_THRESHOLD = 0.00014   # per Jake's message (presumably in raw Discharge_CMS units)

# merged_df["Discharge_CMS"] was standardized in place by the feature-scaling cell, so
# comparing it with a raw-unit threshold would split rows around the column mean instead.
# Rebuild the unscaled series from obs_wide, aligned row-for-row with merged_df.
site_date_keys = ["NHDPlusID", "Date"]
raw_discharge = merged_df[site_date_keys].merge(
    obs_wide[site_date_keys + ["Discharge_CMS"]], on=site_date_keys, how="left"
)
raw_discharge.index = merged_df.index  # a left merge keeps the left frame's row order
# Carry the last observed discharge forward within each site, never across sites.
raw_discharge = raw_discharge.groupby("NHDPlusID")["Discharge_CMS"].ffill()

# Create a new binary column from discharge
# NOTE(review): rows with no discharge value compare False and are labelled 0 (dry), as
# before; consider leaving them missing instead.
merged_df["wetdry_discharge"] = (raw_discharge >= DRY_THRESHOLD).astype(int)

print("Wet/dry (from discharge) distribution:")
print(merged_df["wetdry_discharge"].value_counts())

In [None]:
# --- Combine HOBO + discharge discretization ---
# Start from the HOBO-derived label, which takes precedence wherever it is present.
merged_df["wetdry_final"] = merged_df["HoboWetDry0.05"]

# Fill missing HOBO with discharge discretization
# (wetdry_discharge is an int column without NaNs, so no missing labels remain afterwards).
merged_df["wetdry_final"] = merged_df["wetdry_final"].fillna(merged_df["wetdry_discharge"])

print("Final combined wet/dry distribution:")
print(merged_df["wetdry_final"].value_counts())

In [None]:
# Switch the label to the combined wet/dry column and rebuild the feature list around it.
# NOTE(review): only Date, NHDPlusID and the new label are excluded, so "HoboWetDry0.05" and
# "wetdry_discharge" (the two columns the label is built from) are now listed as features.
target_col = "wetdry_final"

feature_cols = [c for c in merged_df.columns
                if c not in ["Date", "NHDPlusID", target_col]]

In [None]:
# Create discharge-based wet/dry
# merged_df["Discharge_CMS"] holds standardized values (the feature-scaling cell overwrote it
# in place), so DRY_THRESHOLD must be applied to the unscaled discharge instead. Recover it
# from obs_wide, aligned row-for-row with merged_df.
site_date_keys = ["NHDPlusID", "Date"]
raw_discharge = merged_df[site_date_keys].merge(
    obs_wide[site_date_keys + ["Discharge_CMS"]], on=site_date_keys, how="left"
)
raw_discharge.index = merged_df.index  # a left merge keeps the left frame's row order
# Carry the last observed discharge forward within each site, never across sites.
raw_discharge = raw_discharge.groupby("NHDPlusID")["Discharge_CMS"].ffill()
# NOTE(review): rows with no discharge value compare False and are labelled 0 (dry), as before.
merged_df["wetdry_discharge"] = (raw_discharge >= DRY_THRESHOLD).astype(int)

# Combine HOBO + discharge discretization (HOBO wins wherever it is present)
merged_df["wetdry_final"] = merged_df["HoboWetDry0.05"]
merged_df["wetdry_final"] = merged_df["wetdry_final"].fillna(merged_df["wetdry_discharge"])

print("Final wet/dry distribution:")
print(merged_df["wetdry_final"].value_counts())

In [None]:
# pick the sites that have both classes
valid_obs = merged_df.dropna(subset=["wetdry_final"])
site_variation = valid_obs.groupby("NHDPlusID")["wetdry_final"].nunique()
sites_with_both = site_variation[site_variation > 1].index.tolist()

# pick one or all sites
# Sort by site first, then date: sorting by Date alone interleaves rows from different sites,
# so a window of consecutive rows would mix several sites' observations for the same days.
site_df = merged_df[merged_df["NHDPlusID"].isin(sites_with_both)].sort_values(["NHDPlusID", "Date"])


In [None]:
df = site_df.copy()

# 1. Fill all NaNs per feature, forward then backward, within each site.
#    fillna(method=...) is deprecated in pandas 2.x, and filling the whole frame at once would
#    copy values from one site's rows into another's. Grouped fills keep each site self-contained.
df = df.fillna(df.groupby("NHDPlusID").ffill())
df = df.fillna(df.groupby("NHDPlusID").bfill())

# 2. If any remain (a site that never observed the column at all)
df = df.fillna(0)

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# 1. Keep numeric only
# NOTE(review): df still contains the label column at this point (it is dropped in a later
# cell), so any numeric label/ID columns are scaled together with the features.
df_numeric = df.select_dtypes(include=[np.number])

print("Dropped columns:", set(df.columns) - set(df_numeric.columns))

# 2. Scale
# Rebinding `scaler` replaces the scaler fitted on merged_df earlier in the notebook.
scaler = StandardScaler()
scaled = scaler.fit_transform(df_numeric.values)
# The new frame gets a fresh 0..n-1 index; rows match df by position only.
df_scaled = pd.DataFrame(scaled, columns=df_numeric.columns)


In [None]:
target_col = "wetdry_final"  # adjust to correct name

# Pull the label out as an array, then remove it from df.
# Not idempotent: a second run raises KeyError because the column is already gone.
y = df[target_col].values
df = df.drop(columns=[target_col])

In [None]:
# Recompute the numeric view now that the label column has been dropped from df.
df_numeric = df.select_dtypes(include=[np.number])


In [None]:
def create_sequences(df, seq_len=30, target_col="wetdry_final"):
    """Build sliding windows over every column of ``df`` for next-row prediction.

    Window ``i`` is rows ``i .. i+seq_len-1`` of ``df.values`` (all columns, including
    ``target_col`` itself) and is paired with the ``target_col`` value of row ``i+seq_len``.
    ``seq_len`` can be raised (e.g. to 60) for longer context.

    Returns ``(X, y)`` as numpy arrays with ``len(df) - seq_len`` entries each. Both are
    empty 1-D arrays when the frame has no more than ``seq_len`` rows.

    Note: this redefinition replaces the earlier ``create_sequences`` in the notebook.
    """
    matrix = df.values
    targets = df[target_col].values
    starts = range(len(df) - seq_len)

    windows = [matrix[s:s + seq_len] for s in starts]
    next_labels = [targets[s + seq_len] for s in starts]
    return np.array(windows), np.array(next_labels)


In [None]:
# Name of the label column used by the cells below.
target_col = "wetdry_final"  # CHANGE THIS IF NEEDED


In [None]:
# Integer labels taken from the unfilled site_df; astype(int) raises if any label is NaN.
y_original = site_df[target_col].astype(int).values


In [None]:
# Attach the unscaled integer label to the scaled features. The assignment is positional,
# so y_original must have one entry per row of df_scaled.
df_scaled_with_label = df_scaled.copy()
df_scaled_with_label[target_col] = y_original

In [None]:
#can change sequence length here too
# Build the windows from the scaled, numeric frame prepared in the previous cell. Passing
# site_df fed unscaled values, NaNs and the Date column into the sequences, which does not
# match the consolidated pipeline cell below.
X, y = create_sequences(df_scaled_with_label, seq_len=30)


In [None]:
# ===========================================
# 1. Clean site_df
# ===========================================
df = site_df.copy()
# Fill forward, then backward, within each site, and finally with 0 for columns a site never
# observed. fillna(method=...) is deprecated in pandas 2.x, and an ungrouped fill would copy
# values (labels included) from one site's rows into another's.
df = df.fillna(df.groupby("NHDPlusID").ffill())
df = df.fillna(df.groupby("NHDPlusID").bfill())
df = df.fillna(0)

target_col = "wetdry_final"     # <----- your label
y_original = df[target_col].astype(int).values

# ===========================================
# 2. Select numeric features only (remove date)
# ===========================================
df_numeric = df.select_dtypes(include=[np.number]).drop(columns=[target_col], errors='ignore')

# ===========================================
# 3. Scale features
# ===========================================
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled = scaler.fit_transform(df_numeric)
# Fresh 0..n-1 index; rows still correspond to df by position.
df_scaled = pd.DataFrame(scaled, columns=df_numeric.columns)

# ===========================================
# 4. Add label back
# ===========================================
df_scaled_with_label = df_scaled.copy()
df_scaled_with_label[target_col] = y_original

print("df_scaled_with_label shape:", df_scaled_with_label.shape)

# ===========================================
# 5. Build sequences
# ===========================================
X, y = create_sequences(df_scaled_with_label, seq_len=30, target_col=target_col)

print("NaNs in X:", np.isnan(X).sum())
print("NaNs in y:", np.isnan(y).sum())
print("Shapes:", X.shape, y.shape)


In [None]:
!pip install imbalanced-learn


In [None]:
#ADASYN for class imbalance

In [None]:
from imblearn.over_sampling import ADASYN


In [None]:
# Oversample the minority class with ADASYN. It works on 2-D inputs, so each window is
# flattened to one row and the resampled rows are folded back afterwards.
# NOTE(review): this runs before the train/validation split (done in a later cell on tensors
# built from X_res), so synthetic samples end up in the validation set. Consider splitting
# first and resampling the training part only.
n, T, d = X.shape

# Flatten sequences: (n, T, d) -> (n, T*d)
X_flat = X.reshape(n, T * d)

adasyn = ADASYN(random_state=42)
X_flat_res, y_res = adasyn.fit_resample(X_flat, y.astype(int))

# Reshape back to (n_resampled, T, d)
X_res = X_flat_res.reshape(-1, T, d)

print("Original class counts:", np.bincount(y.astype(int)))
print("After ADASYN:", np.bincount(y_res.astype(int)))
print("Original shape:", X.shape, "Resampled shape:", X_res.shape)



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


# =======================
# Convert to tensors (use ADASYN output)
# =======================
# float32 throughout; the label is turned into an (n, 1) column after conversion.
X_tensor = torch.tensor(X_res, dtype=torch.float32)
y_tensor = torch.tensor(y_res, dtype=torch.float32).reshape(-1, 1)

for label, tensor in (("X_tensor:", X_tensor), ("y_tensor:", y_tensor)):
    print(label, tensor.shape)


In [None]:



# =======================
# Train/Val split
# =======================
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split of the tensors built from the ADASYN output.
# NOTE(review): those tensors already contain synthetic samples, so validation metrics are
# computed partly on generated data.
X_train, X_val, y_train, y_val = train_test_split(
    X_tensor, y_tensor, test_size=0.2, shuffle=True, random_state=42
)

print("Train:", X_train.shape, "Val:", X_val.shape)

# =======================
# Compute pos_weight safely
# =======================
# pos/neg count the two classes in the training labels, but they are not used below:
# the weight is fixed at 1.0, i.e. no class re-weighting in the loss.
pos = int(y_train.sum().item())
neg = int((1 - y_train).sum().item())
safe_pos_weight = 1.0  # KEEP MILD FOR NOW

print("safe_pos_weight =", safe_pos_weight)
pos_weight_tensor = torch.tensor([safe_pos_weight], dtype=torch.float32)

# =======================
# Define LSTM Model
# =======================
class LSTMModel(nn.Module):
    """LSTM binary classifier that returns raw logits for the last time step.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int, default 48
        Width of the LSTM hidden state.
    num_layers : int, default 1
        Number of stacked LSTM layers.
    dropout : float, default 0.2
        Dropout rate between stacked LSTM layers (ignored when ``num_layers == 1``).

    ``forward`` takes ``(batch, seq_len, input_size)`` and returns logits of shape
    ``(batch, 1)``. Pair it with ``BCEWithLogitsLoss`` and apply ``torch.sigmoid`` to get
    probabilities.
    """

    def __init__(self, input_size, hidden_size=48, num_layers=1, dropout=0.2):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers. With the default single
        # layer, the default rate of 0.2 would do nothing except emit a UserWarning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=lstm_dropout,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        # Classify from the hidden state of the final time step only.
        logits = self.fc(out[:, -1, :])
        return logits


# Default-sized network; the input width is the feature axis of X_tensor.
model = LSTMModel(input_size=X_tensor.shape[2])

# The model emits logits, so the loss applies the sigmoid itself (with the configured pos_weight).
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Only the training batches are shuffled.
train_loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=32)
val_loader = DataLoader(TensorDataset(X_val, y_val), shuffle=False, batch_size=32)

# =======================
# Train with early stopping
# =======================
best_val_loss = float("inf")
best_state = None
patience = 3   # epochs without improvement tolerated before stopping
wait = 0
epochs = 15

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for xb, yb in train_loader:
        optimizer.zero_grad()

        logits = model(xb)
        # Replace NaN/inf logits so one bad batch cannot poison the loss.
        logits = torch.nan_to_num(logits, nan=0.0, posinf=5.0, neginf=-5.0)

        loss = criterion(logits, yb)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

    # ---- validation loss ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            logits = model(xb)
            logits = torch.nan_to_num(logits, nan=0.0, posinf=5.0, neginf=-5.0)
            val_loss += criterion(logits, yb).item()

    print(f"Epoch {epoch+1}/{epochs} | Train Loss={total_loss/len(train_loader):.4f} | Val Loss={val_loss/len(val_loader):.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() returns references to the live parameter tensors and dict.copy() is
        # shallow, so each tensor must be cloned. Otherwise later optimizer steps overwrite
        # the "best" snapshot and the final weights get restored instead.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered.")
            break

# best_state stays None when the validation loss never compared below +inf (e.g. it was NaN).
if best_state is not None:
    model.load_state_dict(best_state)
    print("Loaded best model state.")
else:
    print("Validation loss never improved (NaN?); keeping the final weights.")


In [None]:
# =======================
# Predict safely
# =======================
def get_probs(loader, net=None):
    """Score every batch of ``loader`` and return probabilities with their labels.

    Parameters
    ----------
    loader : iterable of (inputs, labels)
        Batches to score, e.g. a DataLoader.
    net : torch.nn.Module, optional
        Model returning logits. Defaults to the notebook-level ``model``.

    Returns
    -------
    (probs, actuals) : tuple of 1-D numpy.ndarray
        Sigmoid probabilities and labels, concatenated in loader order. NaN logits are
        treated as 0, i.e. a probability of 0.5.
    """
    if net is None:
        net = model
    net.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for xb, yb in loader:
            logits = net(xb)
            logits = torch.nan_to_num(logits, nan=0.0)

            # reshape(-1) keeps a 1-D array even for a final batch of one sample;
            # squeeze() would return a 0-d array and break np.concatenate.
            probs = torch.sigmoid(logits).reshape(-1).cpu().numpy()
            probs = np.nan_to_num(probs, nan=0.0)

            preds.append(probs)
            actuals.append(yb.reshape(-1).cpu().numpy())

    return np.concatenate(preds), np.concatenate(actuals)

# Score both splits (train first, then validation).
train_probs, train_true = get_probs(train_loader)
val_probs, val_true = get_probs(val_loader)

# =======================
# Apply threshold
# =======================
# A probability at or above the cut-off counts as the positive class.
threshold = 0.5
train_pred, val_pred = ((scores >= threshold).astype(int) for scores in (train_probs, val_probs))

# =======================
# Compute metrics
# =======================
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def compute_metrics(y_true, y_prob, y_pred):
    """Return ``(confusion_matrix, accuracy, precision, recall, f1, auc)`` for one split.

    ``y_true`` and ``y_pred`` are 0/1 arrays; ``y_prob`` holds the predicted probabilities
    and is used for AUC only. The confusion matrix is always 2x2 (label order 0, 1), and
    ``auc`` is NaN when ``y_true`` holds a single class, where ROC AUC is undefined.
    """
    # Guarded the same way as the later compute_metrics definition in this notebook.
    auc = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else float('nan')
    return (
        # Fixed label order keeps the matrix 2x2 even if a class is absent from the split.
        confusion_matrix(y_true, y_pred, labels=[0, 1]),
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        # zero_division=0 mirrors precision/recall and avoids the undefined-metric warning.
        f1_score(y_true, y_pred, zero_division=0),
        auc
    )

# Evaluate both splits; each call returns (cm, acc, prec, rec, f1, auc) in that order.
train_cm, train_acc, train_prec, train_rec, train_f1, train_auc = compute_metrics(train_true, train_probs, train_pred)
val_cm,   val_acc,   val_prec,   val_rec,   val_f1,   val_auc   = compute_metrics(val_true, val_probs, val_pred)

# Raw dump, unlabelled: confusion matrix, accuracy, precision, recall, F1, AUC.
print("\nTRAIN METRICS")
print(train_cm, train_acc, train_prec, train_rec, train_f1, train_auc)

print("\nVAL METRICS")
print(val_cm, val_acc, val_prec, val_rec, val_f1, val_auc)

# =======================
# Plot confusion matrices
# =======================
import seaborn as sns
import matplotlib.pyplot as plt

# Side-by-side confusion matrices for the two splits, drawn with identical styling.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax[0], train_cm, "Train Confusion Matrix"),
    (ax[1], val_cm, "Validation Confusion Matrix"),
)
for panel, matrix, heading in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel)
    panel.set_title(heading)

plt.show()


In [None]:
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
import numpy as np

# ---------------------------------------------------
# 1. GET PREDICTIONS
# ---------------------------------------------------
# Re-scores both splits with get_probs. The training loader shuffles, so the row order of
# train_probs/train_true differs between runs (each pair stays aligned).
train_probs, train_true = get_probs(train_loader)
val_probs,   val_true   = get_probs(val_loader)

# ---------------------------------------------------
# 2. SANITIZE ALL VALUES (critical)
# ---------------------------------------------------
# Map NaN -> 0, +inf -> 1 and -inf -> 0 in the probabilities, and NaN -> 0 in the labels,
# so the metric functions below never receive non-finite values.
train_probs = np.nan_to_num(train_probs, nan=0.0, posinf=1.0, neginf=0.0)
val_probs   = np.nan_to_num(val_probs,   nan=0.0, posinf=1.0, neginf=0.0)

train_true = np.nan_to_num(train_true, nan=0.0)
val_true   = np.nan_to_num(val_true,   nan=0.0)

# ---------------------------------------------------
# 3. CHOOSE THRESHOLD
# ---------------------------------------------------
threshold = 0.5  # you can swap in your tuned threshold

# A probability at or above the threshold counts as the positive class.
train_pred = (train_probs >= threshold).astype(int)
val_pred   = (val_probs   >= threshold).astype(int)

# ---------------------------------------------------
# 4. EVALUATION FUNCTION
# ---------------------------------------------------
def compute_metrics(y_true, y_prob, y_pred):
    """Return ``(confusion_matrix, accuracy, precision, recall, f1, auc)`` for one split.

    ``y_true`` and ``y_pred`` are 0/1 arrays; ``y_prob`` holds the predicted probabilities
    and is used for AUC only. The confusion matrix is always 2x2 (label order 0, 1), and
    ``auc`` is NaN when ``y_true`` holds a single class, where ROC AUC is undefined.
    """
    # Fixed label order keeps the matrix 2x2 even if a class is absent from the split.
    cm   = confusion_matrix(y_true, y_pred, labels=[0, 1])
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    # zero_division=0 mirrors precision/recall and avoids the undefined-metric warning.
    f1   = f1_score(y_true, y_pred, zero_division=0)
    auc  = roc_auc_score(y_true, y_prob) if len(np.unique(y_true)) > 1 else float('nan')
    return cm, acc, prec, rec, f1, auc

# ---------------------------------------------------
# 5. COMPUTE TRAIN + VAL METRICS
# ---------------------------------------------------
# Each call returns (cm, acc, prec, rec, f1, auc) in that order.
train_cm, train_acc, train_prec, train_rec, train_f1, train_auc = \
    compute_metrics(train_true, train_probs, train_pred)

val_cm, val_acc, val_prec, val_rec, val_f1, val_auc = \
    compute_metrics(val_true, val_probs, val_pred)

# ---------------------------------------------------
# 6. PRINT METRICS
# ---------------------------------------------------
# One labelled line per metric (four decimals), followed by the confusion matrix.
print("\n================ TRAIN METRICS ================")
print(f"Accuracy:  {train_acc:.4f}")
print(f"Precision: {train_prec:.4f}")
print(f"Recall:    {train_rec:.4f}")
print(f"F1 Score:  {train_f1:.4f}")
print(f"AUC:       {train_auc:.4f}")
print("Confusion Matrix:\n", train_cm)

print("\n================ VAL METRICS ================")
print(f"Accuracy:  {val_acc:.4f}")
print(f"Precision: {val_prec:.4f}")
print(f"Recall:    {val_rec:.4f}")
print(f"F1 Score:  {val_f1:.4f}")
print(f"AUC:       {val_auc:.4f}")
print("Confusion Matrix:\n", val_cm)


In [None]:
#MISC Visualizations

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Restrict to numeric columns, then draw the pairwise correlation matrix as a heatmap
# centred on zero.
numeric_df = df_scaled_with_label.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12,10))
sns.heatmap(corr_matrix, annot=False, center=0, cmap='coolwarm')
plt.title("Correlation Heatmap of All Numeric Features", fontsize=16)
plt.show()

In [None]:
# Correlation of every numeric column with the label, strongest positive first.
# The label's correlation with itself (1.0) is included and appears as the top bar.
corr_target = numeric_df.corr()[target_col].sort_values(ascending=False)

plt.figure(figsize=(6,10))
# NOTE(review): recent seaborn releases may warn about `palette` without `hue` — confirm
# against the installed version.
sns.barplot(y=corr_target.index, x=corr_target.values, palette="viridis")
plt.title("Correlation of Features with Wet/Dry Target")
plt.xlabel("Correlation")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

In [None]:
df = site_df.copy()
df = df.sort_values("Date")

# Lag within each site. site_df can hold several sites, and an ungrouped shift() would pair
# a row with whichever site's row happened to sit `lag` positions earlier. This matches the
# grouped shift used for Discharge_lag1 elsewhere in the notebook.
# Lags count rows, so they equal days only where a site's record has no date gaps.
lags = (1, 7, 30)
for lag in lags:
    df[f"discharge_lag{lag}"] = df.groupby("NHDPlusID")["Discharge_CMS"].shift(lag)

plt.figure(figsize=(14,4))
for position, lag in enumerate(lags, start=1):
    plt.subplot(1, len(lags), position)
    sns.scatterplot(x=df[f"discharge_lag{lag}"], y=df["Discharge_CMS"], s=10)
    plt.title(f"Discharge vs Lag-{lag}")
    plt.xlabel(f"Lag {lag}")
    if position == 1:
        # Only the first panel carries the custom y label, as before.
        plt.ylabel("Current")

plt.tight_layout()
plt.show()

In [None]:
# Rolling share of wet labels over the date-sorted rows of site_df.
df = site_df.copy().sort_values("Date")

# The window is 100 consecutive rows, not 100 days. site_df can contain several sites, so
# rows from different sites on nearby dates fall into the same window. The first 99 values are NaN.
df["rolling_wet"] = df[target_col].rolling(100).mean()

plt.figure(figsize=(12,4))
# NOTE(review): if Date is stored as strings, matplotlib treats the x axis as categorical;
# converting with pd.to_datetime first would give a real time axis — confirm the dtype.
plt.plot(df["Date"], df["rolling_wet"])
plt.title("Rolling % of Wet Days (Window=100)")
plt.ylabel("Pct Wet")
plt.xlabel("Date")
plt.grid(True)
plt.show()

In [None]:
# Mean of the label per site, sorted ascending. For a 0/1 label this is the fraction of
# observations labelled 1.
counts = merged_df.groupby("NHDPlusID")[target_col].mean().sort_values()

plt.figure(figsize=(14,5))
# NOTE(review): recent seaborn releases may warn about `palette` without `hue` — confirm
# against the installed version.
sns.barplot(x=counts.index.astype(str), y=counts.values, palette="plasma")
plt.xticks([], [])  # hide long site IDs
plt.title("Fraction of Wet Observations per Site")
plt.ylabel("Pct Wet")
plt.show()

In [None]:
import numpy as np

def perm_importance(model, X_val, y_val, n_repeats=5, metric=None):
    """Permutation importance per feature, measured as the drop in a score.

    For each feature index, the feature's values are shuffled across samples (whole
    sequences are swapped, so within-window ordering is preserved), the model is re-scored,
    and the mean score over ``n_repeats`` shuffles is subtracted from the baseline.

    Parameters
    ----------
    model : torch.nn.Module
        Model returning logits for a ``(n, seq_len, n_features)`` input.
    X_val : torch.Tensor
        Validation inputs of shape ``(n, seq_len, n_features)``.
    y_val : array-like
        True 0/1 labels, length ``n``.
    n_repeats : int
        Number of shuffles per feature.
    metric : callable, optional
        ``metric(y_true, y_pred) -> float``. Defaults to ``f1_score``.

    Returns
    -------
    numpy.ndarray
        One importance value per feature (baseline score minus mean permuted score).
    """
    if metric is None:
        metric = f1_score

    def _score(inputs):
        # No gradients are needed for scoring; flatten to 1-D so the metric receives the
        # same shape as y_val.
        with torch.no_grad():
            probs = torch.sigmoid(model(inputs)).reshape(-1).cpu().numpy()
        return metric(y_val, probs > 0.5)

    # Score in eval mode so dropout cannot add noise that would be mistaken for a feature
    # effect; the caller's mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        base_score = _score(X_val)
        importances = []
        for col in range(X_val.shape[2]):
            permuted_scores = []
            for _ in range(n_repeats):
                X_permuted = X_val.clone()
                X_permuted[:, :, col] = X_val[:, :, col][torch.randperm(X_val.shape[0])]
                permuted_scores.append(_score(X_permuted))
            importances.append(base_score - np.mean(permuted_scores))
    finally:
        model.train(was_training)

    return np.array(importances)

# Permutation importance of the current `model` on the validation tensors. Labels are
# flattened to a 1-D numpy array for the metric.
# Needs `model`, `X_val` and `y_val` from the training cells to exist in the kernel.
imps = perm_importance(model, X_val, y_val.squeeze().numpy())
plt.figure(figsize=(8,6))
# Bars are indexed by feature position along the last axis of X_val, not by column name.
sns.barplot(x=np.arange(len(imps)), y=imps)
plt.title("Permutation Importance per Feature Index")
plt.xlabel("Feature Index")
plt.ylabel("Importance (F1 Drop)")
plt.show()

NameError: name 'model' is not defined

In [None]:
# some quick sanity checks before more modeling
print("Rows:", len(merged_df))
print("Date range:", merged_df["Date"].min(), "→", merged_df["Date"].max())

# Requires the discretization cell to have run first: "wetdry_discharge" is created there,
# and this lookup raises KeyError otherwise.
print("\nTarget balance (wetdry_discharge):")
print(merged_df["wetdry_discharge"].value_counts())

#make sure no nulls and stuff
# Totals across all numeric columns (a single number each, not a per-column breakdown).
num_cols = merged_df.select_dtypes(include=[np.number]).columns
print("\nNaNs in numeric cols:", merged_df[num_cols].isna().sum().sum())
print("Infs in numeric cols:", np.isinf(merged_df[num_cols]).sum().sum())


Rows: 158603
Date range: 1980-01-01 → 2023-06-26

Target balance (wetdry_discharge):


KeyError: 'wetdry_discharge'

In [None]:
# correlation with target
# Correlation of every numeric column with wetdry_discharge, sorted ascending. The target's
# own entry is dropped before plotting. Uses num_cols from the sanity-check cell.
corr = merged_df[num_cols].corr()["wetdry_discharge"].sort_values()
plt.figure(figsize=(6, 4))
corr.drop("wetdry_discharge").plot(kind="barh")
plt.title("Correlation with wetdry_discharge")
plt.tight_layout()
plt.show()


In [None]:
# discharge vs lagged discharge
# The shift is grouped by site, so a lag never crosses from one site into the next.
# Side effect: adds a "Discharge_lag1" column to merged_df.
merged_df["Discharge_lag1"] = merged_df.groupby("NHDPlusID")["Discharge_CMS"].shift(1)

# Needs matplotlib.pyplot imported as plt in the running kernel.
plt.figure(figsize=(4, 4))
plt.scatter(merged_df["Discharge_CMS"],merged_df["Discharge_lag1"],s=3,alpha=0.3)
plt.xlabel("Discharge (today)")
plt.ylabel("Discharge (lag 1)")
plt.title("Discharge persistence")
plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

In [None]:
# class imbalance summary(train split)
print("Train label counts:")
print(pd.Series(y_train.flatten()).value_counts())
# neg/pos is the ratio one would pass as pos_weight to BCEWithLogitsLoss. It is reported as
# a candidate only: the training cell fixes safe_pos_weight at 1.0, so the earlier
# "pos_weight used" wording described a value that was never applied.
pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
print(f"neg/pos ratio (candidate pos_weight) = {neg / max(pos, 1):.2f}")

Train label counts:
1.0    12682
Name: count, dtype: int64
pos_weight used = 0.00
