In [None]:
#imports

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide display settings
sns.set(context="talk", style="whitegrid")
pd.options.display.max_columns = None  # never truncate columns when a frame is displayed

# Shared constants
EPS = 1e-6  # small constant to avoid divide-by-zero issues
RANDOM_STATE = 42  # seed handed to the stochastic steps so reruns are reproducible

In [None]:
# Reshape observational + static data
# Both tables arrive in long format (one row per variable/value pair), so each is
# pivoted wide: one row per site-date for observations, one row per site for statics.

def _long_to_wide(long_df, index_cols):
    """Pivot a long `variable` / `value` table into one column per variable.

    Rows sharing the same index/variable pair are collapsed with aggfunc="first".
    """
    wide = long_df.pivot_table(
        index=index_cols,
        columns="variable",
        values="value",
        aggfunc="first",
    ).reset_index()
    wide.columns.name = None  # drop the leftover "variable" label on the column axis
    return wide


obs_wide = _long_to_wide(obs, ["NHDPlusID", "Date"])
static_wide = _long_to_wide(static, "NHDPlusID")

# Make sure IDs line up cleanly before merging: cast the site key to str everywhere.
# Note that `drivers` and `degrees` are modified in place here.
for frame in (obs_wide, drivers, static_wide, degrees):
    frame["NHDPlusID"] = frame["NHDPlusID"].astype(str)

# Parse dates (unparseable entries become NaT because of errors="coerce") and
# pull out month as a simple seasonal feature.
for dated in (obs_wide, drivers):
    dated["Date"] = pd.to_datetime(dated["Date"], errors="coerce")
    dated["month"] = dated["Date"].dt.month

# Keep only rows where the wet/dry label exists
has_label = obs_wide["HoboWetDry0.05"].notna()
obs_wide = obs_wide.loc[has_label].copy()

In [None]:

# Create lagged climate variables
# Lags are computed *within each site* so values never bleed across sites.
# shift(+k) moves each value k rows later in the site's date-sorted series, i.e. each
# row receives the value from k rows earlier.
# NOTE(review): that equals "k days in the past" only if every site has exactly one
# row per consecutive day — confirm the drivers table has no gaps or duplicates.

LAG_FEATURES = ["prcp", "tmax", "tmin", "srad", "rhmax", "rhmin", "vp", "ws"]
LAG_DAYS = 1  # set to 0 (no lag), 1 (1-day), or 7 (1-week); 0 just copies each column as *_lag0

drivers = drivers.sort_values(["NHDPlusID", "Date"]).copy()

# Build every lagged column first, then attach them in one assignment.
per_site = drivers.groupby("NHDPlusID")
lagged_columns = {
    f"{name}_lag{LAG_DAYS}": per_site[name].shift(LAG_DAYS)
    for name in LAG_FEATURES
}
drivers = drivers.assign(**lagged_columns)


In [None]:
# Merge everything into one table
# Inner joins ensure we only keep rows where all required information is available.

# `month` is derived from Date on both obs_wide and drivers. Because the join key
# includes Date, the two copies are identical, and merging both would produce
# `month_x` / `month_y` columns that later enter the feature matrix as duplicated
# predictors. Keep only the obs_wide copy.
drivers_for_merge = drivers.drop(columns=["month"], errors="ignore")

merged = (
    obs_wide
    .merge(drivers_for_merge, on=["NHDPlusID", "Date"], how="inner")
    .merge(static_wide, on="NHDPlusID", how="inner")
    .merge(degrees, on="NHDPlusID", how="inner")
)

print("Merged shape:", merged.shape)

In [None]:
# Feature engineering
# These features try to relate dynamic signals (e.g. precip, temp) to static watershed
# properties rather than letting the model memorize static indicators alone.
#
# Each derived feature is created only when *all* of its input columns are present.
# Checking just the static column is not enough: a missing dynamic column such as
# Discharge_CMS would otherwise raise a KeyError halfway through the cell.

# (new column, required input columns, how to compute it from the merged frame)
_DERIVED_FEATURES = [
    ("prcp_per_area", ["prcp", "AreaSqKm"],
     lambda d: d["prcp"] / (d["AreaSqKm"] + EPS)),
    ("discharge_per_area", ["Discharge_CMS", "AreaSqKm"],
     lambda d: d["Discharge_CMS"] / (d["AreaSqKm"] + EPS)),
    ("tmax_times_slope", ["tmax", "Slope"],
     lambda d: d["tmax"] * d["Slope"]),
    ("tmin_times_slope", ["tmin", "Slope"],
     lambda d: d["tmin"] * d["Slope"]),
    ("flow_per_length", ["Discharge_CMS", "LengthKM"],
     lambda d: d["Discharge_CMS"] / (d["LengthKM"] + EPS)),
    # The /100 presumably converts centimetres to metres, judging by the column
    # name — TODO confirm units.
    ("temp_minus_elev", ["tmax", "elev_mean_cm"],
     lambda d: d["tmax"] - (d["elev_mean_cm"] / 100)),
]

skipped_features = []
for feature_name, required_cols, compute in _DERIVED_FEATURES:
    if all(col in merged.columns for col in required_cols):
        merged[feature_name] = compute(merged)
    else:
        skipped_features.append(feature_name)

# Report skipped features so a missing input column is visible rather than silent.
if skipped_features:
    print("Skipped derived features (missing input columns):", skipped_features)

In [None]:
# Build feature matrix and target

TARGET_COL = "HoboWetDry0.05"
META_COLS = ["NHDPlusID", "Date", "Flow_Status"]

# Columns that must never be used as predictors: the label plus any metadata present.
drop_cols = [TARGET_COL, *(c for c in META_COLS if c in merged.columns)]

# NOTE(review): every remaining numeric column becomes a feature, including any other
# observed variables pivoted into obs_wide — confirm none of them encode the label.
X = (
    merged
    .drop(columns=drop_cols, errors="ignore")
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)  # treat +/-inf as missing
)

# Clean up any weird values before modeling: column mean first, then 0 for columns
# that are entirely missing.
X = X.fillna(X.mean()).fillna(0)
y = merged[TARGET_COL].astype(int)

print("Feature matrix shape:", X.shape)
print("Label distribution:", y.value_counts(normalize=True))

In [None]:
# Temporal train/test split

# We split by date to mimic a real forecasting scenario:
# train on earlier data, test on later data.
# The cut is made by row position after sorting on Date, so rows that share the
# boundary date can fall on both sides of the split.

TRAIN_FRACTION = 0.8

merged_sorted = merged.sort_values("Date")
cutoff = int(len(merged_sorted) * TRAIN_FRACTION)

train_df = merged_sorted.iloc[:cutoff]
test_df = merged_sorted.iloc[cutoff:]


def _numeric_features(frame):
    """Return the numeric predictor columns of `frame` with +/-inf turned into NaN."""
    return (
        frame
        .drop(columns=drop_cols, errors="ignore")
        .select_dtypes(include=[np.number])
        .replace([np.inf, -np.inf], np.nan)
    )


# Replace infinities *before* computing any statistics: a mean taken over a column
# that still contains inf is itself inf/NaN and would be written back into the gaps.
X_train = _numeric_features(train_df)
X_test = _numeric_features(test_df)

y_train = train_df[TARGET_COL].astype(int)
y_test = test_df[TARGET_COL].astype(int)

# Fill missing values using training statistics only. The means are computed once,
# from the un-imputed training data, and reused for both sets. Columns that are
# entirely missing in training have no mean and fall back to 0.
train_means = X_train.mean()
X_train = X_train.fillna(train_means).fillna(0)
X_test = X_test.fillna(train_means).fillna(0)

print("Train dates:", train_df["Date"].min(), "→", train_df["Date"].max())
print("Test dates:", test_df["Date"].min(), "→", test_df["Date"].max())


In [None]:
# Scale features
# Standardization helps LR converge and keeps coefficients comparable.
# The scaler learns its statistics from the training set only and is then applied
# unchanged to both sets.

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Handle class imbalance with SMOTE
# SMOTE is applied *only to the training set*.

SMOTE_K_NEIGHBORS = 5  # preferred neighbourhood size; reduced below if the minority class is tiny

class_counts = np.bincount(y_train)
print("Before SMOTE:", class_counts)

present_counts = class_counts[class_counts > 0]
minority_n = int(present_counts.min()) if present_counts.size else 0

if present_counts.size < 2 or minority_n < 2:
    # SMOTE interpolates between minority samples, so it needs two classes and at
    # least two minority rows. Otherwise train on the data as-is.
    print("Skipping SMOTE: need two classes with at least 2 minority samples.")
    X_train_smote, y_train_smote = X_train_scaled, y_train
else:
    # k_neighbors must be smaller than the number of minority samples, otherwise
    # the neighbour search inside SMOTE raises a ValueError.
    k_neighbors = min(SMOTE_K_NEIGHBORS, minority_n - 1)
    smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_neighbors)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print("After SMOTE:", np.bincount(y_train_smote))


In [None]:
# Train + evaluate logistic regression
# The model is fit on the SMOTE-resampled training data and scored on the untouched,
# scaled test set.

model = LogisticRegression(max_iter=3000, random_state=RANDOM_STATE)
model.fit(X_train_smote, y_train_smote)

y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]  # probability of class 1

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# AUC can be undefined if the test set has only one class
auc = np.nan
if len(np.unique(y_test)) == 2:
    auc = roc_auc_score(y_test, y_prob)

print(f"\nAccuracy: {acc:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"AUC: {auc:.3f}")  # a NaN value formats as "nan", so no special case is needed

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, digits=3))

# Confusion matrix for quick sanity check.
# labels=[0, 1] pins the matrix to 2x2 even when the test set or the predictions
# contain a single class; without it the matrix shrinks and no longer matches the
# two tick labels below.
# NOTE(review): tick labels assume 0 = dry and 1 = wet — confirm the label encoding.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Pred Dry", "Pred Wet"],
    yticklabels=["True Dry", "True Wet"],
    ax=ax,
)
ax.set_title("Confusion Matrix (Temporal Split)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.tight_layout()
plt.show()
