In [107]:
#Mount Google Drive at /content/drive so files stored there can be read (requires the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [108]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [None]:
#Load drivers dataset as CSV from the mounted Google Drive
#NOTE(review): absolute Drive path with a "(2)" suffix in the file name - confirm this is the intended copy
drivers = pd.read_csv('/content/drive/MyDrive/drivers (2).csv')

In [None]:
#Preview only the first rows instead of rendering the whole frame (same style as the data.head() preview)
drivers.head()

In [None]:
#Load the observations (obs) table from Parquet
#NOTE(review): relative path - it resolves against the kernel's working directory, unlike the Drive path used for drivers; confirm
data = pd.read_parquet('obs.parquet')

In [None]:
#Preview the first rows of the observations table
data.head()

In [None]:
#Split the long-format obs table into one DataFrame per distinct 'variable' value
dfs = {name: data[data['variable'] == name] for name in data['variable'].unique()}

#Isolate the target metric: the rows whose 'variable' is 'HoboWetDry0.05'.
#Take an explicit copy - later cells assign the NHDPlusID and Date columns on
#wet_dry_df, and assigning on a filtered slice of `data` triggers pandas'
#SettingWithCopyWarning.
wet_dry_df = dfs['HoboWetDry0.05'].copy()

In [None]:
#Load in remaining datasets
#static_vars is pivoted on its 'variable'/'value' columns before merging; degrees is merged on NHDPlusID
static_df = pd.read_parquet('static_vars.parquet')
degrees = pd.read_parquet('degrees.parquet')

In [None]:
#Make sure all NHD IDs are strings, normalize before merging.
#Every frame joined on NHDPlusID must share the key dtype - this includes
#degrees, which is merged on NHDPlusID below and was previously left uncast.
#NOTE(review): assumes NHDPlusID is a regular column (not the index) in degrees,
#and that no source stores it as float (astype(str) would then yield e.g. '123.0') - confirm
for frame in (wet_dry_df, drivers, static_df, degrees):
    frame['NHDPlusID'] = frame['NHDPlusID'].astype(str)

#Make sure all date values are in datetime format, normalize before merging
for frame in (wet_dry_df, drivers):
    frame['Date'] = pd.to_datetime(frame['Date'])

#Reshape static variables from long to wide: one row per NHDPlusID,
#one column per static 'variable', filled from 'value'
static_df_pivot = static_df.pivot(
    index='NHDPlusID',
    columns='variable',
    values='value'
).reset_index()

#Merge all datasets to create one central dataset:
#  inner join with drivers -> keep only (NHDPlusID, Date) pairs present in both
#  left joins with static/degrees -> keep every observation even without a match
central_df = (
    wet_dry_df
    .merge(drivers, on=['NHDPlusID', 'Date'], how='inner')
    .merge(static_df_pivot, on='NHDPlusID', how='left')
    .merge(degrees, on='NHDPlusID', how='left')
)

In [None]:
#Order observations chronologically inside each NHDPlusID so the shift below looks forward in time
central_df = central_df.sort_values(by=['NHDPlusID', 'Date'])

#How many rows ahead the label is taken from
#EXPERIMENT WITH THIS VALUE (which is currently 7, i.e. shift(-7))
LAG_STEPS = 7

#Label each row with the 'value' found LAG_STEPS rows later within the same NHDPlusID
#NOTE(review): shift() counts rows, not calendar days - this is a 7-day lead only
#if every NHDPlusID has exactly one row per day with no gaps; confirm
value_by_reach = central_df.groupby('NHDPlusID')['value']
central_df['wet_dry_next'] = value_by_reach.shift(-LAG_STEPS)

#Drop every row that ended up without a label
central_df = central_df.dropna(subset=['wet_dry_next'])

## **Random Splitting Approach**

In [None]:
#Target is the shifted label; features are every other column
#Note: the temporal and site-based cells further down rebind X_train/X_test/y_train/y_test,
#so when all cells run in order this split is overwritten before the model is fit
TARGET_COL = 'wet_dry_next'
y = central_df[TARGET_COL]
X = central_df.drop(columns=[TARGET_COL])

#Shuffled 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

## **Temporal Splitting Approach**

In [None]:
#Order all rows chronologically
df = central_df.sort_values(by='Date')
split_date = '2020-9-15'

#Temporal split: rows dated before split_date are used for training,
#rows dated on or after it are held out for testing
is_before_split = df['Date'] < split_date
is_on_or_after_split = df['Date'] >= split_date
train = df.loc[is_before_split]
test = df.loc[is_on_or_after_split]

#Separate features from the label in each partition
X_train, y_train = train.drop(columns='wet_dry_next'), train['wet_dry_next']
X_test, y_test = test.drop(columns='wet_dry_next'), test['wet_dry_next']

## **Site-based Splitting Approach**

In [None]:
#Sort data chronologically and renumber the index
central_df = central_df.sort_values(by='Date').reset_index(drop=True)

#Split sites into train and test groups: the first 80% of the unique site
#codes (in order of first appearance after the sort) train, the rest test
#NOTE(review): the site order is not shuffled, so group membership follows when each site first appears - confirm this is intended
sites = central_df['SiteIDCode'].unique()
n_train_sites = int(0.8 * len(sites))
train_sites, test_sites = sites[:n_train_sites], sites[n_train_sites:]

#Create site-based train and test sets (copies, independent of central_df)
in_train = central_df['SiteIDCode'].isin(train_sites)
in_test = central_df['SiteIDCode'].isin(test_sites)
train = central_df.loc[in_train].copy()
test = central_df.loc[in_test].copy()

#Features and target
X_train, y_train = train.drop(columns='wet_dry_next'), train['wet_dry_next']
X_test, y_test = test.drop(columns='wet_dry_next'), test['wet_dry_next']

In [None]:
#Drop non-numerical, non-influential columns from both feature matrices
NON_FEATURE_COLS = ['variable', 'NHDPlusID', 'SiteIDCode', 'Date']
X_train = X_train.drop(columns=NON_FEATURE_COLS)
X_test = X_test.drop(columns=NON_FEATURE_COLS)

In [None]:
#Apply SMOTE to fix class imbalance
#Only the training split is resampled; X_test / y_test are not passed to the sampler
#NOTE(review): the left merges upstream can leave NaNs in unmatched rows and SMOTE may reject NaN input - confirm
sm = SMOTE(random_state=42, k_neighbors=5)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
#Hyperparameters for the XGBoost classifier, gathered in one place
xgb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
)

#Build the classifier and fit it on the training data
XGB_model = XGBClassifier(**xgb_params)
XGB_model.fit(X_train, y_train)

In [None]:
#Apply model onto test data and save the hard class predictions
y_pred = XGB_model.predict(X_test)

#Predicted probability of class 1 (second column of predict_proba), used as the ranking score
y_score = XGB_model.predict_proba(X_test)[:, 1]

#Obtain performance metrics
#Accuracy and F1 are threshold metrics, so they take the hard predictions
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
#ROC AUC measures how well the scores rank the samples; passing 0/1 predictions
#instead of probabilities collapses the curve to a single operating point
roc = roc_auc_score(y_test, y_score)

In [None]:
#Accuracy on the test set (result of accuracy_score above)
accuracy

In [None]:
#ROC AUC on the test set (result of roc_auc_score above)
roc

In [None]:
#Confusion matrix on the test set, with rows/columns ordered as class 0 then class 1
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

#Render it with readable class names (label 0 is shown as 'Dry', label 1 as 'Wet')
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Dry', 'Wet'],
).plot(cmap='Blues', values_format='d')

#NOTE(review): the title says "No lag", but the label cell builds wet_dry_next with shift(-7) - confirm the title matches the run
plt.title("Confusion Matrix - Site-based splitting, No lag")
plt.show()

In [None]:
#Make feature importances table: one row per training column, in the model's column order
feature_cols = X_train.columns.tolist()

importances = pd.DataFrame({
    "Feature": feature_cols,
    "Importance": XGB_model.feature_importances_
})

#Show the most influential features first. A bare last expression gets the
#notebook's rich table rendering, which print() would flatten to plain text.
#`importances` itself is left in model column order.
importances.sort_values("Importance", ascending=False)


In [None]:
#Define function to make predictions given site and date
#FEATURE_COLS records the column names (and order) of the training matrix so lookups select the same features
FEATURE_COLS = list(X_train.columns)
def predict_site_date(model, central_df, site_id, date, feature_cols=None):
  """Predict the class and class-1 probability for one site on one date.

  Parameters
  ----------
  model : fitted classifier exposing ``predict`` and ``predict_proba``.
  central_df : DataFrame containing ``SiteIDCode``, ``Date`` and the feature columns.
  site_id : value compared for equality against ``central_df["SiteIDCode"]``.
  date : anything ``pd.to_datetime`` accepts; compared for equality against
      ``central_df["Date"]``.
  feature_cols : optional list of feature column names to feed the model.
      Defaults to the module-level ``FEATURE_COLS``.

  Returns
  -------
  DataFrame with columns ``SiteIDCode``, ``Date``, ``pred_class`` and
  ``pred_prob`` - one row per matching row of ``central_df``. When nothing
  matches, an empty DataFrame with those same columns is returned instead of
  passing an empty feature matrix to the model.
  """
  if feature_cols is None:
    feature_cols = FEATURE_COLS
  result_cols = ["SiteIDCode", "Date", "pred_class", "pred_prob"]

  date = pd.to_datetime(date)
  is_match = (central_df["SiteIDCode"] == site_id) & (central_df["Date"] == date)
  row = central_df.loc[is_match]

  # No observation for this site/date: return an empty result with the usual columns.
  if row.empty:
    return pd.DataFrame(columns=result_cols)

  Xq = row[list(feature_cols)]
  pred_class = model.predict(Xq)
  # Column 1 of predict_proba is the probability of class 1.
  pred_prob = model.predict_proba(Xq)[:, 1]

  return pd.DataFrame({
      "SiteIDCode": row["SiteIDCode"].values,
      "Date": row["Date"].values,
      "pred_class": pred_class,
      "pred_prob": pred_prob
  })

In [None]:
#Example lookup: model prediction for site "01137500" on 2020-09-15
predict_site_date(
    model=XGB_model,
    central_df=central_df,
    site_id="01137500",
    date="2020-09-15"
)