In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Read the merged moving-average CSV and discard two survey columns in one
# chained expression, so the cell yields the same frame every time it is run.
unused_signal_cols = ['smoothed_wvaccine_likely_govt_health', 'smoothed_wwearing_mask']
df_moving_avg = pd.read_csv("merged_fips_moving_average_data.csv").drop(columns=unused_signal_cols)

In [None]:
# Parse the timestamp column once, store it back, and derive the calendar
# features (month, ISO week number, day of week) from the parsed values.
timestamps = pd.to_datetime(df_moving_avg['time_value'])
df_moving_avg['time_value'] = timestamps
df_moving_avg['month'] = timestamps.dt.month
df_moving_avg['week'] = timestamps.dt.isocalendar().week
df_moving_avg['dayofweek'] = timestamps.dt.dayofweek

In [None]:
# Find the boolean-typed columns ...
bool_cols = df_moving_avg.select_dtypes(include='bool').columns

# ... and cast each of them to float32 via a per-column dtype mapping.
df_moving_avg = df_moving_avg.astype({name: np.float32 for name in bool_cols})

In [None]:
# Order rows by county first, then by date within each county.
sort_keys = ['FIPS County', 'time_value']
df_moving_avg = df_moving_avg.sort_values(by=sort_keys)

In [None]:
# Prepare lag features
# The datetime dtype and the county/date ordering are re-established here so
# that the row-wise shifts below step backwards through each county's rows.
df_moving_avg['time_value'] = pd.to_datetime(df_moving_avg['time_value'])
df_moving_avg = df_moving_avg.sort_values(by=['FIPS County', 'time_value'])

# One grouped view of the target, shifted by one and by two rows per county.
county_target = df_moving_avg.groupby('FIPS County')['smoothed_wtested_positive_14d']
df_moving_avg = df_moving_avg.assign(
    lag_1=county_target.shift(1),
    lag_2=county_target.shift(2),
)

# Neighbor lag function
def compute_state_neighbor_lag(df, lag_col):
    """Average ``lag_col`` over the other counties of the same state on the same date.

    For every row, the result is the mean of ``lag_col`` taken over all rows
    that share the row's ``'time_value'`` and ``'FIPS State'`` but carry a
    different ``'FIPS County'``. Missing values are skipped, and a row with no
    non-missing neighbour value gets NaN.

    The leave-one-county-out mean is obtained from grouped sums and counts
    (state total minus the row's own county total) rather than by scanning the
    whole frame once per row, so the work grows roughly linearly with the
    number of rows instead of quadratically. Because the mean is formed by
    subtraction, results can differ from a direct mean in the last
    floating-point digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'time_value'``, ``'FIPS State'``,
        ``'FIPS County'`` and ``lag_col``.
    lag_col : str
        Name of the numeric column to average.

    Returns
    -------
    list of float
        One value per row of ``df``, in row order (positional, independent of
        the frame's index).
    """
    if len(df) == 0:
        return []

    state_keys = ['time_value', 'FIPS State']
    county_keys = state_keys + ['FIPS County']

    per_state = df.groupby(state_keys)[lag_col]
    per_county = df.groupby(county_keys)[lag_col]

    # 'sum' skips NaN and 'count' counts only non-missing values, which mirrors
    # the NaN-skipping behaviour of Series.mean().
    state_sum = per_state.transform('sum')
    state_count = per_state.transform('count')

    # A missing county id never equals any county, so such a row excludes
    # nothing from its state total; treat its own-county contribution as zero.
    own_sum = per_county.transform('sum').fillna(0)
    own_count = per_county.transform('count').fillna(0)

    neighbor_count = state_count - own_count
    neighbor_sum = state_sum - own_sum

    # No non-missing neighbour (or a missing date/state key) -> NaN.
    neighbor_mean = (neighbor_sum / neighbor_count).where(neighbor_count > 0)
    return neighbor_mean.tolist()

# Apply neighbor lag computation: one neighbour column per own-county lag.
for own_lag in ('lag_1', 'lag_2'):
    df_moving_avg[f'neighbor_{own_lag}'] = compute_state_neighbor_lag(df_moving_avg, own_lag)

# Fill missing lags with the same-day mean taken over all rows of that date.
# The per-date means are computed for all four columns up front; each column's
# mean depends only on that column, so the order of filling does not matter.
lag_cols = ['neighbor_lag_1', 'neighbor_lag_2', 'lag_1', 'lag_2']
daily_means = df_moving_avg.groupby('time_value')[lag_cols].transform('mean')
for col in lag_cols:
    df_moving_avg[col] = df_moving_avg[col].fillna(daily_means[col])

# Define features and target
drop_cols = ['time_value', 'geo_value', 'County Name', 'smoothed_wtested_positive_14d']
y = df_moving_avg['smoothed_wtested_positive_14d']
X = df_moving_avg.drop(columns=drop_cols)

# Train/test split
# NOTE(review): rows are shuffled before splitting, so train and test rows can
# come from the same county on neighbouring dates while the features include
# lags of the target - confirm that a random split is intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.optim.lr_scheduler import StepLR
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Make every column numeric before the feature matrix is built.
# Booleans become float32 through a per-column dtype mapping.
bool_cols = df_moving_avg.select_dtypes(include='bool').columns
df_moving_avg = df_moving_avg.astype({name: np.float32 for name in bool_cols})

# Object columns are parsed as numbers; anything unparseable becomes NaN.
obj_cols = df_moving_avg.select_dtypes(include='object').columns
for name in obj_cols:
    df_moving_avg[name] = pd.to_numeric(df_moving_avg[name], errors='coerce')

# Every remaining NaN (including those just produced by coercion) is set to 0.
df_moving_avg = df_moving_avg.fillna(value=0)

# Target and feature matrix (identifier columns and the target are excluded).
drop_cols = ['time_value', 'geo_value', 'County Name', 'smoothed_wtested_positive_14d']
y = df_moving_avg['smoothed_wtested_positive_14d']
X = df_moving_avg.drop(columns=drop_cols)

# Standardise the features and shape the target as a column vector.
# NOTE(review): the scaler is fit on all rows before the K-fold split below, so
# each validation fold contributes to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
y_array = y.to_numpy().reshape(-1, 1)

# Five shuffled folds with a fixed seed; per-fold metrics are collected in `results`.
results = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the MLP model
class MLPRegressor(nn.Module):
    """Fully connected regression network with ReLU activations and dropout.

    Each hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    maps to a single output. With the default arguments the network is
    ``input_dim -> 128 -> 128 -> 64 -> 1`` with a dropout rate of 0.3.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dims : sequence of int, optional
        Width of each hidden layer, in order. Defaults to ``(128, 128, 64)``.
    dropout : float, optional
        Dropout probability applied after every hidden activation.
        Defaults to ``0.3``.
    """

    def __init__(self, input_dim, hidden_dims=(128, 128, 64), dropout=0.3):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        layers.append(nn.Linear(in_features, 1))
        # The stack lives under `self.model` as an nn.Sequential so parameter
        # names keep the form `model.<index>.weight` / `model.<index>.bias`.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` (last dimension ``input_dim``) through the stack; the last output dimension is 1."""
        return self.model(x)

# Training loop with K-Fold
# For each fold: train a fresh MLP on the training rows with Adam + StepLR,
# stop early when the mean training loss has not improved for `patience`
# epochs, restore the best-loss weights, and score the held-out rows.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\n▶ Fold {fold + 1} started")

    # Seed torch per fold so weight init, dropout masks and batch order are
    # repeatable across runs (the fold assignment itself is already seeded).
    torch.manual_seed(42 + fold)

    X_train = torch.tensor(X_scaled[train_idx], dtype=torch.float32)
    y_train = torch.tensor(y_array[train_idx], dtype=torch.float32)
    X_val = torch.tensor(X_scaled[val_idx], dtype=torch.float32)
    y_val = torch.tensor(y_array[val_idx], dtype=torch.float32)

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)

    model = MLPRegressor(input_dim=X.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-4)
    # Halve the learning rate every 100 epochs.
    scheduler = StepLR(optimizer, step_size=100, gamma=0.5)
    criterion = nn.MSELoss()

    best_loss = float('inf')
    patience = 15
    wait = 0
    best_model_state = None

    for epoch in range(1, 601):
        model.train()
        running_loss = 0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        scheduler.step()
        avg_loss = running_loss / len(train_loader)

        print(f"Epoch {epoch:3d} - Loss: {avg_loss:.4f} - LR: {scheduler.get_last_lr()[0]:.6f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            wait = 0
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps keep overwriting. Clone them so the
            # snapshot really is the best epoch's weights.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            wait += 1
            if wait >= patience:
                print("⏹ Early stopping triggered.")
                break

    # Load best model and evaluate
    # (no snapshot exists if the loss was never finite; keep the current weights then)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    model.eval()
    with torch.no_grad():
        # reshape(-1) keeps a 1-D array even when the fold holds a single row,
        # where squeeze() would collapse to a 0-d scalar.
        y_pred = model(X_val).reshape(-1).numpy()
        y_true = y_val.reshape(-1).numpy()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f" Fold {fold + 1} Results — RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}")
    results.append({"Fold": fold + 1, "RMSE": rmse, "MAE": mae, "R2": r2})

# Final result DataFrame: one row per fold.
df_results = pd.DataFrame(results)
# Cross-fold mean of each metric. "Fold" is an identifier rather than a
# metric, so it is left out of the average (errors="ignore" keeps this working
# when `results` is empty and the column does not exist).
df_results_summary = df_results.drop(columns="Fold", errors="ignore").mean(numeric_only=True)
df_results, df_results_summary
