# ISSR Prediction Models

### Imports

In [None]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import warnings
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from pandas.plotting import autocorrelation_plot
from imblearn.under_sampling import RandomUnderSampler
import keras_tuner as kt
from joblib import Parallel, delayed
from tensorflow.keras.optimizers import Adam

### Load Datasets

In [None]:
# Read the Buffalo station observations from the local CSV file and preview
# the first rows.
buffalo_csv_path = "C:/Users/jstej/anaconda3/George Mason DAEN MS/DAEN 690/Buffalo_issr2021.csv"
Buffalo_data = pd.read_csv(buffalo_csv_path)
Buffalo_data.head()

In [None]:
# Read the Upton station observations from the local CSV file and preview
# the first rows.
upton_csv_path = "C:/Users/jstej/anaconda3/George Mason DAEN MS/DAEN 690/Upton_issr.csv"
Upton_data = pd.read_csv(upton_csv_path)
Upton_data.head()

## Filter to 2022 and 2023

In [None]:
# Keep only rows from 2022-2023 whose pressure is not -9999
# (presumably a missing-value sentinel — confirm against the data source).
# .copy() makes the result an independent frame: later cells assign new
# values into its 'issc' and 'date' columns, and writing into a filtered
# slice is what raises pandas' SettingWithCopyWarning.
Buffalo_data = Buffalo_data[
    (Buffalo_data['year'] >= 2022)
    & (Buffalo_data['year'] <= 2023)
    & (Buffalo_data['press'] != -9999)
].copy()

In [None]:
# Keep only rows from 2022-2023 whose pressure is not -9999
# (presumably a missing-value sentinel — confirm against the data source).
# .copy() makes the result an independent frame: later cells assign new
# values into its 'issc' and 'date' columns, and writing into a filtered
# slice is what raises pandas' SettingWithCopyWarning.
Upton_data = Upton_data[
    (Upton_data['year'] >= 2022)
    & (Upton_data['year'] <= 2023)
    & (Upton_data['press'] != -9999)
].copy()

## Preprocessing

### Convert issc from characters to integers

In [None]:
# Recode the Buffalo 'issc' flag from 'yes'/'no' text to 1/0.
# Warnings are silenced process-wide first (this stays in effect afterwards).
issc_codes = {'yes': 1, 'no': 0}
warnings.filterwarnings("ignore")
Buffalo_data['issc'] = Buffalo_data['issc'].replace(issc_codes)

In [None]:
# Recode the Upton 'issc' flag from 'yes'/'no' text to 1/0.
# Warnings are silenced process-wide first (this stays in effect afterwards).
issc_codes = {'yes': 1, 'no': 0}
warnings.filterwarnings("ignore")
Upton_data['issc'] = Upton_data['issc'].replace(issc_codes)

In [None]:
# Display the Buffalo frame to inspect the result of the filtering and issc recoding.
Buffalo_data

In [None]:
# Display the Upton frame to inspect the result of the filtering and issc recoding.
Upton_data

## Time Series Modeling

In [None]:
# Build a per-day table for Buffalo with three count columns:
#   'volume Buffalo'      - rows per day where issc == 1
#   'temp_volume Buffalo' - rows per day where temp_f < -42
#   'rhi_volume Buffalo'  - rows per day where rh_ice > 1
# Days with no matching rows get 0.
Buffalo_data['date'] = pd.to_datetime(Buffalo_data['date'])

issc_rows_Buffalo = Buffalo_data[Buffalo_data['issc'].eq(1)]
cold_rows_Buffalo = Buffalo_data[Buffalo_data['temp_f'].lt(-42)]
humid_rows_Buffalo = Buffalo_data[Buffalo_data['rh_ice'].gt(1)]

# Group each subset by calendar day and count its rows.
issc_counts_per_day_Buffalo = (
    issc_rows_Buffalo.groupby(Buffalo_data['date'].dt.date).size().rename('volume Buffalo')
)
temp_count_Buffalo = (
    cold_rows_Buffalo.groupby(Buffalo_data['date'].dt.date).size().rename('temp_volume Buffalo')
)
rh_ice_count_Buffalo = (
    humid_rows_Buffalo.groupby(Buffalo_data['date'].dt.date).size().rename('rhi_volume Buffalo')
)

# Continuous daily index from the first to the last observed date, so that
# days without any qualifying rows still appear in the table.
first_day_Buffalo = Buffalo_data['date'].min()
last_day_Buffalo = Buffalo_data['date'].max()
all_dates_Buffalo = pd.DataFrame(
    index=pd.date_range(start=first_day_Buffalo, end=last_day_Buffalo, freq='D')
)

# Attach the three count series one at a time, zero-filling missing days.
df2 = all_dates_Buffalo.join(issc_counts_per_day_Buffalo, how='left').fillna(0)
df3 = df2.merge(temp_count_Buffalo, how='left', left_index=True, right_index=True).fillna(0)
new_df_Buffalo = df3.merge(
    rh_ice_count_Buffalo, how='left', left_index=True, right_index=True
).fillna(0)

In [None]:
# Display the Buffalo per-day count table.
new_df_Buffalo

In [None]:
# Build a per-day table for Upton with three count columns:
#   'volume Upton'      - rows per day where issc == 1
#   'temp_volume Upton' - rows per day where temp_f < -42
#   'rhi_volume Upton'  - rows per day where rh_ice > 1
# Days with no matching rows get 0.
Upton_data['date'] = pd.to_datetime(Upton_data['date'])

issc_rows_Upton = Upton_data[Upton_data['issc'].eq(1)]
cold_rows_Upton = Upton_data[Upton_data['temp_f'].lt(-42)]
humid_rows_Upton = Upton_data[Upton_data['rh_ice'].gt(1)]

# Group each subset by calendar day and count its rows.
issc_counts_per_day_Upton = (
    issc_rows_Upton.groupby(Upton_data['date'].dt.date).size().rename('volume Upton')
)
temp_count_Upton = (
    cold_rows_Upton.groupby(Upton_data['date'].dt.date).size().rename('temp_volume Upton')
)
rh_ice_count_Upton = (
    humid_rows_Upton.groupby(Upton_data['date'].dt.date).size().rename('rhi_volume Upton')
)

# Continuous daily index from the first to the last observed date, so that
# days without any qualifying rows still appear in the table.
first_day_Upton = Upton_data['date'].min()
last_day_Upton = Upton_data['date'].max()
all_dates_Upton = pd.DataFrame(
    index=pd.date_range(start=first_day_Upton, end=last_day_Upton, freq='D')
)

# Attach the three count series one at a time, zero-filling missing days.
df2 = all_dates_Upton.join(issc_counts_per_day_Upton, how='left').fillna(0)
df3 = df2.merge(temp_count_Upton, how='left', left_index=True, right_index=True).fillna(0)
new_df_Upton = df3.merge(
    rh_ice_count_Upton, how='left', left_index=True, right_index=True
).fillna(0)

In [None]:
# Display the Upton per-day count table.
new_df_Upton

In [None]:
# Line plot of the Buffalo daily ISSC count over the whole date index.
buffalo_daily_volume = new_df_Buffalo['volume Buffalo']
plt.figure(figsize=(12, 8))
plt.plot(buffalo_daily_volume)

In [None]:
# Line plot of the Upton daily ISSC count over the whole date index.
upton_daily_volume = new_df_Upton['volume Upton']
plt.figure(figsize=(12, 8))
plt.plot(upton_daily_volume)

In [None]:
# Combine both stations' daily count tables on the date index.
# An outer merge keeps dates that only one station covers, which leaves NaN
# in the other station's columns. Fill those with 0, matching how days
# without rows are treated in the per-station tables, so that no NaN reaches
# the models that are fitted on this frame.
df_merge = new_df_Buffalo.merge(
    new_df_Upton, left_index=True, right_index=True, how='outer'
).fillna(0)

In [None]:
# Display the combined Buffalo + Upton daily table.
df_merge

In [None]:
# Flag (1/0) the days on which both stations recorded at least one ISSC row.
buffalo_has_issc = df_merge['volume Buffalo'].gt(0)
upton_has_issc = df_merge['volume Upton'].gt(0)
df_merge['ISSC in both'] = (buffalo_has_issc & upton_has_issc).astype(int)

In [None]:
# Show only the days flagged as ISSC at both stations.
df_merge.loc[df_merge['ISSC in both'].eq(1)]

In [None]:
# Remove the 2023-07-11 row from the combined table.
# NOTE(review): the reason this date is excluded is not stated — confirm.
rows_to_keep = df_merge.index != '2023-07-11'
df_merge = df_merge[rows_to_keep]

### Find optimal lag variables

In [None]:
# Candidate lag depths (0 = no lag column, 1 = one-day lag) for each of the
# six daily count series: ISSC, temperature and RHI counts at Buffalo (B)
# and Upton (U).
issc_lag_B, temp_lag_B, rhi_lag_B = [0, 1], [0, 1], [0, 1]
issc_lag_U, temp_lag_U, rhi_lag_U = [0, 1], [0, 1], [0, 1]

# One tuple per evaluated lag combination is appended here by the search loop.
search_results2 = []

def _add_lag_columns(frame, column, n_lags):
    """Append ``'<column> lag<i>'`` columns to *frame* for i = 1..n_lags.

    Each new column is *column* shifted down by i rows; the leading rows
    that have no earlier value are filled with 0. *frame* is modified in place.
    """
    for i in range(1, n_lags + 1):
        # Assign the filled Series in one step instead of calling
        # fillna(inplace=True) on a column selection: that is chained
        # assignment and is not guaranteed to write back into the frame.
        frame[f'{column} lag{i}'] = frame[column].shift(i).fillna(0)


def lag_evaluation(issc_lag_B, temp_lag_B, rhi_lag_B, issc_lag_U, temp_lag_U, rhi_lag_U):
    """Train a small LSTM with the given lag depths and return its test MSE.

    Works on a copy of the module-level ``df_merge``; the original frame is
    not modified.

    Parameters
    ----------
    issc_lag_B, temp_lag_B, rhi_lag_B : int
        Number of lag columns to add for the Buffalo 'volume', 'temp_volume'
        and 'rhi_volume' series respectively (0 adds none).
    issc_lag_U, temp_lag_U, rhi_lag_U : int
        The same for the Upton series.

    Returns
    -------
    float
        Mean squared error between 'ISSC in both' and the model's predictions
        on the held-out 20% of rows.
    """
    lstm_df = df_merge.copy()

    # Same column order as the lag depths are passed in.
    lag_plan = (
        ('volume Buffalo', issc_lag_B),
        ('temp_volume Buffalo', temp_lag_B),
        ('rhi_volume Buffalo', rhi_lag_B),
        ('volume Upton', issc_lag_U),
        ('temp_volume Upton', temp_lag_U),
        ('rhi_volume Upton', rhi_lag_U),
    )
    for column, n_lags in lag_plan:
        _add_lag_columns(lstm_df, column, n_lags)

    # Features are every column except the target. The previous positional
    # slice (iloc[:, 1:]) kept 'ISSC in both' among the features, leaking the
    # target into the model, and dropped 'volume Buffalo' instead.
    # NOTE(review): the same-day 'volume ...' columns remain features, and
    # 'ISSC in both' is computed directly from them — confirm this is intended.
    X3 = lstm_df.drop(columns=['ISSC in both'])
    y3 = lstm_df['ISSC in both'].values

    X3_train, X3_test, y3_train, y3_test = train_test_split(
        X3, y3, test_size=0.2, random_state=42
    )

    # NOTE(review): X3_train is 2-D (rows x features) while the LSTM declares
    # input_shape=(n_features, 1); this presumably relies on Keras adding the
    # trailing axis — confirm.
    model = Sequential([
        LSTM(units=50, activation='relu', input_shape=(X3_train.shape[1], 1)),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mse')

    # Train the model
    model.fit(X3_train, y3_train, epochs=50, batch_size=32, verbose=0, validation_split=0.2)

    # Evaluate the model
    predictions = model.predict(X3_test)
    mse = mean_squared_error(y3_test, predictions)
    return mse


from itertools import product

# Score every combination of lag depths and record
# (issc_B, temp_B, rhi_B, issc_U, temp_U, rhi_U, mse) for each.
# product() iterates in the same order as the equivalent nested loops
# (rightmost argument varies fastest).
for vl_B, tl_B, rl_B, vl_U, tl_U, rl_U in product(
    issc_lag_B, temp_lag_B, rhi_lag_B, issc_lag_U, temp_lag_U, rhi_lag_U
):
    mse = lag_evaluation(vl_B, tl_B, rl_B, vl_U, tl_U, rl_U)
    search_results2.append((vl_B, tl_B, rl_B, vl_U, tl_U, rl_U, mse))
    # Label fixed: was "ISSC_Lag:_U {vl_U}", inconsistent with the other fields.
    print(f"ISSC_Lag_B: {vl_B}, Temp_Lag_B: {tl_B}, RHI_Lag_B: {rl_B}, ISSC_Lag_U: {vl_U}, Temp_Lag_U: {tl_U}, RHI_Lag_U: {rl_U}, MSE: {mse}")
                        

In [None]:
# Each search_results2 entry is
# (issc_B, temp_B, rhi_B, issc_U, temp_U, rhi_U, mse).
# Rank on the MSE (last element); index 3 is the Upton ISSC lag depth, so
# using it as the key picked the entry with the smallest lag, not the best fit.
best_lag = min(search_results2, key=lambda x: x[6])
print(f"Optimal number of lag variables: ISSC Lag B={best_lag[0]}, Temp Lag B={best_lag[1]}, RHI Lag B={best_lag[2]}, ISSC Lag U={best_lag[3]}, Temp Lag U={best_lag[4]}, RHI Lag U={best_lag[5]}, with MSE={best_lag[6]}")

In [None]:
#Input the number of lag features
num_lags = 1

# Series to lag, in the order the lag columns are appended to df_merge.
lag_source_columns = [
    'temp_volume Upton',
    'rhi_volume Upton',
    'volume Upton',
    'temp_volume Buffalo',
    'rhi_volume Buffalo',
    'volume Buffalo',
]

# Add '<column> lag<i>' for i = 1..num_lags. The leading rows that have no
# earlier value are filled with 0 inside the loop, so every lag depth is
# covered (previously only the 'lag1' columns were filled, leaving NaN in
# any deeper lag when num_lags > 1).
for column in lag_source_columns:
    for i in range(1, num_lags + 1):
        df_merge[f'{column} lag{i}'] = df_merge[column].shift(i).fillna(0)

df_merge

### Hyperparameter tuning

In [None]:
# Collected (units, learning_rate, batch_size, mse) tuples, one per grid point.
search_results = []

# Hyperparameter grid: LSTM width, Adam learning rate, and mini-batch size.
batch_sizes = [16, 32, 64]
learning_rates = [1e-2, 1e-3, 1e-4]
units = [50, 100, 150]

def build_and_evaluate_model(units, learning_rate, batch_size, feature_columns=None):
    """Train a single-layer LSTM on ``df_merge`` and return its test MSE.

    Parameters
    ----------
    units : int
        Number of LSTM units.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    batch_size : int
        Mini-batch size used for training.
    feature_columns : sequence of str, optional
        Columns of ``df_merge`` to use as features. Defaults to the six
        same-day count columns, which is the original behaviour.
        NOTE(review): the final model in this notebook is trained on the
        '... lag1' columns instead; pass those here to tune on the same
        inputs — confirm which feature set the tuning is meant to use.

    Returns
    -------
    float
        Mean squared error between 'ISSC in both' and the model's predictions
        on the held-out 20% of rows.
    """
    if feature_columns is None:
        feature_columns = [
            'volume Buffalo', 'temp_volume Buffalo', 'rhi_volume Buffalo',
            'volume Upton', 'temp_volume Upton', 'rhi_volume Upton',
        ]

    # Build the model
    X4 = df_merge[list(feature_columns)]
    y4 = df_merge['ISSC in both'].values

    X4_train, X4_test, y4_train, y4_test = train_test_split(
        X4, y4, test_size=0.2, random_state=42
    )

    # NOTE(review): X4_train is 2-D (rows x features) while the LSTM declares
    # input_shape=(n_features, 1); this presumably relies on Keras adding the
    # trailing axis — confirm.
    model = Sequential([
        LSTM(units=units, activation='relu', input_shape=(X4_train.shape[1], 1)),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

    # Train the model
    model.fit(X4_train, y4_train, epochs=50, batch_size=batch_size, verbose=0, validation_split=0.2)

    # Evaluate the model
    predictions = model.predict(X4_test)
    mse = mean_squared_error(y4_test, predictions)
    return mse

# Grid search: train and score one model per (units, learning rate, batch size)
# combination, logging each result as it completes.
for u in units:
    for lr in learning_rates:
        for bs in batch_sizes:
            mse = build_and_evaluate_model(u, lr, bs)
            search_results.append((u, lr, bs, mse))
            print(f"Units: {u}, Learning Rate: {lr}, Batch Size: {bs}, MSE: {mse}")

# Pick the combination with the lowest MSE (4th element of each result tuple).
best_hyperparams = min(search_results, key=lambda result: result[3])
best_units, best_learning_rate, best_batch_size, best_mse = best_hyperparams
print(f"Best Hyperparameters: Units={best_units}, Learning Rate={best_learning_rate}, Batch Size={best_batch_size} with MSE={best_mse}")

In [None]:
# Final model: predict 'ISSC in both' from the previous day's counts at both
# stations (the six 'lag1' columns).
lag1_feature_columns = [
    'volume Buffalo lag1',
    'temp_volume Buffalo lag1',
    'rhi_volume Buffalo lag1',
    'volume Upton lag1',
    'temp_volume Upton lag1',
    'rhi_volume Upton lag1',
]
X5 = df_merge[lag1_feature_columns]
y5 = df_merge['ISSC in both']

# Hold out 20% of the rows for testing.
X5_train, X5_test, y5_train, y5_test = train_test_split(
    X5, y5, random_state=42, test_size=0.2
)

# One LSTM layer followed by a single linear output unit.
n_features = X5.shape[1]
recurrent_layer = LSTM(units=150, activation='relu', input_shape=(n_features, 1))
output_layer = Dense(1)
model = Sequential([recurrent_layer, output_layer])

optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='mse')

# Train with a further 20% of the training rows used for validation, then
# report the loss on the held-out test rows.
model.fit(
    X5_train,
    y5_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)
test_loss = model.evaluate(X5_test, y5_test)
print(f'Test Loss: {test_loss}')

In [None]:
# Compare the 'ISSC in both' flag with the model's prediction for every row
# of df_merge (training and test rows alike).
actual_volume = df_merge['ISSC in both']
predicted_volume = model.predict(X5)
plot_dates = actual_volume.index

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(plot_dates, actual_volume, color='blue', label='Actual Volume')
ax.plot(plot_dates, predicted_volume, color='red', linestyle='--', label='Predicted Volume')
ax.set_xlabel('Date')
ax.set_ylabel('ISSC both Stations')
ax.set_title('Actual vs. Predicted Volume')
ax.legend()
ax.grid(True)
plt.show()