# Installing Prerequisites

In [9]:
!python3 -m pip install --upgrade pip

[0m

In [10]:
#Prerequisites for running the Python script
!pip install pandas numpy scikit-learn keras matplotlib seaborn dill keras-tuner


[0m

# Import Libraries

In [11]:
#Import libraries
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Conv1D, MaxPooling1D, LSTM, Dropout, Dense, Bidirectional, Attention, Flatten
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras_tuner import RandomSearch
from keras_tuner.tuners import BayesianOptimization
from kerastuner.engine.hypermodel import HyperModel

from tensorflow.keras.metrics import MeanSquaredError, MeanAbsoluteError
from keras_tuner.engine import trial as trial_module

from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from keras.layers import Bidirectional

from keras_tuner.engine.trial import Trial

# Data Preprocessing and Initialization

In [12]:

# Load dataset
file_path = 'Data/data.csv'
df = pd.read_csv(file_path)

# Parse day/month/year dates; unparseable entries are coerced to NaT and their rows removed
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df = df[df['Date'].notna()].copy()

# Snapshot of the frame before the numeric clean-up below
df_raw = df.copy()

# Target ('Cases') together with the other numeric feature columns
numeric_cols = ['Cases', 'tsf', 'mst', 'rh', 'rfm', 'sca']

# Coerce every numeric column (non-numeric entries become NaN),
# then discard each row that still has a missing value in any column
df[numeric_cols] = df[numeric_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
df = df.dropna().copy()


In [78]:
# Forecast horizon: how many rows ahead `Cases` is shifted to form the prediction target
horizon = 2

# Lag feature generation from cases
def Lag(data, lags, column='Cases'):
    """Add lagged copies of ``column`` to ``data`` as ``lag_<k>`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``. It is modified in place.
    lags : iterable of int
        Shift sizes. For each ``k`` a column ``lag_k`` is written whose value
        at a given row is the value of ``column`` ``k`` rows earlier.
    column : str, default 'Cases'
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. The first ``k`` rows of ``lag_k``
        are NaN because no earlier value exists for them.
    """
    source = data[column]
    for lag in lags:
        data[f'lag_{lag}'] = source.shift(lag)
    return data

# Lagged case counts to derive: lag_1 through lag_6
lags = list(range(1, 7))
df = Lag(df, lags=lags)
# Remove rows with missing values (the shifts leave NaN in the leading rows)
df.dropna(inplace=True)

# Model inputs: the lagged cases first, followed by the remaining feature columns
feature_cols = [f'lag_{lag}' for lag in lags]
feature_cols.extend(['tsf', 'mst', 'rh', 'rfm', 'sca'])
n_features = len(feature_cols)

def build_model(hp):
    """Build and compile a stacked LSTM regressor from a set of hyperparameters.

    The network takes inputs of shape ``(window, n_features)``, where ``window``
    is itself a hyperparameter and ``n_features`` is read from the notebook
    globals. It stacks 1-3 LSTM layers (each optionally wrapped in
    ``Bidirectional`` and followed by ``Dropout``) and ends in a single-unit
    ``Dense`` layer.

    Parameters
    ----------
    hp : hyperparameter container providing ``Int``, ``Float``, ``Boolean`` and ``Choice``.

    Returns
    -------
    The compiled model (Adam optimizer, 'mse' loss, 'mae' and 'mse' metrics).
    """
    # Sequence length of the input windows (tuned)
    seq_len = hp.Int('window', 2, 16, step=1)

    net = Sequential()
    net.add(Input(shape=(seq_len, n_features)))

    depth = hp.Int('num_layers', 1, 3, 1)  # (1,4,1)
    final_idx = depth - 1
    for idx in range(depth):
        n_units = hp.Int(f'lstm_units_{idx}', 64, 512, 32)
        recurrent_rate = hp.Float(f'rec_dropout_{idx}', 0.0, 0.3, 0.05)
        # Every layer except the last one hands the full sequence to the next layer
        emit_sequence = idx < final_idx

        wrap_bidirectional = hp.Boolean(f'bidirectional_{idx}')
        recurrent = LSTM(n_units, return_sequences=emit_sequence, recurrent_dropout=recurrent_rate)
        net.add(Bidirectional(recurrent) if wrap_bidirectional else recurrent)

        net.add(Dropout(hp.Float(f'dropout_{idx}', 0.1, 0.3, 0.05)))

    # Single regression output
    net.add(Dense(1))

    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4, 1e-5])
    net.compile(
        optimizer=Adam(learning_rate=step_size),
        loss='mse',
        metrics=['mae', 'mse'],
    )
    return net
    
class Optimize(BayesianOptimization):
    '''
    Bayesian-optimization tuner that, besides the model hyperparameters, tunes
    the input window length, the batch size and the number of epochs.

    Hyperparameters are formatted as (hyperparameter, min, max, step)
    '''
    def run_trial(self, trial: Trial, df, features, **fit_kwargs):
        """Build the windowed data for one trial and delegate training to the parent tuner.

        Parameters
        ----------
        trial : Trial
            The trial whose hyperparameters are used.
        df : pandas.DataFrame
            Frame holding a 'Cases' column and the columns named in ``features``.
            It is copied, not modified.
        features : list of str
            Column names used as model inputs.
        **fit_kwargs
            Forwarded unchanged to the parent ``run_trial``.

        Returns
        -------
        Whatever the parent ``run_trial`` returns.

        Raises
        ------
        ValueError
            If ``df`` has too few rows to form a single window.

        Notes
        -----
        ``horizon``, ``y_scaler``, ``train_end`` and ``val_end`` are read from the
        notebook globals, so the cell defining them must have run first.
        NOTE(review): ``train_end``/``val_end`` are computed for windows of length
        ``lags[-1]``; for any other ``window`` the split boundaries here fall
        ``window - lags[-1]`` rows away from that global split - confirm this is acceptable.
        """
        hp = trial.hyperparameters
        window = hp.Int('window', 2, 16, step=1)  # Must match the range declared in build_model
        batch_size = hp.Int('batch_size', 32, 128, 16)
        epochs = hp.Int('epochs', 20, 150, 10)

        # Prepare data per trial on a copy so the caller's frame stays untouched
        df_proc = df.copy()
        # Target for each row: the 'Cases' value `horizon` rows later
        df_proc['y_future'] = df_proc['Cases'].shift(-horizon)
        # The last `horizon` rows have no future value; this is the only trimming needed
        df_proc.dropna(subset=['y_future'], inplace=True)

        n_rows = len(df_proc)
        if n_rows <= window:
            raise ValueError(
                f"Not enough rows ({n_rows}) to build a window of length {window}"
            )

        # Rolling window. Convert to arrays once instead of slicing the frame per step.
        # The upper bound is n_rows: the NaN targets were already dropped above, so
        # subtracting `horizon` again would throw away valid samples.
        feature_values = df_proc[features].values
        future_values = df_proc['y_future'].values
        X_loc = np.array([feature_values[i - window:i] for i in range(window, n_rows)])
        y_loc = future_values[window:]

        y_loc_scaled = y_scaler.transform(y_loc.reshape(-1, 1)).flatten()

        # Sample weights: 6.0 for samples with a positive target, 1.5 otherwise.
        # Only the training slice is weighted; validation data is passed unweighted.
        sw = 1.0 + np.where(y_loc > 0, 5.0, 0.5)
        sw_train = sw[:train_end]

        # Stop on the validation loss, the same quantity the tuner optimises
        # (objective='val_loss'), rather than on the training 'mse'.
        callbacks = [EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)]

        # No model is built here: the parent run_trial builds it from the tuner's
        # hypermodel, so a model created in this method would simply be discarded.
        return super().run_trial(
            trial,
            X_loc[:train_end], y_loc_scaled[:train_end],
            validation_data=(X_loc[train_end:val_end], y_loc_scaled[train_end:val_end]),
            sample_weight=sw_train,
            batch_size=batch_size,
            epochs=epochs,
            callbacks=callbacks,
            **fit_kwargs
        )

# Re-evaluates the same expression already assigned to n_features earlier in this cell.
# build_model reads n_features as a global when it sets the input shape.
n_features = len(feature_cols)



In [79]:
# Scale the features before windowing so that X holds the same scaled values the
# tuner later reads from df. (Scaling after building X left X unscaled on a first
# run of this cell but scaled on any re-run.)
# NOTE(review): the scaler is fitted on every row, including the rows that end up
# in the validation/test splits - confirm this is intended.
feature_scaler = MinMaxScaler()
df[feature_cols] = feature_scaler.fit_transform(df[feature_cols])

# Prepare X and y arrays (again)
seq_len = lags[-1]                          # window length used for these arrays
feature_frame = df[feature_cols]
future_cases = df['Cases'].shift(-horizon)  # shifted once instead of once per iteration
X, y = [], []
# Stop `horizon` rows before the end: beyond that the shifted target is NaN
for i in range(seq_len, len(df) - horizon):
    X.append(feature_frame.iloc[i - seq_len:i].values)
    y.append(future_cases.iloc[i])
X = np.array(X)
y = np.array(y)

# Scale targets once on the full array, then split
y_scaler = MinMaxScaler()
y_scaled = y_scaler.fit_transform(y.reshape(-1, 1)).flatten()

# Split into train, val, test (80/10/10).
# The split is purely positional, so the rows must already be in sequential order.
n = len(X)
train_end = int(0.8 * n)
val_end   = int(0.9 * n)

x_train, x_val, x_test = X[:train_end], X[train_end:val_end], X[val_end:]
y_train, y_val, y_test = (
    y_scaled[:train_end],
    y_scaled[train_end:val_end],
    y_scaled[val_end:]
)

In [None]:
#Assign weights based on correlation with cases
weights = {
    col: df[col].corr(df['Cases'])  # or use lagged cases
    for col in ['tsf','mst','rh','rfm','sca']
}
weights_df = pd.DataFrame.from_dict(weights, orient='index', columns=['Weight']).sort_values('Weight')
print("Feature weights (corr with Cases):")
print(weights_df)

# Multiply each feature column by its weight, then correlate the weighted columns.
# NOTE(review): scaling a column by a constant does not change the magnitude of its
# Pearson correlations (a negative weight only flips the sign), so this matrix equals
# the unweighted correlation matrix up to sign - confirm that is what is wanted.
weighted = df[numeric_cols].copy()
for col, w in weights.items():
    weighted[col] = weighted[col] * w
corr_matrix = weighted.corr()

#Plotting
plt.figure(figsize=(8,6))
plt.imshow(corr_matrix, interpolation='nearest', aspect='auto')
plt.colorbar(label='Correlation')
plt.xticks(range(len(corr_matrix)), corr_matrix.columns, rotation=45, ha='right')
plt.yticks(range(len(corr_matrix)), corr_matrix.index)
plt.title("Heatmap of Weighted Feature Correlations")

# Add annotations: write each coefficient (2 decimals) in the centre of its cell
for i in range(corr_matrix.shape[0]):
    for j in range(corr_matrix.shape[1]):
        plt.text(j, i, f"{corr_matrix.iloc[i, j]:.2f}", ha='center', va='center', color='black')

plt.tight_layout()
# savefig does not create missing folders, so make sure the output directory exists
plot_path = "Results/Plots/CorrMatrix.tiff"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
plt.savefig(plot_path, dpi=600, format='tiff', bbox_inches='tight')
plt.show()

# Tuning

In [None]:
# Instantiate the tuner
tuner_settings = dict(
    hypermodel=build_model,
    objective='val_loss',  # MSE in this case (the model is compiled with loss='mse')
    max_trials=100,
    num_initial_points=10,
    executions_per_trial=2,
    directory='Models',
    project_name=f'Horizon({horizon})',
    # overwrite=True,  # Only activate when overwriting!
)
tuner = Optimize(**tuner_settings)

# `df` and `features` are the extra arguments expected by Optimize.run_trial
tuner.search(df=df, features=feature_cols)

In [None]:
# Best trial found by the search and its hyperparameters
best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]
best_hp = best_trial.hyperparameters

# Scalar hyperparameters
window = best_hp.get('window')
best_batch = best_hp.get('batch_size')
best_epochs = best_hp.get('epochs')
best_lr = best_hp.get('learning_rate')

# Per-layer hyperparameters: one entry per stacked LSTM layer
num_layers = best_hp.get('num_layers')
lstm_units, dropouts, rec_dropouts = [], [], []
for layer_idx in range(num_layers):
    lstm_units.append(best_hp.get(f'lstm_units_{layer_idx}'))
    dropouts.append(best_hp.get(f'dropout_{layer_idx}'))
    rec_dropouts.append(best_hp.get(f'rec_dropout_{layer_idx}'))

# Best model from the tuner, with the target scaler attached as an attribute
best_model = tuner.get_best_models(1)[0]
best_model.y_scaler = y_scaler

# Summary of the selected configuration
report_lines = [
    f"Horizon: {horizon}",
    f"Best window:  {window}",
    f"Best batch:  {best_batch}",
    f"Best epoch: {best_epochs}",
    f"Best learning rate: {best_lr}",
    f"Best Num Layers: {num_layers}",
    f"LSTM Units: {lstm_units}",
    f"Dropouts: {dropouts}",
    f"Recurrent dropouts: {rec_dropouts}",
]
print("\n".join(report_lines))