#Modules and Data upload

In [None]:
#importing necessary modules
!pip install keras-tuner

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, Reshape, TimeDistributed, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from tensorflow.keras.layers import RepeatVector
from kerastuner.tuners import BayesianOptimization
#uploading data
# Both CSVs are read from the current working directory, train first, then test.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data_covid.csv', 'test_data_covid.csv')
)


#Feature Engineering

In [2]:

#Feature engineering - adding high-correlation features
def add_time_series_features(df, group_col='State/UnionTerritory', lag=12, window=7, span=7):
    """Return a date-sorted copy of `df` with lag, rolling and EWMA features added.

    For each of the 'Confirmed' and 'Cured' columns, computed separately per
    `group_col` value, the following columns are added (in this order):

    * ``<col>_lag_<lag>``     - the value `lag` rows earlier within the group
    * ``<col>_Rolling_Mean``  - rolling mean over `window` rows (min_periods=1)
    * ``<col>_Rolling_Std``   - rolling std over `window` rows (min_periods=1)
    * ``<col>_EWMA``          - exponentially weighted mean with the given `span`

    Args:
        df: frame containing 'Date', 'Confirmed', 'Cured' and `group_col`.
        group_col: column identifying each independent time series.
        lag: number of rows to shift for the lag features.
        window: rolling window length.
        span: EWMA span.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    out = df.copy()

    #converting 'Date' to datetime to sort data
    out['Date'] = pd.to_datetime(out['Date'])
    out = out.sort_values(by=[group_col, 'Date'])

    # (column suffix, function applied to the per-group series), in output order
    feature_builders = [
        (f'lag_{lag}', lambda grouped: grouped.shift(lag)),
        ('Rolling_Mean', lambda grouped: grouped.transform(
            lambda s: s.rolling(window=window, min_periods=1).mean())),
        ('Rolling_Std', lambda grouped: grouped.transform(
            lambda s: s.rolling(window=window, min_periods=1).std())),
        ('EWMA', lambda grouped: grouped.transform(
            lambda s: s.ewm(span=span, min_periods=1).mean())),
    ]

    for suffix, build in feature_builders:
        for col in ('Confirmed', 'Cured'):
            grouped = out.groupby(group_col)[col]
            # .bfill() is the non-deprecated spelling of fillna(method='bfill').
            # NOTE: the back-fill runs over the whole sorted column, not per group,
            # so a gap at the very end of one group's rows is filled from the next
            # group's first valid value.
            out[f'{col}_{suffix}'] = build(grouped).bfill()

    return out


# identical feature set for both splits; each split only sees its own rows
train_data = add_time_series_features(train_data)
test_data = add_time_series_features(test_data)



#Preparing the data

In [3]:
#selecting the features and target
target = 'Deaths'
categorical_features = ['State/UnionTerritory']
features = [
    'Confirmed_lag_12',
    'Cured_lag_12',
    'State/UnionTerritory',
    'Confirmed_Rolling_Mean',
    'Cured_Rolling_Mean',
    'Confirmed_Rolling_Std',
    'Cured_Rolling_Std',
    'Confirmed_EWMA',
    'Cured_EWMA',
]
# every selected feature that is not categorical is scaled as numeric
numeric_features = [name for name in features if name not in categorical_features]

#Defining the preprocessing pipeline
# numeric columns are standardised; the state column is one-hot encoded and
# categories unseen during fit are ignored at transform time
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
])

#applying preprocessing to the data
# the transformer is fitted on the training frame only and then reused for the test frame
y = train_data[target]
X = preprocessor.fit_transform(train_data[features])
X_test = preprocessor.transform(test_data[features])

#spliting the data into train and validation sets
# 80/20 random split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#Sample-valued model \(for quick evaluation\)

In [None]:
""" CODE FOR A SAMPLE
sample hyperparameter values for faster evaluation """
#defining the sequence-to-sequence model with LSTM layers
def build_seq2seq_model(input_shape, output_sequence_length, units=64, learning_rate=0.001):
    """Build and compile a small LSTM encoder-decoder regressor.

    Each flat feature vector is reshaped into a one-timestep sequence, encoded
    by an LSTM, repeated `output_sequence_length` times and decoded by a second
    LSTM followed by a per-timestep ``Dense(1)`` head.

    Args:
        input_shape: number of features per sample (an integer, not a tuple).
        output_sequence_length: number of timesteps the decoder emits.
        units: width of both the encoder and the decoder LSTM. Defaults to 64,
            the value that was previously hard-coded.
        learning_rate: learning rate passed to Adam. Defaults to 0.001, which
            is Keras' documented default for ``Adam()``.

    Returns:
        A compiled ``Sequential`` model whose output has shape
        ``(batch, output_sequence_length, 1)``, trained with mean squared error.

    Raises:
        ValueError: if `output_sequence_length` or `units` is smaller than 1.
    """
    if output_sequence_length < 1:
        raise ValueError("output_sequence_length must be >= 1")
    if units < 1:
        raise ValueError("units must be >= 1")

    model = Sequential([
        Input(shape=(input_shape,)),
        Reshape((1, input_shape)),  #flat features -> sequence of length 1
        LSTM(units, activation='relu', return_sequences=False),  #encoder LSTM
        RepeatVector(output_sequence_length),  #repeat the output sequence length times
        LSTM(units, activation='relu', return_sequences=True),  #decoder LSTM
        TimeDistributed(Dense(1))  #output layer for regression
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='mean_squared_error')
    return model

#training the sequence-to-sequence model on 80% of the data to find RMSE
seq2seq_model = build_seq2seq_model(X_train.shape[1], 1)

# The model emits (samples, 1, 1); give the targets the same shape so the MSE is
# computed element-wise and never depends on how the installed Keras version
# reconciles (or broadcasts) a 1-D target against a 3-D prediction.
y_train_seq = np.asarray(y_train).reshape(-1, 1, 1)
y_val_seq = np.asarray(y_val).reshape(-1, 1, 1)

# stop once validation loss has not improved for 10 epochs and keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
seq2seq_model.fit(X_train, y_train_seq, epochs=100, batch_size=32, verbose=1,
                  validation_data=(X_val, y_val_seq),
                  callbacks=[early_stopping])

#predicting on the validation set using sequence-to-sequence model
# flatten (samples, 1, 1) -> (samples,) so it lines up with the 1-D y_val
val_predictions_seq2seq = seq2seq_model.predict(X_val).flatten()

#calculating RMSE for sequence-to-sequence model
val_rmse_seq2seq = np.sqrt(mean_squared_error(y_val, val_predictions_seq2seq))
print(f"Validation RMSE (Sequence-to-Sequence): {val_rmse_seq2seq}")


#sample submissions file

In [None]:
#training the final model on the full dataset
final_model = build_seq2seq_model(X.shape[1], 1)

# Match the (samples, 1, 1) model output so the loss is computed element-wise
# instead of relying on implicit rank handling of a 1-D target.
y_full_seq = np.asarray(y).reshape(-1, 1, 1)
final_model.fit(X, y_full_seq, epochs=70, batch_size=32, verbose=1)

# flatten (samples, 1, 1) -> (samples,); row order follows the sorted test_data
predictions = final_model.predict(X_test).flatten()

submission = pd.DataFrame({
    'Sno': test_data['Sno'],
    # a death count cannot be negative, so clip before casting to int
    'Deaths': np.clip(np.round(predictions), 0, None).astype(int)
})
submission.to_csv('submission_sample_seq2seq.csv', index=False)
print("submission file saved successfully.")

#Model to find the actual best hyperparameter values \(Bayesian optimization\)

In [None]:
""" ACTUAL CODE USED FOR SUBMISSION FILE """

#defining the model
def build_tuned_seq2seq_model(hp):
    """Build the LSTM encoder-decoder with tunable width and learning rate.

    Args:
        hp: KerasTuner hyperparameter container. Two hyperparameters are read:
            'units' (int, 64..256 in steps of 64), used for both LSTM layers,
            and 'lr' (float, 1e-4..1e-2, log-sampled), used for Adam.

    Returns:
        A compiled Sequential model with mean squared error loss. The feature
        count is taken from the module-level `X_train`.
    """
    n_features = X_train.shape[1]
    n_steps_out = 1

    # both the encoder and the decoder use the hyperparameter named 'units'
    lstm_units = hp.Int('units', min_value=64, max_value=256, step=64)
    learning_rate = hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='LOG')

    model = Sequential()
    model.add(Input(shape=(n_features,)))
    model.add(Reshape((1, n_features)))  #flat features -> sequence of length 1
    model.add(LSTM(lstm_units, activation='relu', return_sequences=False))  #encoder LSTM
    model.add(RepeatVector(n_steps_out))  #repeat the output sequence length times
    model.add(LSTM(lstm_units, activation='relu', return_sequences=True))  #decoder LSTM
    model.add(TimeDistributed(Dense(1)))  #output layer for regression

    model.compile(optimizer=Adam(learning_rate), loss='mean_squared_error')
    return model

#tuning the model using bayesian optimisation

tuner = BayesianOptimization(
    build_tuned_seq2seq_model,
    objective='val_loss',
    max_trials=50, #use lower number of trials for faster, BUT LESS ACCURATE convergence
    executions_per_trial=2,
    directory='my_dir',
    project_name='seq2seq'
)

# The model emits (samples, 1, 1); give the targets the same shape so the
# 'val_loss' objective is an element-wise MSE and never depends on how the
# installed Keras version reconciles (or broadcasts) a 1-D target against a
# 3-D prediction.
y_train_seq = np.asarray(y_train).reshape(-1, 1, 1)
y_val_seq = np.asarray(y_val).reshape(-1, 1, 1)

# each trial stops early once validation loss has not improved for 10 epochs
tuner.search(X_train, y_train_seq, epochs=70, batch_size=32,
             validation_data=(X_val, y_val_seq),
             callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)])

#saving the best model
best_model = tuner.get_best_models(num_models=1)[0]

#predicting and calculate RMSE
# flatten (samples, 1, 1) -> (samples,) so it lines up with the 1-D y_val
val_predictions_seq2seq = best_model.predict(X_val).flatten()
val_rmse_seq2seq = np.sqrt(mean_squared_error(y_val, val_predictions_seq2seq))
print(f"Validation RMSE (Sequence-to-Sequence with Tuning): {val_rmse_seq2seq}")

# original submissions file creation

In [None]:
#training the final model on the full dataset
# Build a fresh, untrained model from the best hyperparameters found by the
# search and train it on all labelled rows, rather than continuing to train the
# checkpointed best-trial model that was fitted on the 80% split only.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
final_model = build_tuned_seq2seq_model(best_hps)

# Match the (samples, 1, 1) model output so the loss is computed element-wise
# instead of relying on implicit rank handling of a 1-D target.
y_full_seq = np.asarray(y).reshape(-1, 1, 1)
final_model.fit(X, y_full_seq, epochs=70, batch_size=32, verbose=1)

# flatten (samples, 1, 1) -> (samples,); row order follows the sorted test_data
predictions = final_model.predict(X_test).flatten()

submission = pd.DataFrame({
    'Sno': test_data['Sno'],
    # a death count cannot be negative, so clip before casting to int
    'Deaths': np.clip(np.round(predictions), 0, None).astype(int)
})
submission.to_csv('submission_seq2seq.csv', index=False)
print("submission file saved successfully.")
