In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
from datetime import datetime, timedelta
import copy

In [13]:
df = pd.read_csv("data/dataset2.csv")
df["date"] = pd.to_datetime(df["date"])
df.fillna(method='ffill', inplace=True)

  df.fillna(method='ffill', inplace=True)


In [14]:
df['day_of_week'] = df['date'].dt.weekday
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
df['month'] = df['date'].dt.month
df['prev_day_cases'] = df.groupby('problem_type')['reported_cases'].shift(1).fillna(0)
df['prev_3day_avg_cases'] = df.groupby('problem_type')['reported_cases'].rolling(3, min_periods=1).mean().reset_index(0, drop=True)
df['problem_severity_interaction'] = df['severity_score'] * df['prev_day_cases']

In [15]:
categorical_cols = ['problem_type','region']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])
cat_encoded_df = pd.DataFrame(cat_encoded, columns=encoder.get_feature_names_out(categorical_cols))
df = pd.concat([df.reset_index(drop=True), cat_encoded_df], axis=1)
df = df.drop(columns=categorical_cols)

In [6]:
numerical_features = [
    'severity_score','day_of_week','is_weekend','month','holiday_flag',
    'prev_day_cases','prev_3day_avg_cases','weather_score','rainfall_mm','problem_severity_interaction'
]

In [7]:
features = numerical_features + list(cat_encoded_df.columns)
target = 'reported_cases'

In [16]:
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(df[features])

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(df[[target]])

In [9]:
sequence_length = 30
X_seq, y_seq = [], []
for i in range(sequence_length, len(X_scaled)):
    X_seq.append(X_scaled[i-sequence_length:i])
    y_seq.append(y_scaled[i])
X_seq, y_seq = np.array(X_seq), np.array(y_seq)

In [18]:
split_idx = int(0.8 * len(X_seq))
X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
y_train, y_test = y_seq[:split_idx], y_seq[split_idx:]


In [19]:
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [20]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop]
)

Epoch 1/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 126ms/step - loss: 0.4204 - val_loss: 1.1603
Epoch 2/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 125ms/step - loss: 0.4121 - val_loss: 2.1449
Epoch 3/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 128ms/step - loss: 0.4108 - val_loss: 1.7262
Epoch 4/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 115ms/step - loss: 0.4098 - val_loss: 1.8178
Epoch 5/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 110ms/step - loss: 0.4081 - val_loss: 3.8900
Epoch 6/50
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 112ms/step - loss: 0.4086 - val_loss: 3.3043


In [21]:
model.save('./model/model.keras')
joblib.dump(scaler_X, './model/scaler_X.pkl')
joblib.dump(scaler_y, './model/scaler_y.pkl')
joblib.dump(encoder, './model/encoder.pkl')

['./model/encoder.pkl']