In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential


In [None]:
# Load the raw dataset and parse the `date` column into datetimes so the
# `.dt` accessors used by later cells work.
df = pd.read_csv("data/dataset2.csv")
df['date'] = pd.to_datetime(df['date'])
# Forward-fill missing values from the preceding row. `DataFrame.ffill()`
# replaces `fillna(method='ffill')`, whose `method` argument is deprecated
# in pandas 2.1 and removed in pandas 3.0.
df = df.ffill()

In [None]:
# Cyclical encodings: project weekday (period 7) and month (period 12) onto
# the unit circle so that the end of a cycle sits next to its beginning.
weekday = df['date'].dt.weekday
month = df['date'].dt.month

weekday_angle = 2 * np.pi * weekday / 7
month_angle = 2 * np.pi * month / 12

df['day_of_week_sin'] = np.sin(weekday_angle)
df['day_of_week_cos'] = np.cos(weekday_angle)
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# pandas numbers weekdays from Monday=0, so values 5 and 6 are the weekend.
df['is_weekend'] = weekday.ge(5).astype(int)

In [None]:
# Lag features, computed separately within each problem_type group.
cases_by_type = df.groupby('problem_type')['reported_cases']

# Previous row's case count inside the group; the first row of every group
# has no predecessor and is filled with 0.
df['prev_day_cases'] = cases_by_type.shift(1).fillna(0)

# Rolling mean over up to three rows of the group. The window ends on (and
# therefore includes) the current row. Dropping the group level from the
# result index lets the assignment align on the original row index.
rolling_mean = cases_by_type.rolling(3, min_periods=1).mean()
df['prev_3day_avg_cases'] = rolling_mean.reset_index(level=0, drop=True)

# Interaction term: severity multiplied by the previous row's case count.
df['problem_severity_interaction'] = df['severity_score'].mul(df['prev_day_cases'])

In [None]:
# One-hot encode the categorical columns and swap them for their indicator
# columns. `handle_unknown='ignore'` makes categories unseen at fit time
# encode as all zeros instead of raising.
categorical_cols = ['problem_type', 'region']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

cat_encoded = encoder.fit_transform(df[categorical_cols])
cat_encoded_df = pd.DataFrame(
    cat_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Reset the frame's index so the positional concat lines up row for row with
# the freshly built (0..n-1 indexed) indicator frame, then drop the originals.
df = pd.concat([df.reset_index(drop=True), cat_encoded_df], axis=1)
df = df.drop(columns=categorical_cols)

In [None]:
# Column to predict.
target = 'reported_cases'

# Numeric model inputs: raw columns plus the engineered date/lag features.
numerical_features = [
    'severity_score',
    'is_weekend',
    'holiday_flag',
    'prev_day_cases',
    'prev_3day_avg_cases',
    'weather_score',
    'rainfall_mm',
    'problem_severity_interaction',
    'day_of_week_sin',
    'day_of_week_cos',
    'month_sin',
    'month_cos',
]

# Full input list: numeric features followed by the one-hot indicator columns.
features = [*numerical_features, *cat_encoded_df.columns]

In [None]:
# Scale inputs and target to the [0, 1] range seen in the training period.
#
# The scalers are fitted on the leading 80% of rows only and then applied to
# every row. Fitting on the full frame would leak the min/max of the held-out
# tail into training. The 0.8 fraction mirrors the chronological split ratio
# used when the sequences are divided into train and test; keep the two in
# sync. Held-out values may therefore fall slightly outside [0, 1].
n_fit_rows = max(1, int(0.8 * len(df)))

scaler_X = MinMaxScaler()
scaler_X.fit(df[features].iloc[:n_fit_rows])
X_scaled = scaler_X.transform(df[features])

# The target gets its own scaler so predictions can be inverse-transformed
# independently of the inputs.
scaler_y = MinMaxScaler()
scaler_y.fit(df[[target]].iloc[:n_fit_rows])
y_scaled = scaler_y.transform(df[[target]])

In [None]:
# Build sliding windows: each sample is the `sequence_length` rows that come
# immediately before a row, and its label is that row's scaled target.
sequence_length = 30
window_ends = range(sequence_length, len(X_scaled))

X_seq = np.array([X_scaled[end - sequence_length:end] for end in window_ends])
y_seq = np.array([y_scaled[end] for end in window_ends])

In [None]:
# Ordered (unshuffled) split: the first 80% of the windows are used for
# training and the remainder is held out.
split_idx = int(len(X_seq) * 0.8)
X_train, X_test = np.split(X_seq, [split_idx])
y_train, y_test = np.split(y_seq, [split_idx])

In [None]:
# Input shape of one sample, taken from the training windows.
timesteps, n_features = X_train.shape[1], X_train.shape[2]

# Two stacked LSTM layers with dropout after each, followed by a single
# linear unit that outputs the (scaled) target. The first LSTM returns the
# full sequence so the second LSTM can consume it.
model = Sequential([
    LSTM(128, input_shape=(timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(1),
])

model.compile(optimizer='adam', loss='mse')

# Stop once validation loss has not improved for 5 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# Train for at most 50 epochs in batches of 64. The held-out split doubles
# as the validation set whose loss drives the early-stopping callback.
fit_options = dict(
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
)
history = model.fit(X_train, y_train, **fit_options)


In [None]:
# Persist the trained model together with the fitted preprocessing objects
# (both scalers and the one-hot encoder) so the same transformations can be
# reapplied when the model is loaded elsewhere.
# Create the output directory first: joblib.dump does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('./model', exist_ok=True)

model.save('./model/model.keras')
joblib.dump(scaler_X, './model/scaler_X.pkl')
joblib.dump(scaler_y, './model/scaler_y.pkl')
joblib.dump(encoder, './model/encoder.pkl')
