In [None]:

from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential

# %% Load dataset
df = pd.read_csv("data/dataset2.csv")
df['date'] = pd.to_datetime(df['date'])
# Forward-fill gaps with the last observed value in each column.
# `fillna(method='ffill')` is deprecated in pandas 2.x and emits a
# FutureWarning, so call `DataFrame.ffill` directly instead.
# NOTE(review): the fill runs in file order across all rows — confirm the CSV
# is ordered so that the preceding row is a sensible source for each gap.
df.ffill(inplace=True)

# %% Cyclical features for LSTM
# Project weekday and month onto the unit circle (sin/cos pair) so that the
# two ends of each cycle stay numerically close to each other.
weekday = df['date'].dt.weekday
weekday_angle = 2 * np.pi * weekday / 7
month_angle = 2 * np.pi * df['date'].dt.month / 12

df['day_of_week_sin'] = np.sin(weekday_angle)
df['day_of_week_cos'] = np.cos(weekday_angle)
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# pandas numbers weekdays from Monday = 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = (weekday >= 5).astype(int)

# %% Lag features
# Every lag is computed per problem type, following df's current row order.
cases_by_type = df.groupby('problem_type')['reported_cases']

# Count from the preceding row of the same problem type; the first row of each
# group has no predecessor and gets 0.
df['prev_day_cases'] = cases_by_type.shift(1).fillna(0)

# Mean over a window of up to 3 rows per problem type. The group key is then
# dropped from the result's index so the values line up with df's own index.
# NOTE(review): the window includes the current row's reported_cases, not only
# earlier rows — confirm that is intended given the "prev_" name.
rolling_mean = cases_by_type.rolling(3, min_periods=1).mean()
df['prev_3day_avg_cases'] = rolling_mean.reset_index(level=0, drop=True)

df['problem_severity_interaction'] = df['severity_score'] * df['prev_day_cases']

# %% Encode categorical
# One-hot encode the categorical columns. Categories not seen during fitting
# are ignored at transform time (handle_unknown='ignore') instead of raising.
categorical_cols = ['problem_type', 'region']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_cols])
cat_encoded_df = pd.DataFrame(
    cat_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Reset df's index so it lines up row-for-row with the freshly built encoded
# frame, append the indicator columns, then drop the raw categorical columns.
df = pd.concat(
    [df.reset_index(drop=True), cat_encoded_df],
    axis=1,
).drop(columns=categorical_cols)

# %% Features & Target
# Column order matters: it fixes the feature order seen by the scaler and the
# model, so it must stay stable between training and inference.
numerical_features = [
    'severity_score',
    'is_weekend',
    'holiday_flag',
    'prev_day_cases',
    'prev_3day_avg_cases',
    'weather_score',
    'rainfall_mm',
    'problem_severity_interaction',
    'day_of_week_sin',
    'day_of_week_cos',
    'month_sin',
    'month_cos',
]

# Model inputs: the numeric columns followed by every one-hot indicator column.
features = [*numerical_features, *cat_encoded_df.columns]
target = 'reported_cases'

# %% Scale data using MinMaxScaler
# The split point is computed up front so that both scalers can be fitted on
# the training rows only. Fitting them on the whole dataset would leak the
# min/max of the held-out period into the training inputs and targets.
sequence_length = 30
n_sequences = len(df) - sequence_length
if n_sequences <= 0:
    raise ValueError(
        f"Need more than {sequence_length} rows to build sequences, got {len(df)}"
    )
split_idx = int(0.8 * n_sequences)

# Training sequence k reads rows k .. k+sequence_length-1 and predicts row
# k+sequence_length, so training touches rows [0, split_idx + sequence_length).
n_train_rows = split_idx + sequence_length

scaler_X = MinMaxScaler()
scaler_X.fit(df[features].iloc[:n_train_rows])
X_scaled = scaler_X.transform(df[features])

scaler_y = MinMaxScaler()
scaler_y.fit(df[[target]].iloc[:n_train_rows])
# Held-out values can now fall outside [0, 1]; that is expected when the
# scaler has only seen the training range.
y_scaled = scaler_y.transform(df[[target]])

# %% Create sequences for LSTM
# Each sample is the `sequence_length` rows preceding row i; its label is the
# scaled target of row i itself.
# NOTE(review): windows run over consecutive rows of df regardless of
# problem_type/region — confirm the row order forms a meaningful time series.
X_seq = np.array([
    X_scaled[i - sequence_length:i]
    for i in range(sequence_length, len(X_scaled))
])
y_seq = y_scaled[sequence_length:]

# %% Train-test split
# Chronological split (no shuffling): first 80% of sequences train, rest test.
X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
y_train, y_test = y_seq[:split_idx], y_seq[split_idx:]

# %% Build LSTM
# Two stacked LSTM layers with dropout, ending in a single linear unit that
# predicts the scaled target for the row following each input window.
# The input shape is declared with an explicit Input layer: passing
# `input_shape=` to the first layer of a Sequential model is discouraged by
# Keras 3 and emits a UserWarning.
model = Sequential([
    Input(shape=X_train.shape[1:]),    # (timesteps, features)
    LSTM(128, return_sequences=True),  # full sequence feeds the next LSTM
    Dropout(0.2),
    LSTM(64, return_sequences=False),  # only the last output reaches Dense
    Dropout(0.2),
    Dense(1),
])

model.compile(optimizer='adam', loss='mse')
# Stop once val_loss has not improved for 5 epochs and roll the model back to
# the weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# %% Train
# Fit for at most 50 epochs in mini-batches of 64; the early-stopping callback
# may end training sooner.
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so val_loss is not an independent estimate of test performance.
fit_options = {
    'validation_data': (X_test, y_test),
    'epochs': 50,
    'batch_size': 64,
    'callbacks': [early_stop],
}
history = model.fit(X_train, y_train, **fit_options)

# %% Save model & scalers & encoder
# Create the output directory first: joblib.dump does not create missing
# parent directories, and failing at this point would discard the trained
# model after a long training run.
Path('./model').mkdir(parents=True, exist_ok=True)

# Persist the model together with the fitted preprocessing objects so that
# inference can apply exactly the same encoding and scaling.
model.save('./model/model.keras')
joblib.dump(scaler_X, './model/scaler_X.pkl')
joblib.dump(scaler_y, './model/scaler_y.pkl')
joblib.dump(encoder, './model/encoder.pkl')





  df.fillna(method='ffill', inplace=True)
  super().__init__(**kwargs)


Epoch 1/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 258s 194ms/step - loss: 0.0227 - val_loss: 0.0169
Epoch 2/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 233s 186ms/step - loss: 0.0215 - val_loss: 0.0184
Epoch 3/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 237s 189ms/step - loss: 0.0213 - val_loss: 0.0206
Epoch 4/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 193s 155ms/step - loss: 0.0211 - val_loss: 0.0194
Epoch 5/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 235s 188ms/step - loss: 0.0211 - val_loss: 0.0169
Epoch 6/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 208s 166ms/step - loss: 0.0210 - val_loss: 0.0188
Epoch 7/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 182s 145ms/step - loss: 0.0210 - val_loss: 0.0156
Epoch 8/50
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 242s 194ms/step - loss: 0.0209 - val_loss:

['./model/encoder.pkl']