# LSTM Forecasting Model

In [76]:
!pip install tensorflow
!pip install matplotlib
!pip install optuna
!pip install seaborn
!pip install pandas
!pip install scikit-learn

In [77]:
import json

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Load the dataset
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
file_path = '/Users/ricoschmid/git/xai_budgeting/data/final/merged_complete.csv'
df = pd.read_csv(file_path)

# Fill gaps by carrying the previous row's value forward, then back-fill
# whatever is still missing at the top of the frame. `fillna(method=...)` is
# deprecated in pandas >= 2.1, so the dedicated `ffill()` / `bfill()` methods
# are used instead; the result is the same.
df = df.ffill().bfill()

# Define cutoff year for train-test split: rows after (latest year - 1) are held out.
cutoff_year = df["Year"].max() - 1

# Split the data into training and testing sets
train_data = df[df["Year"] <= cutoff_year]
test_data = df[df["Year"] > cutoff_year]

# Select the target column
target_column = "Realized"

# Keep every column except the "Year" and "Region" identifiers as model input.
# NOTE(review): despite the `scaled_` prefix, no scaler is applied anywhere in
# this cell, so the values are in their original units. Confirm whether the
# imported MinMaxScaler was meant to be fitted on the training split here.
scaled_train_df = train_data.drop(columns=["Year", "Region"])
scaled_test_df = test_data.drop(columns=["Year", "Region"])

# Create sequences for LSTM
def create_sequences(data, target_column, sequence_length=10):
    """Turn a row-ordered DataFrame into sliding windows for sequence models.

    Window ``i`` holds the feature rows ``i .. i + sequence_length - 1`` (every
    column except ``target_column``); its target is the ``target_column`` value
    of the row that immediately follows the window.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the order they should be windowed.
    target_column : str
        Name of the column to predict; it is excluded from the features.
    sequence_length : int, default 10
        Number of consecutive rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, targets)`` with shapes
        ``(n_windows, sequence_length, n_features)`` and ``(n_windows,)``, where
        ``n_windows = len(data) - sequence_length``. When ``data`` has too few
        rows for a single window, correctly shaped *empty* arrays are returned,
        so callers can still read ``sequences.shape[2]``.
    """
    # Convert once up front instead of slicing and dropping a column from the
    # DataFrame on every loop iteration.
    feature_matrix = data.drop(columns=[target_column]).to_numpy()
    target_values = data[target_column].to_numpy()

    n_windows = len(data) - sequence_length
    if n_windows <= 0:
        # Not enough rows for even one window: keep the 3-D layout intact.
        empty_sequences = np.empty(
            (0, sequence_length, feature_matrix.shape[1]), dtype=feature_matrix.dtype
        )
        return empty_sequences, np.empty((0,), dtype=target_values.dtype)

    sequences = np.stack(
        [feature_matrix[start:start + sequence_length] for start in range(n_windows)]
    )
    # The target of window `start` is row `start + sequence_length`; np.array
    # copies so the result does not alias the DataFrame's buffer.
    targets = np.array(target_values[sequence_length:])

    return sequences, targets

# Number of consecutive rows that make up one LSTM input window.
sequence_length = 10

X_train, y_train = create_sequences(scaled_train_df, target_column, sequence_length)
X_test, y_test = create_sequences(scaled_test_df, target_column, sequence_length)

# Fail fast with a clear message: a split with no more rows than
# `sequence_length` yields zero windows, which would otherwise only surface
# later as an obscure shape error during model construction or training.
for split_name, windows in (("train", X_train), ("test", X_test)):
    if len(windows) == 0:
        raise ValueError(
            f"The {split_name} split produced no sequences; it needs more than "
            f"{sequence_length} rows."
        )

In [78]:
import optuna

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: train a two-layer LSTM and return its validation MSE.

    The trial suggests the number of LSTM units, the dropout rate, the Adam
    learning rate and the batch size. The model is fitted on the earlier part
    of the training windows and scored on the most recent 20% of them. The
    test set is deliberately not touched here, so the test MSE reported for
    the final model is not biased by the hyperparameter search.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean squared error on the held-out validation windows (lower is better).

    Raises
    ------
    ValueError
        If there are fewer than two training windows, so no split is possible.
    """
    n_units = trial.suggest_int('n_units', 32, 128)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.4)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-3, log=True)
    batch_size = trial.suggest_int('batch_size', 16, 64)

    # Chronological hold-out: the last 20% of the training windows (at least
    # one) serve as the validation set; no shuffling before the split.
    if len(X_train) < 2:
        raise ValueError("Need at least two training sequences to create a validation split.")
    n_val = max(1, int(len(X_train) * 0.2))
    X_fit, y_fit = X_train[:-n_val], y_train[:-n_val]
    X_val, y_val = X_train[-n_val:], y_train[-n_val:]

    # Build the LSTM model
    input_layer = Input(shape=(sequence_length, X_train.shape[2]))
    x = LSTM(n_units, activation='relu', return_sequences=True)(input_layer)
    x = BatchNormalization()(x)
    x = Dropout(dropout_rate)(x)
    x = LSTM(n_units // 2, activation='relu')(x)
    output_layer = Dense(1)(x)

    model = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')

    # Train the model (silently; Optuna logs one line per trial)
    model.fit(
        X_fit, y_fit,
        epochs=20,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        verbose=0
    )

    # Evaluate the model on the validation windows only
    y_pred = model.predict(X_val, verbose=0)
    mse = mean_squared_error(y_val, y_pred)

    return mse

# Create the Optuna study and optimize.
# The sampler is seeded so that its own randomness is repeatable between runs.
# NOTE(review): model training is still stochastic unless TensorFlow is seeded
# as well, so later (result-dependent) suggestions can still differ.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# The search is bounded by both a trial count (20) and a time budget (1800 s).
study.optimize(objective, n_trials=20, timeout=1800)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)

## Build and Train the LSTM Model

In [None]:
# Train the best model with the best hyperparameters
best_params = study.best_params

# Build the best LSTM model: same architecture as the one tuned in `objective`.
# An explicit Input layer declares the (timesteps, features) shape; passing
# `input_shape=` to the first layer of a Sequential model is deprecated in
# recent Keras releases.
best_model = Sequential([
    Input(shape=(sequence_length, X_train.shape[2])),
    LSTM(best_params['n_units'], activation='relu', return_sequences=True),
    BatchNormalization(),
    Dropout(best_params['dropout_rate']),
    LSTM(best_params['n_units'] // 2, activation='relu'),
    Dense(1),
])

# Compile the best model
best_model.compile(optimizer=Adam(learning_rate=best_params['learning_rate']), loss='mean_squared_error')

# Train the best model; the test windows are passed as validation data so that
# their loss is recorded per epoch in `history` alongside the training loss.
history = best_model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=best_params['batch_size'],
    validation_data=(X_test, y_test),
    verbose=1
)

# Draw the per-epoch training and validation loss recorded in `history`
loss_fig, loss_ax = plt.subplots(figsize=(12, 6))
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    loss_ax.plot(history.history[history_key], label=curve_label)
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(loc='upper right')
loss_ax.grid(True)
plt.show()

# Evaluate the best model on both splits. The model is compiled with a mean
# squared error loss and no extra metrics, so `evaluate` returns that loss.
# The values are cast to built-in floats so JSON serialisation cannot fail on
# NumPy scalar types.
train_mse = float(best_model.evaluate(X_train, y_train, verbose=0))
test_mse = float(best_model.evaluate(X_test, y_test, verbose=0))

print(f'Train MSE: {train_mse}')
print(f'Test MSE: {test_mse}')

# Save the MSE values (requires the standard-library `json` module to be imported)
mse_values = {'train_mse': train_mse, 'test_mse': test_mse}
with open('mse_values.json', 'w') as f:
    json.dump(mse_values, f)
print('MSE values saved successfully')

## Evaluate the Model

In [None]:
import matplotlib.pyplot as plt

# Plot the 'Realized' values against 'Year' for each split, one panel per split
split_fig, split_axes = plt.subplots(2, 1, figsize=(14, 7))

# (axes, data, line format, legend label, panel title) for each panel
panels = (
    (split_axes[0], train_data, 'b', 'Train Data', 'Training Data'),
    (split_axes[1], test_data, 'r', 'Test Data', 'Test Data'),
)
for panel_ax, split_df, line_format, legend_label, panel_title in panels:
    panel_ax.plot(split_df['Year'], split_df['Realized'], line_format, label=legend_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Year')
    panel_ax.set_ylabel('Realized')
    panel_ax.legend()

split_fig.tight_layout()
plt.show()