# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader, TensorDataset
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Define Attention Mechanism
class Attention(nn.Module):
    """Additive attention that scores every encoder step against a query state.

    The query ``hidden`` (batch, hidden_size) is paired with each row of
    ``encoder_outputs`` (batch, seq_len, hidden_size); the pair is projected
    through a linear layer + tanh and reduced to one scalar per step by a
    learned vector ``v``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Maps the concatenated [query ; encoder step] pair back to hidden_size.
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        # Scoring vector; its values are overwritten by init_weights().
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise the projection, zero its bias, small-uniform ``v``."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised weights of shape (batch, seq_len)."""
        n_steps = encoder_outputs.size(1)
        # Broadcast the single query vector along the time axis.
        query = hidden.unsqueeze(1).expand(-1, n_steps, -1)
        paired = torch.cat((query, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(paired))
        # Dot every energy vector with v: (batch, seq_len, hidden) @ (hidden,).
        scores = torch.matmul(energy, self.v)
        return torch.softmax(scores, dim=1)

# Define LSTM Encoder
class LSTMEncoder(nn.Module):
    """Batch-first stacked LSTM that exposes its outputs and final states."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Return ``(outputs, hidden, cell)`` for a (batch, seq_len, input_size) input."""
        outputs, final_state = self.lstm(x)
        hidden, cell = final_state
        return outputs, hidden, cell

# Define LSTM Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Single-step LSTM decoder driven by an attention summary plus context.

    Each call attends over ``encoder_outputs`` using the top-layer hidden
    state, concatenates the attended vector with ``current_context`` and runs
    the result through the LSTM as a length-1 sequence.
    """

    def __init__(self, hidden_size, output_size, num_layers, dropout, context_size):
        super().__init__()
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(
            hidden_size + context_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, hidden, cell, current_context):
        """Return ``(prediction, hidden, cell)`` for one decoding step."""
        # The last layer's hidden state acts as the attention query.
        weights = self.attention(hidden[-1], encoder_outputs)
        # (batch, 1, seq_len) x (batch, seq_len, hidden) -> (batch, 1, hidden)
        attended = torch.bmm(weights.unsqueeze(1), encoder_outputs)
        # Append the context features along the feature axis, keeping the
        # singleton time dimension the LSTM expects.
        step_input = torch.cat([attended, current_context.unsqueeze(1)], dim=2)
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        prediction = self.fc(step_output.squeeze(1))
        return prediction, hidden, cell

# Define Transformer Encoder
class TransformerEncoder(nn.Module):
    """Transformer encoder over batch-first sequences with learned positions.

    Args:
        input_size: Number of features per time step in the input.
        hidden_size: Model width (``d_model``); the feed-forward width is
            ``4 * hidden_size``.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per layer.
        dropout: Dropout probability inside each encoder layer.
        max_len: Longest sequence the learned positional table supports.

    Input is ``(batch, seq_len, input_size)``; output is
    ``(batch, seq_len, hidden_size)``.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_heads, dropout, max_len=1000):
        super().__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        # Learned positional table, zero-initialised, one row per position.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, hidden_size))
        # batch_first=True is required: forward() indexes positions along
        # dim 1, so dim 0 is the batch. With the default (False) PyTorch would
        # treat the batch axis as time and let samples attend to one another.
        # The attribute is kept (nn.TransformerEncoder clones it) so existing
        # state_dict keys stay the same.
        self.encoder_layers = nn.TransformerEncoderLayer(
            hidden_size,
            num_heads,
            hidden_size * 4,
            dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers)

    def forward(self, x):
        """Embed ``x``, add positional offsets and run the encoder stack.

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_len = x.size(1)
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional encoding size {max_len}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        return self.transformer_encoder(x)

# Define Convolutional Layers
class ConvLayers(nn.Module):
    """Two Conv1d -> ReLU -> Dropout stages applied along the time axis.

    Accepts and returns batch-first sequences: the input
    (batch, seq_len, input_size) is moved to channels-first for the
    convolutions and moved back afterwards.
    """

    def __init__(self, input_size, hidden_size, kernel_size, dropout):
        super().__init__()
        # Symmetric padding; preserves seq_len when kernel_size is odd.
        pad = (kernel_size - 1) // 2
        self.conv1 = nn.Conv1d(input_size, hidden_size, kernel_size, padding=pad)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size, padding=pad)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return convolved features of shape (batch, seq_len', hidden_size)."""
        # Conv1d wants (batch, channels, seq_len).
        features = x.transpose(1, 2)
        for conv in (self.conv1, self.conv2):
            features = self.dropout(self.relu(conv(features)))
        # Back to (batch, seq_len, hidden_size).
        return features.transpose(1, 2)

# Define Hybrid Model with Models in Series
class HybridForecastingModel(nn.Module):
    """Conv -> LSTM encoder -> attention decoder -> transformer -> linear head.

    The stages are chained in series: convolutional features feed the LSTM
    encoder, the attention decoder produces one step from the encoder states
    and the current context, and that step is passed through the transformer
    encoder as a length-1 sequence before the final linear projection.
    """

    def __init__(self, input_size, lstm_hidden_size, transformer_hidden_size, conv_hidden_size, context_size, output_size, lstm_num_layers, transformer_num_layers, transformer_num_heads, conv_kernel_size, dropout):
        super().__init__()
        self.conv_layers = ConvLayers(input_size, conv_hidden_size, conv_kernel_size, dropout)
        self.lstm_encoder = LSTMEncoder(conv_hidden_size, lstm_hidden_size, lstm_num_layers, dropout)
        self.lstm_decoder = LSTMDecoderWithAttention(
            lstm_hidden_size, output_size, lstm_num_layers, dropout, context_size
        )
        # The transformer consumes the decoder's prediction, hence output_size in.
        self.transformer_encoder = TransformerEncoder(
            output_size, transformer_hidden_size, transformer_num_layers, transformer_num_heads, dropout
        )
        self.fc = nn.Linear(transformer_hidden_size, output_size)

    def forward(self, past_data, current_context):
        """Return the forecast for ``past_data`` conditioned on ``current_context``."""
        features = self.conv_layers(past_data)
        encoder_outputs, hidden, cell = self.lstm_encoder(features)
        # Only the prediction is needed; the updated LSTM states are discarded.
        step_prediction = self.lstm_decoder(encoder_outputs, hidden, cell, current_context)[0]
        # Insert a singleton time axis for the transformer, then drop it again.
        refined = self.transformer_encoder(step_prediction.unsqueeze(1)).squeeze(1)
        return self.fc(refined)

# Define Autoencoder class
class Autoencoder(nn.Module):
    """Two-layer MLP autoencoder with a ReLU bottleneck and sigmoid output.

    ``forward`` returns both the latent code and the reconstruction. The
    sigmoid on the decoder bounds reconstructions to the unit interval.
    """

    def __init__(self, input_dim, encoding_dim, hidden_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        )
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise every linear layer and zero its bias."""
        # Encoder layers first, then decoder layers.
        for layer in (*self.encoder, *self.decoder):
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return latent, reconstruction

def load_and_preprocess_data(file_path, features, target, context_features):
    """Load a CSV, fill gaps and min-max scale the three column groups.

    Args:
        file_path: Path of the CSV file to read.
        features: List of input feature column names.
        target: List of target column names (a list, so the scaler sees 2-D data).
        context_features: List of context column names.

    Returns:
        ``(df, scaler_target, scaler_context)`` where ``df`` holds the scaled
        columns and the scalers can invert target / context scaling.
    """
    df = pd.read_csv(file_path)

    # Forward-fill then back-fill so leading gaps are covered too.
    # ``fillna(method=...)`` is deprecated in pandas; ffill()/bfill() are the
    # direct replacements.
    if df.isnull().values.any():
        df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaler_context = MinMaxScaler()

    # Fit every scaler on the raw values *before* writing anything back.
    # If a column belongs to more than one group (e.g. the target is also an
    # input feature), fitting after an earlier in-place scaling would make the
    # later scaler learn the [0, 1] range and break inverse_transform.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])
    scaled_context = scaler_context.fit_transform(df[context_features])

    df[features] = scaled_features
    df[target] = scaled_target
    df[context_features] = scaled_context

    # NOTE(review): the scalers are fitted on the full dataset, i.e. before any
    # train/test split performed by the caller — confirm this is acceptable.
    return df, scaler_target, scaler_context

def create_sequences(data, target_data, context_data, n_timesteps):
    """Build sliding windows and the target / context row that follows each.

    For every position ``end`` from ``n_timesteps`` up to ``len(data) - 1`` the
    window is the ``n_timesteps`` rows of ``data`` immediately before ``end``,
    paired with row ``end`` of ``target_data`` and ``context_data``.

    Returns:
        Three NumPy arrays ``(X, y, context_features)``.
    """
    window_ends = range(n_timesteps, len(data))
    windows = [data[end - n_timesteps:end].values for end in window_ends]
    targets = [target_data.iloc[end].values for end in window_ends]
    contexts = [context_data.iloc[end].values for end in window_ends]
    return np.array(windows), np.array(targets), np.array(contexts)

def prepare_data(df, features, target, context_features, n_timesteps):
    """Window the selected columns of ``df`` and return float32 tensors.

    Returns:
        ``(X, y, context)`` as ``torch.float32`` tensors built from the arrays
        produced by ``create_sequences``.
    """
    arrays = create_sequences(
        df[features], df[target], df[context_features], n_timesteps
    )
    # Same conversion for all three arrays, in (X, y, context) order.
    X, y, context = (torch.tensor(arr, dtype=torch.float32) for arr in arrays)
    return X, y, context

def train_autoencoder(autoencoder, train_loader, n_epochs, ae_optimizer, ae_criterion, scheduler=None):
    """Train ``autoencoder`` to reconstruct the first element of each batch.

    Gradients are clipped to a norm of 1.0 before every optimizer step. When a
    scheduler is supplied it is stepped once per epoch with the mean batch
    loss, and the mean loss is printed every tenth epoch.
    """
    for epoch_idx in range(n_epochs):
        autoencoder.train()
        running_loss = 0
        for batch in train_loader:
            inputs = batch[0]
            ae_optimizer.zero_grad()
            # The model returns (encoded, decoded); only the reconstruction is scored.
            _, reconstruction = autoencoder(inputs)
            batch_loss = ae_criterion(reconstruction, inputs)
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(autoencoder.parameters(), max_norm=1.0)
            ae_optimizer.step()
            running_loss += batch_loss.item()

        if scheduler:
            scheduler.step(running_loss / len(train_loader))

        epoch_number = epoch_idx + 1
        if epoch_number % 10 == 0:
            print(f'Autoencoder Epoch [{epoch_number}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}')


def encode_data(autoencoder, data):
    """Return the encoder output for ``data`` without tracking gradients.

    Args:
        autoencoder: Module exposing an ``encoder`` sub-module.
        data: Tensor accepted by ``autoencoder.encoder``.

    Returns:
        The encoded tensor produced by ``autoencoder.encoder(data)``.

    Side effects:
        Switches ``autoencoder`` to eval mode.
    """
    autoencoder.eval()
    with torch.no_grad():
        # The encoder returns a single tensor. Tuple-unpacking it (as the
        # previous code did) iterates over the batch axis and either raises
        # or silently returns only the first row.
        encoded_data = autoencoder.encoder(data)
    return encoded_data

def train_hybrid(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler=None, patience=10, checkpoint_path=None):
    """Train ``model`` with gradient clipping, validation and early stopping.

    Args:
        model: Module called as ``model(past_data, context_data)``.
        train_loader: Iterable of ``(past_data, y_batch, context_data)`` batches.
        val_loader: Iterable of validation batches in the same layout.
        n_epochs: Maximum number of epochs.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss applied to ``(output, y_batch)``.
        scheduler: Optional learning-rate scheduler, stepped once per epoch.
            ``ReduceLROnPlateau`` is stepped with the validation loss; any
            other scheduler is stepped without arguments.
        patience: Consecutive non-improving epochs tolerated before stopping.
        checkpoint_path: If given, the best ``state_dict`` is saved there
            every time the validation loss improves.

    Side effects:
        Prints per-epoch losses. On return, ``model`` holds the weights of the
        epoch with the lowest validation loss (if any epoch improved).
    """
    best_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for past_data, y_batch, context_data in train_loader:
            optimizer.zero_grad()
            output = model(past_data, context_data)
            loss = criterion(output, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for past_data, y_batch, context_data in val_loader:
                output = model(past_data, context_data)
                val_loss += criterion(output, y_batch).item()
        val_loss /= len(val_loader)

        # Step the scheduler after validation so plateau-based schedulers get
        # the metric they require; calling step() without it raises TypeError.
        if scheduler is not None:
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}')

        # Early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            epochs_no_improve = 0
            # Snapshot by value: state_dict() tensors alias the live parameters.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            if checkpoint_path is not None:
                torch.save(best_state, checkpoint_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered")
                break

    # Leave the model at its best validation checkpoint rather than at the
    # last (possibly over-fitted) epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

def evaluate_model(model, test_loader, scaler_target, checkpoint_path='best_model.pth'):
    """Evaluate ``model`` on ``test_loader`` and report MSE in original units.

    Args:
        model: Module called as ``model(past_data, context_data)``. It may
            return either the predictions alone or a
            ``(predictions, attention_weights)`` pair.
        test_loader: Iterable of ``(past_data, y_batch, context_data)`` batches.
        scaler_target: Fitted scaler providing ``inverse_transform``.
        checkpoint_path: ``state_dict`` file loaded into ``model`` before
            evaluation; pass ``None`` to evaluate the current weights.

    Returns:
        ``(predictions, true_values, attention_weights)``; the first two are
        in the original target scale. ``attention_weights`` is ``None`` when
        the model does not return attention weights.
    """
    if checkpoint_path is not None:
        model.load_state_dict(torch.load(checkpoint_path))
    model.eval()

    predictions, true_values, attention_weights = [], [], []
    with torch.no_grad():
        for past_data, y_batch, context_data in test_loader:
            result = model(past_data, context_data)
            # Unpacking a bare tensor would iterate over its batch axis, so
            # only unpack when the model really returned a pair.
            if isinstance(result, (tuple, list)):
                outputs, attn_weights = result
                attention_weights.append(attn_weights.cpu().numpy())
            else:
                outputs = result
            predictions.append(outputs.cpu().numpy())
            true_values.append(y_batch.cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    true_values = np.concatenate(true_values, axis=0)
    attention_weights = np.concatenate(attention_weights, axis=0) if attention_weights else None

    predictions_original_scale = scaler_target.inverse_transform(predictions)
    true_values_original_scale = scaler_target.inverse_transform(true_values)

    mse = mean_squared_error(true_values_original_scale, predictions_original_scale)
    print(f'Mean Squared Error: {mse}')

    return predictions_original_scale, true_values_original_scale, attention_weights

# def adaptive_learning_rate(boosting_round):
#     initial_learning_rate = 0.01
#     decay_rate = 0.99
#     return initial_learning_rate * (decay_rate ** boosting_round)

# def train_xgboost(X_train, y_train, X_val, y_val, early_stopping_rounds=10):
#     dtrain = xgb.DMatrix(X_train, label=y_train)
#     dval = xgb.DMatrix(X_val, label=y_val)

#     params = {
#         'objective': 'reg:squarederror',
#         'eval_metric': 'rmse',
#         'learning_rate': 0.01,
#         'max_depth': 6,
#         'subsample': 0.8,
#         'colsample_bytree': 0.8,
#         'seed': 42
#     }

#     evals = [(dtrain, 'train'), (dval, 'eval')]
#     model = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=early_stopping_rounds, verbose_eval=True, learning_rates=adaptive_learning_rate)
#     return model

class AdaptiveLearningRateCallback(xgb.callback.TrainingCallback):
    """XGBoost callback applying exponential learning-rate decay per round.

    After boosting round ``epoch`` finishes, the rate for the next round is
    set to ``initial_learning_rate * decay_rate ** (epoch + 1)``. Round 0 runs
    with whatever ``learning_rate`` the booster was configured with.
    """

    def __init__(self, initial_learning_rate=0.01, decay_rate=0.99):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.decay_rate = decay_rate

    def after_iteration(self, model, epoch, evals_log):
        # This hook runs after round ``epoch``, so the value set here is used
        # by round ``epoch + 1``. Using ``epoch`` as the exponent would make
        # rounds 0 and 1 share the undecayed rate and lag the schedule by one.
        next_round = epoch + 1
        new_learning_rate = self.initial_learning_rate * (self.decay_rate ** next_round)
        model.set_param('learning_rate', new_learning_rate)
        return False  # Returning False tells XGBoost to keep training

def train_xgboost_with_callback(X_train, y_train, X_val, y_val, early_stopping_rounds=10):
    """Fit an XGBoost regressor with a decaying learning rate and early stopping.

    Trains for up to 1000 rounds on ``(X_train, y_train)``, evaluating RMSE on
    both the training and validation matrices, and returns the trained booster.
    """
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    val_matrix = xgb.DMatrix(X_val, label=y_val)

    booster_params = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        learning_rate=0.01,  # Initial learning rate
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        seed=42,
    )

    lr_decay = AdaptiveLearningRateCallback(initial_learning_rate=0.01, decay_rate=0.99)
    return xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=1000,
        evals=[(train_matrix, 'train'), (val_matrix, 'eval')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=True,
        callbacks=[lr_decay],
    )

def _collect_hybrid_predictions(model, loader):
    """Run ``model`` over ``loader`` in eval mode and stack its outputs as a NumPy array."""
    model.eval()
    batches = []
    with torch.no_grad():
        for past_data, _, context_data in loader:
            batches.append(model(past_data, context_data).cpu().numpy())
    return np.concatenate(batches, axis=0)


def main(file_path, features, target, context_features, n_timesteps, encoding_dim, hidden_dim, lstm_hidden_size, transformer_hidden_size, conv_hidden_size, context_size, output_size, lstm_num_layers, transformer_num_layers, transformer_num_heads, conv_kernel_size, dropout, learning_rate, n_epochs, ae_n_epochs, batch_size, n_splits=5):
    """Run the full pipeline: autoencoder -> hybrid model (K-fold) -> XGBoost.

    Steps:
        1. Load and scale the CSV, then window it into sequences.
        2. Hold out 20% as a test set.
        3. Train an autoencoder on the training windows and encode all windows.
        4. For each K-fold split, train a fresh hybrid model and an XGBoost
           model stacked on the hybrid model's predictions.
        5. Score the last fold's hybrid + XGBoost pair on the test set, print
           the MSE in original units and plot predictions against truth.

    The model-size, optimisation and data arguments are forwarded unchanged to
    the corresponding constructors / helpers.
    """
    # Load and preprocess data (the context scaler is not needed here).
    df, scaler_target, _ = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X, y, context = prepare_data(df, features, target, context_features, n_timesteps)

    # Split data into training and test sets.
    # NOTE(review): this split shuffles by default; for overlapping time-series
    # windows that lets near-duplicate windows land on both sides — confirm
    # whether a chronological split (shuffle=False) is wanted.
    X_train, X_test, y_train, y_test, context_train, context_test = train_test_split(X, y, context, test_size=0.2, random_state=42)

    # Train the autoencoder on the entire training set
    ae_loader = DataLoader(TensorDataset(X_train), batch_size=batch_size, shuffle=False)
    input_dim = X_train.size(2)
    autoencoder = Autoencoder(input_dim, encoding_dim, hidden_dim)

    # ``verbose`` is deprecated on PyTorch schedulers, so it is not passed.
    ae_optimizer = optim.AdamW(autoencoder.parameters(), lr=0.001)
    ae_criterion = nn.MSELoss()
    ae_scheduler = optim.lr_scheduler.ReduceLROnPlateau(ae_optimizer, mode='min', factor=0.1, patience=5)
    train_autoencoder(autoencoder, ae_loader, ae_n_epochs, ae_optimizer, ae_criterion, ae_scheduler)

    # Encode row-by-row, then restore the (samples, timesteps, code) layout.
    encoded_train = encode_data(autoencoder, X_train.reshape(-1, input_dim))
    encoded_train = encoded_train.view(X_train.size(0), X_train.size(1), -1)
    encoded_test = encode_data(autoencoder, X_test.reshape(-1, input_dim))
    encoded_test = encoded_test.view(X_test.size(0), X_test.size(1), -1)

    # KFold rejects a random_state when shuffle=False, so none is passed.
    kf = KFold(n_splits=n_splits, shuffle=False)

    for fold, (train_index, val_index) in enumerate(kf.split(encoded_train)):
        print(f"Fold {fold+1}/{n_splits}")

        X_train_fold, X_val_fold = encoded_train[train_index], encoded_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        context_train_fold, context_val_fold = context_train[train_index], context_train[val_index]

        # Create DataLoaders for hybrid model training
        train_loader = DataLoader(
            TensorDataset(X_train_fold, y_train_fold, context_train_fold),
            batch_size=batch_size, shuffle=False,
        )
        val_loader = DataLoader(
            TensorDataset(X_val_fold, y_val_fold, context_val_fold),
            batch_size=batch_size, shuffle=False,
        )

        # Keyword arguments keep each size bound to the right parameter; the
        # constructor's positional order is (lstm, transformer, conv).
        model = HybridForecastingModel(
            input_size=X_train_fold.size(2),  # Encoded feature size
            lstm_hidden_size=lstm_hidden_size,
            transformer_hidden_size=transformer_hidden_size,
            conv_hidden_size=conv_hidden_size,
            context_size=context_size,
            output_size=output_size,
            lstm_num_layers=lstm_num_layers,
            transformer_num_layers=transformer_num_layers,
            transformer_num_heads=transformer_num_heads,
            conv_kernel_size=conv_kernel_size,
            dropout=dropout,
        )
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
        criterion = nn.MSELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

        # Train the hybrid model with early stopping
        train_hybrid(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler)

        # Hybrid predictions become the features of the stacked XGBoost model.
        hybrid_train_preds = _collect_hybrid_predictions(model, train_loader)
        hybrid_val_preds = _collect_hybrid_predictions(model, val_loader)

        xgb_model = train_xgboost_with_callback(
            hybrid_train_preds, y_train_fold.numpy(), hybrid_val_preds, y_val_fold.numpy()
        )

    # Evaluate on the test set with the hybrid + XGBoost pair from the last
    # fold. That XGBoost model was already trained inside the loop, so it is
    # reused rather than refitted on identical data.
    test_loader = DataLoader(
        TensorDataset(encoded_test, y_test, context_test),
        batch_size=batch_size, shuffle=False,
    )
    hybrid_test_preds = _collect_hybrid_predictions(model, test_loader)
    predictions = xgb_model.predict(xgb.DMatrix(hybrid_test_preds))

    # Inverse transform the predictions to the original scale
    predictions_original_scale = scaler_target.inverse_transform(predictions.reshape(-1, 1))
    true_values_original_scale = scaler_target.inverse_transform(y_test.numpy().reshape(-1, 1))

    # Calculate the mean squared error
    mse = mean_squared_error(true_values_original_scale, predictions_original_scale)
    print(f'Mean Squared Error on Test Set: {mse}')

    # Plot the predictions vs true values
    plt.figure(figsize=(10, 6))
    plt.plot(true_values_original_scale, label='True Values')
    plt.plot(predictions_original_scale, label='Predictions')
    plt.legend()
    plt.title('Predictions vs True Values')
    plt.show()