In [200]:
import os
import random
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, mean_squared_error,
    mean_absolute_error
)
from scipy.stats import pearsonr

import torch
import torch.nn as nn
import torch.optim as optim


In [201]:

# Reproducibility: every random number generator used in this notebook is
# seeded with the same value.
seed = 42
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed):
    _seed_fn(seed)

# Ask cuDNN for deterministic behaviour and turn its benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Suppress library warnings and make sure the shared output directory for the
# cross-model comparison plots exists.
warnings.filterwarnings('ignore')
os.makedirs("comparison_plots", exist_ok=True)

In [202]:

def create_model_folder(model_name, epochs):
    """Ensure the output directory for one model run exists and return its path.

    The directory is named from the model name and epoch count and is created
    relative to the current working directory; an existing directory is reused.

    Args:
        model_name: Identifier of the model, embedded in the folder name.
        epochs: Number of training epochs, embedded in the folder name.

    Returns:
        The relative path of the (possibly pre-existing) directory.
    """
    target_dir = f'model_{model_name}_{epochs}_epochs'
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

DATA PREPARATION

In [203]:
def load_and_prepare_data(csv_path="merged_data.csv"):
    """Load the merged dataset and return standardized features and targets as tensors.

    Rows containing any missing value are dropped. The features are the
    'Pe_results' and 'Comp_results' columns, standardized with a
    StandardScaler; the target is the 'TAC_Reading' column.

    Args:
        csv_path: Path of the CSV file to read. Defaults to "merged_data.csv"
            in the current working directory.

    Returns:
        A tuple (X_tensor, y_tensor) of float32 tensors with shapes
        (n_rows, 2) and (n_rows, 1).

    Raises:
        ValueError: If the data has neither a 'PID' nor a 'pid' column.
    """
    data = pd.read_csv(csv_path)
    data = data.dropna()

    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data = data.rename(columns={'pid': 'PID', 'Time': 'time'})

    # Validate BEFORE grouping: groupby('PID') on a frame without that column
    # raises a bare KeyError, which previously made this check unreachable.
    if 'PID' not in data.columns:
        raise ValueError("PID column not found in data. Available columns: " + ", ".join(data.columns))

    # Running row counter within each PID group (not used by the returned tensors).
    data['time_index'] = data.groupby('PID').cumcount()

    X = data[['Pe_results', 'Comp_results']].values
    y = data['TAC_Reading'].values

    # NOTE(review): the scaler is fitted on every row, i.e. before the caller
    # splits into train/test, so test-set statistics influence the scaling.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

    return X_tensor, y_tensor

MODEL DEFINITIONS

In [204]:
class FNN(nn.Module):
    """Feed-forward regressor: 2 inputs -> 16 -> 8 -> 1 output, ReLU between layers."""

    def __init__(self):
        super().__init__()
        # The attribute names double as state_dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(in_features=2, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=8)
        self.fc3 = nn.Linear(in_features=8, out_features=1)

    def forward(self, x):
        """Map a (batch, 2) input to a (batch, 1) prediction."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        # No activation on the final layer: the output is an unbounded regression value.
        return self.fc3(hidden)



In [205]:
class RNN(nn.Module):
    """Recurrent regressor that treats each sample as a sequence of length one."""

    def __init__(self, input_size=2, hidden_size=16, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a (batch, input_size) input to a (batch, 1) prediction."""
        # Insert a time axis of length 1 so the input is (batch, seq=1, features).
        sequence = x.unsqueeze(1)
        batch_size = sequence.size(0)
        initial_hidden = torch.zeros(
            self.num_layers, batch_size, self.hidden_size
        ).to(sequence.device)
        rnn_out, _ = self.rnn(sequence, initial_hidden)
        # Regress from the hidden state of the last (here: only) time step.
        last_step = rnn_out[:, -1, :]
        return self.fc(last_step)

In [206]:
class LSTM(nn.Module):
    """LSTM regressor that treats each sample as a sequence of length one."""

    def __init__(self, input_size=2, hidden_size=16, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a (batch, input_size) input to a (batch, 1) prediction."""
        # Insert a time axis of length 1 so the input is (batch, seq=1, features).
        sequence = x.unsqueeze(1)
        state_shape = (self.num_layers, sequence.size(0), self.hidden_size)
        # Hidden and cell state both start at zero, on the same device as the input.
        hidden_state = torch.zeros(*state_shape).to(sequence.device)
        cell_state = torch.zeros(*state_shape).to(sequence.device)
        lstm_out, _ = self.lstm(sequence, (hidden_state, cell_state))
        # Regress from the output of the last (here: only) time step.
        last_step = lstm_out[:, -1, :]
        return self.fc(last_step)

In [207]:
class CNN(nn.Module):
    """1-D convolutional regressor over the feature axis of each sample."""

    def __init__(self, input_size):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=1)
        self.pool = nn.MaxPool1d(kernel_size=1)
        # With kernel size 1 in both conv and pool the length stays `input_size`,
        # so the flattened feature map has 16 * input_size entries.
        self.fc1 = nn.Linear(16 * input_size, 8)
        self.fc2 = nn.Linear(8, 1)

    def forward(self, x):
        """Map a (batch, input_size) input to a (batch, 1) prediction."""
        # Add a channel axis: (batch, 1, input_size).
        with_channel = x.unsqueeze(1)
        activated = torch.relu(self.conv1(with_channel))
        pooled = self.pool(activated)
        flat = pooled.view(pooled.size(0), -1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)

UTILITY FUNCTIONS

In [208]:
def calculate_rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def calculate_mae(y_true, y_pred):
    """Return the mean absolute error between `y_true` and `y_pred`."""
    mae = mean_absolute_error(y_true, y_pred)
    return mae


def calculate_normalized_rmse(y_true, y_pred):
    """Return the RMSE divided by the range (max - min) of `y_true`.

    No guard is applied for a constant `y_true`, whose range is zero.
    """
    rmse = calculate_rmse(y_true, y_pred)
    highest = np.max(y_true)
    lowest = np.min(y_true)
    return rmse / (highest - lowest)



PLOTTING FUNCTIONS

In [209]:
def plot_predictions_with_boundary(y_train, y_test, y_pred, model_name):
    """Plot the last 200 true vs. predicted TAC values and save the figure.

    The training targets are prepended to both series, so whenever the test set
    has fewer than 200 rows the leading part of both curves is the identical
    training data. A horizontal line marks the 0.08 threshold.

    Args:
        y_train: Training targets (tensor).
        y_test: Held-out targets (tensor).
        y_pred: Model predictions for the held-out inputs (tensor).
        model_name: Used in the title and in the output file name.

    The figure is written to comparison_plots/<model_name>_predictions.png and
    then closed; nothing is returned.
    """
    def _flat_numpy(tensor):
        # Detach from autograd and move to host memory before converting.
        return tensor.detach().cpu().numpy().flatten()

    train_values = _flat_numpy(y_train)
    test_values = _flat_numpy(y_test)
    predicted_values = _flat_numpy(y_pred)

    true_series = np.concatenate([train_values, test_values])[-200:]
    predicted_series = np.concatenate([train_values, predicted_values])[-200:]

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(true_series, label='True TAC', linewidth=2)
    ax.plot(predicted_series, label='Predicted TAC', linewidth=2, linestyle='--')
    ax.axhline(0.08, color='red', linestyle='--', label='Sober Boundary')
    ax.set_title(f'{model_name} - Last 200 Steps Prediction')
    ax.set_xlabel('Time Step')
    ax.set_ylabel('TAC Reading')
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(f"comparison_plots/{model_name}_predictions.png")
    plt.close(fig)


def plot_metric_histogram(values, metric_name, model_names):
    """Draw one bar per model for a single metric and save the chart.

    Args:
        values: Metric values, in the same order as `model_names`.
        metric_name: Human-readable metric name; used for the title, the y-axis
            label and (lower-cased, spaces replaced by underscores) the file name.
        model_names: Bar labels.

    The chart is written to comparison_plots/<metric>_histogram.png and then
    closed; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    bar_container = ax.bar(model_names, values, color='skyblue', edgecolor='black')
    ax.set_title(f'{metric_name} Comparison Across Models')
    ax.set_ylabel(metric_name)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Print each value just inside the top edge of its bar.
    for rect in bar_container:
        bar_height = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2.0
        ax.text(x_center, bar_height * 0.98, f'{bar_height:.2f}',
                ha='center', va='top', fontsize=10, color='black')

    fig.tight_layout()
    fig.savefig(f'comparison_plots/{metric_name.replace(" ", "_").lower()}_histogram.png')
    plt.close(fig)


TRAINING AND EVALUATION

In [210]:
def train_model_with_validation(model, X_train, y_train, X_test, y_test, model_name, epochs, lr=0.001):
    """Train `model` with full-batch Adam on MSE and track the held-out loss per epoch.

    Every epoch performs exactly one optimizer step on the whole training set.
    The recorded train loss is the one computed before that step; the recorded
    test loss is computed after it, in eval mode and without gradients. The
    "total" loss is the sample-count-weighted mean of the two.

    Side effects (all inside the folder returned as `model_folder`):
        * learning_curve.csv - per-epoch train/test/total loss
        * <model_name>.pth   - the model's state_dict after the final epoch
        * learning_curve.png - plot of the three loss curves

    Args:
        model: The nn.Module to optimise in place.
        X_train, y_train: Training inputs and targets.
        X_test, y_test: Held-out inputs and targets used for monitoring only.
        model_name: Name used for the output folder, file names and plot title.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.

    Returns:
        (train_losses, test_losses, total_losses, model_folder), where the
        first three are lists of Python floats of length `epochs`.
    """
    # Use the shared helper so the folder naming scheme is defined in one place.
    model_folder = create_model_folder(model_name, epochs)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    n_train, n_test = len(X_train), len(X_test)
    train_losses, test_losses, total_losses = [], [], []

    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        train_loss = criterion(model(X_train), y_train)
        train_loss.backward()
        optimizer.step()
        train_value = train_loss.item()

        model.eval()
        with torch.no_grad():
            test_value = criterion(model(X_test), y_test).item()

        train_losses.append(train_value)
        test_losses.append(test_value)
        # Weight each split's mean loss by its number of samples.
        total_losses.append((train_value * n_train + test_value * n_test) / (n_train + n_test))

    pd.DataFrame({
        'epoch': range(epochs),
        'train_loss': train_losses,
        'test_loss': test_losses,
        'total_loss': total_losses
    }).to_csv(f'{model_folder}/learning_curve.csv', index=False)

    torch.save(model.state_dict(), f'{model_folder}/{model_name}.pth')

    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(test_losses, label='Test Loss')
    plt.plot(total_losses, label='Total Loss')
    plt.title(f'Learning Curve - {model_name}')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'{model_folder}/learning_curve.png')
    plt.close()

    return train_losses, test_losses, total_losses, model_folder

MAIN EXECUTION

In [211]:
if __name__ == "__main__":
    # End-to-end experiment: load the data, train each architecture, score it on
    # the held-out split, and write per-model and cross-model artefacts to disk.
    # model_classes and epochs_map are assigned here and are read again by the
    # metrics-summary code further down in this notebook.

    X_tensor, y_tensor = load_and_prepare_data()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    X_tensor, y_tensor = X_tensor.to(device), y_tensor.to(device)
    
    # 80/20 split seeded with the notebook-wide seed.
    X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=seed)
    # NOTE(review): the tensors were already moved to `device` above, so this
    # second move looks redundant - confirm before removing.
    X_train, y_train, X_test, y_test = X_train.to(device), y_train.to(device), X_test.to(device), y_test.to(device)

    # Factories rather than instances, so each model is only constructed when its
    # turn in the loop comes. CNN needs the number of feature columns.
    model_classes = {
        'FNN': lambda: FNN(),
        'RNN': lambda: RNN(),
        'LSTM': lambda: LSTM(),
        'CNN': lambda: CNN(X_tensor.shape[1])
    }

    # Per-model epoch budget; also part of each model's output folder name.
    epochs_map = {
        'FNN': 300,
        'RNN': 300,
        'LSTM': 300,
        'CNN': 300
    }

    # Parallel lists (one entry per model, in training order) feeding the bar charts.
    rmse_scores = []
    mae_scores = []
    nrmse_scores = []
    model_names = []
    sobriety_accuracies = []
    pearson_corrs = []
    # One dict per model; becomes the cross-model comparison table.
    model_comparison_metrics = []

    for model_name in model_classes.keys():
        print(f"\nTraining {model_name} with validation for {epochs_map[model_name]} epochs...")
        model = model_classes[model_name]().to(device)

        # The test split is passed as the monitoring set during training.
        train_losses, test_losses, total_losses, model_folder = train_model_with_validation(
            model, X_train, y_train, X_test, y_test, model_name, epochs=epochs_map[model_name]
        )

        # Final evaluation of the trained model on the held-out split.
        with torch.no_grad():
            model.eval()
            y_pred = model(X_test)
            y_pred_cpu = y_pred.detach().cpu().numpy().flatten()
            y_test_cpu = y_test.detach().cpu().numpy().flatten()

            # Binarise at 0.08: readings at or above the threshold form class 1.
            y_true_class = (y_test_cpu >= 0.08).astype(int)
            y_pred_class = (y_pred_cpu >= 0.08).astype(int)
            sobriety_accuracy = accuracy_score(y_true_class, y_pred_class)

            rmse = calculate_rmse(y_test_cpu, y_pred_cpu)
            mae = calculate_mae(y_test_cpu, y_pred_cpu)
            nrmse = calculate_normalized_rmse(y_test_cpu, y_pred_cpu)
            pearson_corr, _ = pearsonr(y_test_cpu, y_pred_cpu)
            pearson_corrs.append(pearson_corr)

            rmse_scores.append(rmse)
            mae_scores.append(mae)
            nrmse_scores.append(nrmse)
            sobriety_accuracies.append(sobriety_accuracy)
            model_names.append(model_name)

            # Raw predictions are persisted so metrics can be recomputed later
            # without retraining.
            pd.DataFrame({
                'True_TAC': y_test_cpu,
                'Predicted_TAC': y_pred_cpu
            }).to_csv(f"{model_folder}/predictions_vs_actuals.csv", index=False)

            # Per-model metrics file. Note: it contains NRMSE but not the Pearson
            # correlation, whereas the comparison table below has Pearson but
            # not NRMSE.
            pd.DataFrame([{
                'RMSE': rmse,
                'MAE': mae,
                'NRMSE': nrmse,
                'Sobriety_Accuracy': sobriety_accuracy
            }]).to_csv(f"{model_folder}/metrics.csv", index=False)

            # Append model metrics for comparison
            model_comparison_metrics.append({
                'Model': model_name,
                'RMSE': rmse,
                'MAE': mae,
                'Pearson_Corr': pearson_corr,
                'Sobriety_Accuracy': sobriety_accuracy
            })

            print(f"{model_name} Sobriety Accuracy: {sobriety_accuracy:.4f}")

        plot_predictions_with_boundary(y_train, y_test, y_pred, model_name)

    comparison_df = pd.DataFrame(model_comparison_metrics)


    comparison_df.to_csv("comparison_plots/model_comparison_metrics.csv", index=False)

    # One bar chart per metric, each saved under comparison_plots/.
    plot_metric_histogram(rmse_scores, 'RMSE', model_names)
    plot_metric_histogram(mae_scores, 'MAE', model_names)
    plot_metric_histogram(nrmse_scores, 'Normalized RMSE', model_names)
    plot_metric_histogram(sobriety_accuracies, 'Sobriety Classification Accuracy', model_names)
    plot_metric_histogram(pearson_corrs, 'Pearson Correlation', model_names)


Training FNN with validation for 300 epochs...
FNN Sobriety Accuracy: 0.6944

Training RNN with validation for 300 epochs...
RNN Sobriety Accuracy: 0.6944

Training LSTM with validation for 300 epochs...
LSTM Sobriety Accuracy: 0.6944

Training CNN with validation for 300 epochs...
CNN Sobriety Accuracy: 0.6944


METRICS SUMMARY

In [212]:
# Rebuild the cross-model comparison table from the prediction CSVs that were
# written during training, so the summary can be regenerated without retraining.
model_comparison_metrics = []

for model_name in model_classes:
    model_folder = f'model_{model_name}_{epochs_map[model_name]}_epochs'
    predictions_df = pd.read_csv(f"{model_folder}/predictions_vs_actuals.csv")

    true_tac = predictions_df['True_TAC']
    predicted_tac = predictions_df['Predicted_TAC']

    # Binarise at 0.08: readings at or above the threshold form class 1.
    predictions_df['True_Class'] = (true_tac >= 0.08).astype(int)
    predictions_df['Pred_Class'] = (predicted_tac >= 0.08).astype(int)

    rmse = np.sqrt(mean_squared_error(true_tac, predicted_tac))
    mae = mean_absolute_error(true_tac, predicted_tac)
    pearson_corr, _ = pearsonr(true_tac, predicted_tac)
    sobriety_accuracy = accuracy_score(predictions_df['True_Class'], predictions_df['Pred_Class'])

    model_comparison_metrics.append(dict(
        Model=model_name,
        RMSE=rmse,
        MAE=mae,
        Pearson_Corr=pearson_corr,
        Sobriety_Accuracy=sobriety_accuracy,
    ))

comparison_df = pd.DataFrame(model_comparison_metrics)
print("\n[Metrics Comparison Across Models]")
print(comparison_df)

comparison_df.to_csv("comparison_plots/model_comparison_metrics.csv", index=False)


[Metrics Comparison Across Models]
  Model      RMSE       MAE  Pearson_Corr  Sobriety_Accuracy
0   FNN  0.054140  0.043892      0.141715           0.694444
1   RNN  0.054580  0.043741      0.034549           0.694444
2  LSTM  0.054472  0.043760      0.110622           0.694444
3   CNN  0.054466  0.043567      0.065686           0.694444
