# MODELS

In [None]:
import time
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


from pathlib import Path
import os
import ast
import h5py

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, precision_recall_curve, average_precision_score, auc
from torchsummary import summary
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from load_data import load_dataset, load_embeddings

In [None]:
# Sub-ontology handled by this notebook
sub_ontology = 'biological_process'

# Suffix appended to every cached file name ('' selects the un-suffixed files)
n_test = ''


training_data_path = Path('../data/train')
test_data_path = Path('../data/test')
baseline_data_path = Path('../data/baseline')
datasets_path = Path('../data/datasets')
models_path = Path('../data/models')
plots_path = Path('../data/plots')


# Load data

X_path = datasets_path / f'X_df_BP{n_test}.csv'
y_path = datasets_path / f'y_df_BP{n_test}.csv'
X_test_path = datasets_path / f'X_test_df_BP{n_test}.csv'
y_test_path = datasets_path / f'y_test_df_BP{n_test}.csv'


if X_path.exists() and y_path.exists() and X_test_path.exists() and y_test_path.exists():

    columns_to_convert = ['embeddings', 'reduced_embeddings']

    # Training data
    X_df_BP = pd.read_csv(X_path)

    # Each label cell is stored as the text of a Python literal, so it is parsed back
    # with ast.literal_eval. squeeze("columns") always yields a Series, even for one row.
    y_df_BP = pd.read_csv(y_path).squeeze("columns").apply(ast.literal_eval)

    # Test data
    X_test_df_BP = pd.read_csv(X_test_path)

    # Parse the *test* labels here. Parsing y_df_BP a second time would hand
    # already-parsed objects to ast.literal_eval and leave the test labels as raw strings.
    y_test_df_BP = pd.read_csv(y_test_path).squeeze("columns").apply(ast.literal_eval)

else:

    X_df_BP, y_df_BP, X_test_df_BP, y_test_df_BP = load_dataset(training_data_path, test_data_path, sub_ontology)

    # Make sure the cache folder exists before writing into it
    datasets_path.mkdir(parents=True, exist_ok=True)
    X_df_BP.to_csv(X_path, index=False)
    y_df_BP.to_csv(y_path, index=False)
    # NOTE(review): the test frames are not cached, so the branch above is only taken
    # when the two test CSV files were produced some other way.
    # X_test_df_BP.to_csv(X_test_path, index=False)
    # y_test_df_BP.to_csv(y_test_path, index=False)


# Extract embeddings
train_embeddings_path = training_data_path / 'train_embeddings.h5'

train_embeddings = load_embeddings(train_embeddings_path)

# Right join: keep exactly the IDs present in X_df_BP, attaching their embeddings
X_df_BP = pd.merge(train_embeddings, X_df_BP["ID"], on='ID', how='right')


X = X_df_BP['embeddings']  # Extract embeddings
X = pd.DataFrame(X.tolist())  # One column per embedding dimension

# Preprocess target (MultiLabelBinarizer for multi-label classification)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_df_BP)
y_tags = mlb.classes_

print(f"X shape: {X.shape} \ny shape: {y.shape}\n")

# Split data (hold out 20% of the training set, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build float32 PyTorch tensors from the train/test split.
# Feature tensors are created without a device move here; only the label
# tensors are sent to `device`.
X_train_tensor, X_test_tensor = (
    torch.tensor(features.to_numpy(), dtype=torch.float32)
    for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.float32).to(device)
    for labels in (y_train, y_test)
)

In [None]:
# X_prova = X_train_tensor[:5000]
# y_prova = y_train_tensor[:5000]

# X_prova_test = X_test_tensor[:5000]
# y_prova_test = y_test_tensor[:5000]

# X_train_tensor = X_prova
# y_train_tensor = y_prova.to(device)
# X_test_tensor = X_prova_test
# y_test_tensor = y_prova_test.to(device)

## Models

In [None]:
class NN(nn.Module):
    """Fully connected network for multilabel classification.

    Every hidden layer is a Linear -> ReLU -> Dropout triple; the output layer
    is a Linear followed by a Sigmoid, so the forward pass returns values in (0, 1).

    Args:
        input_size: number of input features.
        hidden_sizes: width of each hidden layer, in order.
        dropouts: dropout probability for each hidden layer (indexed by layer position).
        output_size: number of output labels.
    """

    def __init__(self, input_size, hidden_sizes, dropouts, output_size):
        super().__init__()

        # widths[k] -> widths[k + 1] describes the k-th hidden Linear layer
        widths = [input_size, *hidden_sizes]

        blocks = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            blocks.append(nn.Linear(fan_in, fan_out))
            blocks.append(nn.ReLU())
            blocks.append(nn.Dropout(dropouts[position]))

        # Output head: one sigmoid-activated unit per label
        blocks.append(nn.Linear(widths[-1], output_size))
        blocks.append(nn.Sigmoid())

        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        """Run `x` through the stacked layers and return the sigmoid outputs."""
        return self.model(x)

##### Creating/Extracting the models DataFrame

In [None]:
# File names (all suffixed with n_test) and their locations under datasets_path

filename_df = f'NN_BP{n_test}.csv'
filename_results = f'NN_BP_results{n_test}.csv'
filename_params = f'NN_BP_params{n_test}.csv'
filename_probs = f'BP_prob{n_test}.csv'
filename_y_pred = f'NN_BP_y_pred{n_test}.csv'

nn_df_path, results_path, params_path, probs_path, y_pred_path = (
    datasets_path / file_name
    for file_name in (filename_df, filename_results, filename_params,
                      filename_probs, filename_y_pred)
)


if not nn_df_path.exists():
    # No stored table yet: make sure the folder exists and start from an empty frame
    if not datasets_path.exists():
        datasets_path.mkdir(parents=True, exist_ok=True)

    nn_columns = [
        'name', 'hidden_sizes', 'dropouts', 'lr', 'batch_size', 'epochs',
        'macro_P', 'macro_R', 'macro_F1',
        'weighted_P', 'weighted_R', 'weighted_F1',
        'samples_P', 'samples_R', 'samples_F1',
        'training_time', 'probs',
    ]
    NN_BP = pd.DataFrame({column: [] for column in nn_columns})

else:
    # Stored table found: reload it and restore the column types lost in the CSV
    NN_BP = pd.read_csv(nn_df_path)

    for list_column in ('hidden_sizes', 'dropouts'):
        NN_BP[list_column] = NN_BP[list_column].apply(ast.literal_eval)

    for int_column in ('batch_size', 'epochs'):
        NN_BP[int_column] = NN_BP[int_column].astype(int)

    NN_BP['lr'] = NN_BP['lr'].astype(float)


NN_BP.head()

##### Filling NN_BP with model parameters

In [None]:
def insert_parameters(df, names, hidden_sizes_list, dropouts_list, lrs, batch_sizes, epochs_list):
    """Append one row of hyper-parameters per new model name.

    The argument lists are consumed in lockstep with ``zip``, so the shortest
    list decides how many models are considered. A name that is already in
    ``df['name']`` (or that appears twice in ``names``) is reported and skipped.

    Args:
        df: DataFrame with at least a ``name`` column; it is not modified.
        names: model names.
        hidden_sizes_list: one list of hidden-layer widths per model.
        dropouts_list: one list of dropout probabilities per model.
        lrs: one learning rate per model.
        batch_sizes: one batch size per model.
        epochs_list: one epoch count per model.

    Returns:
        ``df`` itself when nothing was added, otherwise a new DataFrame with the
        added rows and integer ``epochs`` / ``batch_size`` columns.
    """
    known_names = set(df['name'].values)
    new_rows = []

    for name, hidden_sizes, dropouts, lr, batch_size, epochs in zip(
            names, hidden_sizes_list, dropouts_list, lrs, batch_sizes, epochs_list):
        if name in known_names:
            print(f"{name} already exists in the DataFrame.")
            continue

        known_names.add(name)
        new_rows.append({
            'name': name,
            'hidden_sizes': hidden_sizes,
            'dropouts': dropouts,
            'lr': lr,
            'batch_size': batch_size,
            'epochs': epochs,
        })

    if not new_rows:
        return df

    # One concat for all new rows instead of one per row
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

    # Cast the frame being returned (not a global), so the result has integer
    # columns even when the input frame was empty.
    df['epochs'] = df['epochs'].astype(int)
    df['batch_size'] = df['batch_size'].astype(int)

    return df

In [None]:
def _config_key(values):
    """Return a hashable key for one hyper-parameter configuration."""
    return tuple(
        tuple(value) if isinstance(value, (list, tuple, np.ndarray)) else value
        for value in values
    )


def create_parameters(df, hidden_sizes, dropouts, lrs, batch_sizes, epochs):
    """Append one row for every combination of the given hyper-parameter values.

    Combinations are generated with ``hidden_sizes`` varying slowest and
    ``epochs`` varying fastest.

    Args:
        df: DataFrame to extend; it is not modified.
        hidden_sizes: candidate lists of hidden-layer widths.
        dropouts: candidate dropouts. A scalar is repeated once per hidden layer
            of the combination it is paired with; a list is stored as given.
        lrs: candidate learning rates.
        batch_sizes: candidate batch sizes (stored as ``int``).
        epochs: candidate epoch counts (stored as ``int``).

    Returns:
        ``df`` itself when there is no combination, otherwise a new DataFrame
        with the generated rows appended. The rows carry no ``name`` value.
    """
    from itertools import product  # local import keeps the cell self-contained

    new_rows = []
    for hidden_size, dropout, lr, batch_size, epoch in product(
            hidden_sizes, dropouts, lrs, batch_sizes, epochs):
        if np.isscalar(dropout):
            # One dropout value per hidden layer
            dropout = [dropout] * len(hidden_size)

        new_rows.append({
            'hidden_sizes': hidden_size,
            'dropouts': dropout,
            'lr': lr,
            'batch_size': int(batch_size),
            'epochs': int(epoch),
        })

    if not new_rows:
        return df

    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)


def insert_parameters1(df, hidden_sizes_list, dropouts_list, lr_list, batch_sizes_list, epochs_list):
    """Append unnamed hyper-parameter rows, pairing the argument lists in lockstep.

    This is the name-less counterpart of ``insert_parameters``: since the rows
    have no name, a row is treated as already present when its five
    hyper-parameter values match an existing row (or an earlier row of the same
    call). Such rows are reported and skipped.

    Args:
        df: DataFrame to extend; it is not modified.
        hidden_sizes_list: one list of hidden-layer widths per row.
        dropouts_list: one list of dropout probabilities per row.
        lr_list: one learning rate per row.
        batch_sizes_list: one batch size per row (stored as ``int``).
        epochs_list: one epoch count per row (stored as ``int``).

    Returns:
        ``df`` itself when nothing was added, otherwise a new DataFrame with the
        added rows.
    """
    config_columns = ['hidden_sizes', 'dropouts', 'lr', 'batch_size', 'epochs']

    # Configurations already stored in df (none if a column is missing)
    if all(column in df.columns for column in config_columns):
        seen = {_config_key(values) for values in zip(*(df[column] for column in config_columns))}
    else:
        seen = set()

    new_rows = []
    for hidden_sizes, dropouts, lr, batch_size, epochs in zip(
            hidden_sizes_list, dropouts_list, lr_list, batch_sizes_list, epochs_list):
        row = {
            'hidden_sizes': hidden_sizes,
            'dropouts': dropouts,
            'lr': lr,
            'batch_size': int(batch_size),
            'epochs': int(epochs),
        }

        key = _config_key(row[column] for column in config_columns)
        if key in seen:
            print(f"Configuration {row} already exists in the DataFrame.")
            continue

        seen.add(key)
        new_rows.append(row)

    if not new_rows:
        return df

    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

In [None]:
# hidden_sizes = [[4096, 2048],   #NN1
#                 [4096, 2048],   #NN2
#                 [4096, 2048],   #NN3
#                 [2048, 1024],   #NN4
#                 [2048, 1024],   #NN5
#                 [2048, 1024],   #NN6
#                 [4096, 2048, 1024],   #NN7
#                 [4096, 2048, 2048],   #NN8
#                 [2048, 2048, 2048],   #NN9
#                 [2048, 4096, 2048],   #NN10
#                 [4096, 4096, 2048],   #NN11        
#                 [4096, 4096, 2048]]   #NN12    


# dropouts = [0.1, 0.2, 0.3]


# lrs = [0.1, 0.01, 0.001, 0.03, 0.003, 0.05, 0.005]


# batch_sizes = [512, 1024, 2048]

# epochs = [75, 100, 150]

# Only ten names are listed while twelve configurations are defined below;
# insert_parameters pairs the lists with zip, so the NN11 and NN12 rows are not inserted.
names = [f'NN{number}' for number in range(1, 11)]


# One row per candidate network:
# (hidden layer sizes, dropout per hidden layer, learning rate, batch size, epochs)
model_configs = [
    ([4096, 2048],       [0.1, 0.1],      0.001, 1024, 150),  # NN1
    ([4096, 2048],       [0.2, 0.2],      0.003, 2048, 75),   # NN2
    ([4096, 2048],       [0.4, 0.4],      0.005, 1024, 100),  # NN3
    ([2048, 1024],       [0.1, 0.1],      0.001, 2048, 75),   # NN4
    ([2048, 1024],       [0.2, 0.2],      0.003, 1024, 75),   # NN5
    ([2048, 1024],       [0.4, 0.4],      0.005, 1024, 100),  # NN6
    ([4096, 2048, 1024], [0.1, 0.2, 0.3], 0.005, 2048, 75),   # NN7
    ([4096, 2048, 2048], [0.2, 0.2, 0.2], 0.003, 1024, 75),   # NN8
    ([2048, 2048, 2048], [0.2, 0.2, 0.2], 0.003, 1024, 75),   # NN9
    ([2048, 4096, 2048], [0.3, 0.3, 0.3], 0.005, 2048, 75),   # NN10
    ([4096, 4096, 2048], [0.1, 0.2, 0.3], 0.003, 2048, 75),   # NN11
    ([4096, 4096, 2048], [0.1, 0.2, 0.3], 0.001, 1024, 75),   # NN12
]

# Split the table back into one list per hyper-parameter
hidden_sizes, dropouts, lrs, batch_sizes, epochs = (
    list(column) for column in zip(*model_configs)
)

# epochs = [15,  #NN1 
#           15,  #NN2
#           15]#, #NN3
#        #    25,  #NN4
#        #    25,  #NN5
#        #    25, #NN6
#        #    25,  #NN7
#        #    25,  #NN8
#        #    25,  #NN9
#        #    25,  #NN10
#        #    25,  #NN11
#        #    25]  #NN12


# Register every configuration whose name is not yet in NN_BP
NN_BP = insert_parameters(
    df=NN_BP,
    names=names,
    hidden_sizes_list=hidden_sizes,
    dropouts_list=dropouts,
    lrs=lrs,
    batch_sizes=batch_sizes,
    epochs_list=epochs,
)

NN_BP.head()

##### Training the models

In [None]:
def create_model(df, name, input_size, output_size):
    """Build the network, optimizer and loss for the model called `name`.

    The hidden sizes, dropouts and learning rate are read from the first row of
    `df` whose ``name`` column equals `name`.

    Returns:
        (model, optimizer, criterion): the NN moved to `device`, an Adam
        optimizer over its parameters, and a BCELoss instance.
    """
    # Rows describing this model; the first match supplies the hyper-parameters
    config = df[df['name'] == name]

    network = NN(
        input_size=input_size,
        hidden_sizes=config['hidden_sizes'].values[0],
        dropouts=config['dropouts'].values[0],
        output_size=output_size,
    )
    network = network.to(device)

    # Binary cross-entropy matches the sigmoid outputs of the multilabel network
    loss_fn = nn.BCELoss()
    adam = optim.Adam(network.parameters(), lr=config['lr'].values[0])

    return network, adam, loss_fn

In [None]:
def train_model(model, name, X_train, y_train, optimizer, criterion, batch_size, epochs):
    """Train `model` with sequential mini-batches and save its weights.

    The data is moved to the device the model's parameters live on and is
    walked in order (no shuffling) in slices of `batch_size`. After training
    the state dict is written to ``models_path / f'{name}_BP{n_test}.pt'``.

    Args:
        model: network to train; it must have at least one parameter.
        name: model name, used in log messages and in the saved file name.
        X_train: feature tensor.
        y_train: target tensor, aligned with `X_train` on the first dimension.
        optimizer: optimizer bound to the model's parameters.
        criterion: loss called as ``criterion(outputs, targets)``.
        batch_size: number of samples per optimization step.
        epochs: number of passes over the training data.

    Returns:
        Training time in minutes.
    """
    # Determine the device (model should already be on this device)
    device = next(model.parameters()).device

    print(f"\nTraining {name} model:")

    # Move training data to the device
    X_train = X_train.to(device)
    y_train = y_train.to(device)

    model.train()
    start_time = time.time()

    for epoch in range(epochs):
        loss = None  # stays None when there is no batch to train on

        for start in range(0, len(X_train), batch_size):
            X_batch = X_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Report the first epoch, every 10th epoch and the last one, always with
        # the 1-based epoch number. The loss shown is that of the epoch's last batch.
        epoch_number = epoch + 1
        should_log = epoch_number == 1 or epoch_number % 10 == 0 or epoch_number == epochs
        if loss is not None and should_log:
            elapsed = time.time() - start_time
            print(f"Epoch [{epoch_number}/{epochs}], Loss: {loss.item():.4f} - time: {(elapsed/60):.2f}min")

    training_time = time.time() - start_time

    # Persist the trained weights
    models_path.mkdir(parents=True, exist_ok=True)
    model_path = models_path / f'{name}_BP{n_test}.pt'
    torch.save(model.state_dict(), model_path)

    return training_time / 60

In [None]:
def model_evaluation(df, name, model, X, y, target_names, training_time, threshold=0.5):
    """Evaluate `model` on (X, y) and store its metrics in the row named `name`.

    The model output is used directly as the label probabilities: the networks
    built in this notebook already end with a Sigmoid layer, so no further
    sigmoid is applied.

    Args:
        df: results table with a ``name`` column; matching rows are updated in place.
        name: model name to update.
        model: trained network returning one probability per label.
        X: feature tensor, on the same device as the model.
        y: tensor of true 0/1 labels.
        target_names: label names passed to ``classification_report``.
        training_time: value stored in the ``training_time`` column.
        threshold: probability above which a label is predicted (default 0.5).

    Returns:
        (df, y_pred, probs): the table, the 0/1 prediction array (``None`` when
        `name` is not in the table) and the CPU tensor of probabilities.
    """
    model.eval()
    with torch.no_grad():
        probs = model(X).cpu()

    # Boolean mask of the rows to update; empty mask means the model is unknown
    row_mask = df['name'] == name

    if not row_mask.any():
        print(f"{name} was not found in the DataFrame; no metrics were stored.")
        return df, None, probs

    y_pred = (probs.numpy() > threshold).astype(int)

    report = classification_report(y.cpu().numpy(), y_pred, target_names=target_names, output_dict=True)

    # Copy the three averaged rows of the report into <average>_<metric> columns
    averages = (('macro', 'macro avg'), ('weighted', 'weighted avg'), ('samples', 'samples avg'))
    scores = (('P', 'precision'), ('R', 'recall'), ('F1', 'f1-score'))
    for prefix, report_row in averages:
        for suffix, score in scores:
            df.loc[row_mask, f'{prefix}_{suffix}'] = round(report[report_row][score], 3)

    df.loc[row_mask, 'training_time'] = training_time

    return df, y_pred, probs

In [None]:
# String form of every class label in mlb.classes_
target_names = list(map(str, mlb.classes_))


# Start from the stored probabilities/predictions when the probabilities file
# is present; otherwise collect them per model name in plain dicts.
if not probs_path.exists():
    probs = {}
    y_pred = {}
else:
    probs = pd.read_csv(probs_path)
    y_pred = pd.read_csv(y_pred_path)


for name in NN_BP['name']:

    # A fresh network, optimizer and loss are built for every model name
    model, optimizer, criterion = create_model(NN_BP, name, X_train_tensor.shape[1], y_train_tensor.shape[1])

    model_row = NN_BP[NN_BP['name'] == name]
    batch_size = int(model_row['batch_size'].values[0])
    epochs = int(model_row['epochs'].values[0])

    # A NaN training time marks a model that still has to be trained
    needs_training = np.isnan(model_row['training_time'].values[0])

    if needs_training:

        filename_model = f'{name}_BP.pth'
        model_path = models_path / filename_model

        # Train the model
        training_time = train_model(model, name, X_train_tensor, y_train_tensor, optimizer, criterion, batch_size, epochs)

        # Keep the test features on the same device as the model
        device = next(model.parameters()).device
        X_test_tensor = X_test_tensor.to(device)

        # Evaluate on the held-out split and record the metrics
        NN_BP, y_pred[name], probs[name] = model_evaluation(NN_BP, name, model, X_test_tensor, y_test_tensor, y_tags, training_time)

        # Save the trained weights
        if not models_path.exists():
            models_path.mkdir(parents=True, exist_ok=True)

        torch.save(model.state_dict(), model_path)

    # Write the table after every model so finished results survive an interruption
    NN_BP.to_csv(nn_df_path, index=False)


# probs = pd.DataFrame(probs)

NN_BP.head()

##### Extracting results

In [None]:
# Sort keys, in priority order; the same columns form the results table
metrics = ['weighted_F1', 'weighted_P', 'weighted_R',
           'macro_F1', 'macro_P', 'macro_R']

param_columns = ['name', 'hidden_sizes', 'dropouts', 'lr',
                 'batch_size', 'epochs', 'training_time']

# Rank all models once (best first) and keep the five best rows of each view
ranked = NN_BP.sort_values(by=metrics, ascending=False)

NN_BP_results = ranked[metrics][:5]
NN_BP_params = ranked[param_columns][:5]

NN_BP_params.head(), NN_BP_results.head()

In [None]:
# Labels NN1, NN2, ... assigned by position in the ranked tables
row_names = ['NN' + str(rank) for rank in range(1, len(NN_BP_results) + 1)]

# Write the positional labels into the 'name' column of both tables (this replaces
# the names NN_BP_params carried over from NN_BP) and save each table as CSV
for table, csv_path in ((NN_BP_results, results_path), (NN_BP_params, params_path)):
    table['name'] = row_names
    table.to_csv(csv_path, index=False)

# Show both tables
NN_BP_results.head(), NN_BP_params.head()

In [None]:
def plot_precision_recall_curves(df, X_test, y_test):
    """Plot one precision-recall curve per model listed in `df` and save the figure.

    For each name, the network is rebuilt from its row in `df` (so its layers
    match the saved weights), the weights are loaded from
    ``models_path / f'{name}_BP.pth'`` and the model is scored on `X_test`.
    The curve is computed over all flattened (sample, label) pairs; the value in
    the legend is the weighted average precision. Models without a saved
    weights file are reported and skipped.

    Args:
        df: table with ``name``, ``hidden_sizes``, ``dropouts`` and ``lr`` columns.
        X_test: feature tensor.
        y_test: true 0/1 labels, as a tensor (any device) or array-like.
    """
    plt.figure(figsize=(8, 6))

    # scikit-learn works on CPU NumPy arrays
    if torch.is_tensor(y_test):
        y_true = y_test.detach().cpu().numpy()
    else:
        y_true = np.asarray(y_test)

    for name in df['name']:

        filename_model = f'{name}_BP.pth'
        model_path = models_path / filename_model

        if not model_path.exists():
            print(f"No saved weights for {name} at {model_path}; skipping.")
            continue

        # Build a network with this model's own architecture before loading its weights
        model, _, _ = create_model(df, name, X_test.shape[1], y_true.shape[1])
        model_device = next(model.parameters()).device

        model.load_state_dict(torch.load(model_path, map_location=model_device))
        model.eval()

        with torch.no_grad():
            y_pred_prob = model(X_test.to(model_device)).cpu().numpy()

        # Curve over every (sample, label) pair pooled together
        precision, recall, _ = precision_recall_curve(y_true.ravel(), y_pred_prob.ravel())
        avg_precision = average_precision_score(y_true, y_pred_prob, average="weighted")

        plt.plot(recall, precision, label=f'{name} (AP={avg_precision:.4f})')

    plt.xlabel("Recall")
    plt.ylabel("Precision")
    # The figure holds one curve per model, so the title names no single model
    plt.title("Precision-Recall Curves")
    plt.legend(loc="best")
    plt.grid()


    filename_plot = f'NN_PC_curve_BP.png'
    plot_path = plots_path / filename_plot

    plots_path.mkdir(parents=True, exist_ok=True)

    plt.savefig(plot_path)
    plt.show()


# Plot the precision-recall curve
plot_precision_recall_curves(NN_BP, X_test_tensor, y_test_tensor)

In [None]:
def show_probabilities(y_pred, probs, ids, y_tags):
    """List the predicted (ID, GO term) pairs with the highest probabilities.

    Args:
        y_pred: per-sample arrays of 0/1 predictions.
        probs: per-sample probabilities, indexed as ``probs[sample][label]``.
        ids: identifier of each sample, aligned with `y_pred`.
        y_tags: label name for each column of `y_pred`.

    Returns:
        DataFrame with columns ``ID``, ``GO_term`` and ``probabilities``: one row
        per label predicted as 1, sorted by decreasing probability and limited
        to the first 500 rows.
    """
    # One record per positive prediction, in sample order then label order
    records = [
        (ids[sample], y_tags[label], float(probs[sample][label]))
        for sample, predicted in enumerate(y_pred)
        for label in np.where(predicted == 1)[0]
    ]

    table = pd.DataFrame(records, columns=['ID', 'GO_term', 'probabilities'])

    return table.sort_values(by='probabilities', ascending=False).head(500)

indexes = X_test.index.to_list()    # Indexes of the test data
ids = X_df_BP.loc[indexes, 'ID'].to_list()  # Protein IDs

for name in NN_BP['name']:
    # Predictions exist only for models evaluated (or loaded) earlier; skip the rest
    # instead of failing on a missing key or a None prediction.
    if name not in y_pred or name not in probs or y_pred[name] is None:
        print(f"No predictions available for {name}; skipping its submission file.")
        continue

    filename_submission = f'{name}_BP_submission{n_test}.csv'
    submission_path = datasets_path / filename_submission

    p = show_probabilities(y_pred[name], probs[name], ids, y_tags)

    # Group rows by protein ID and order each protein's terms by decreasing
    # probability. A plain multi-key sort gives this ordering and keeps the ID column.
    p_sorted = p.sort_values(by=['ID', 'probabilities'], ascending=[True, False],
                             kind='stable', ignore_index=True)
    p_sorted.to_csv(submission_path, index=False)

##### LaTeX tables

In [None]:
def generate_latex_table(df):
    """Render the metric columns of `df` as a LaTeX ``tabular``.

    The table has one column per entry of ``df['name']`` and one row per metric,
    with values formatted to two decimals. Metrics whose column is absent from
    `df` are left out, so a table holding only some of the metric columns can
    be rendered.

    Args:
        df: DataFrame with a ``name`` column and any of the ``macro_*``,
            ``weighted_*`` and ``samples_*`` metric columns.

    Returns:
        The LaTeX source as a string (no trailing newline).
    """
    # Column in df -> row label in the table, in display order
    metric_labels = {
        'macro_P': 'Macro P',
        'macro_R': 'Macro R',
        'macro_F1': 'Macro F1',
        'weighted_P': 'Weighted P',
        'weighted_R': 'Weighted R',
        'weighted_F1': 'Weighted F1',
        'samples_P': 'Samples P',
        'samples_R': 'Samples R',
        'samples_F1': 'Samples F1',
    }

    model_names = list(df['name'])

    lines = [
        "\\begin{tabular}{l" + "c" * len(model_names) + "}",
        "    \\toprule",
        "    \\textit{Metric} & " + " & ".join(f"\\textit{{{name}}}" for name in model_names) + " \\\\",
        "    \\midrule",
    ]

    for column, label in metric_labels.items():
        if column not in df.columns:
            continue  # metric not available in this table
        cells = [f"{value:.2f}" for value in df[column]]
        lines.append(f"    {label} & " + " & ".join(cells) + " \\\\")

    lines.append("    \\bottomrule")
    lines.append("\\end{tabular}")

    return "\n".join(lines)


def generate_parameters_latex_table(df):
    """Render the hyper-parameters in `df` as a LaTeX ``table`` environment.

    Each row of `df` becomes one table row built from its ``name``,
    ``hidden_sizes``, ``dropouts``, ``lr``, ``batch_size`` and ``epochs``
    values, followed by a constant ``None`` in the Regularization column.

    Returns:
        The LaTeX source as a string (no trailing newline).
    """
    fields = ('name', 'hidden_sizes', 'dropouts', 'lr', 'batch_size', 'epochs')

    # The table does not read a regularization value from df; the column is constant
    regularization = 'None'

    opening = [
        "\\begin{table}[H]",
        "    \\centering",
        "    \\begin{tabular}{lcccccc}",
        "        \\toprule",
        "        \\textit{Model} & \\textit{Hidden Layer Sizes} & \\textit{Dropout} & \\textit{Learning Rate} & \\textit{Batch Size} & \\textit{Epochs} & \\textit{Regularization} \\\\",
        "        \\midrule",
    ]

    body = []
    for _, row in df.iterrows():
        cells = [f"{row[field]}" for field in fields]
        cells.append(regularization)
        body.append("        " + " & ".join(cells) + " \\\\")

    closing = [
        "        \\bottomrule",
        "    \\end{tabular}",
        "    \\caption{Model Parameters.}",
        "\\end{table}",
    ]

    return "\n".join(opening + body + closing)

# Print the metrics table first, then the parameters table
for build_table, source_frame in (
    (generate_latex_table, NN_BP_results),
    (generate_parameters_latex_table, NN_BP_params),
):
    latex_table = build_table(source_frame)
    print(latex_table)
