### Libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Align
from Bio import AlignIO
from Bio.Align import substitution_matrices
from Bio.Data import IUPACData
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.SeqRecord import SeqRecord
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

'''import cafaeval
from cafaeval.evaluation import cafa_eval
from cafaeval.parser import obo_parser, gt_parser'''

from pathlib import Path
import os
import ast
import h5py

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Creating the path to the main folder we are going to use to create our datasets

In [2]:
# Path to train data
training_data_path = Path('../data/train')

# Path to test data
test_data_path = Path('../data/test')

# Path to baseline data
baseline_data_path = Path('../data/baseline')


# Training Set

In this section we are **creating the Training Set**, fusing togheter the data from:
- _train_set.tsv_
- _train_ids.txt_
- _train.fasta_
- _train_embeddings.h5_
- _train_protein2ipr.dat_
- _go-basic.obo_

### Extracting `train_set.tsv`

In [None]:
train_set = pd.read_csv(training_data_path / 'train_set.tsv', sep='\t')

# Rename Protein_ID and aspect columns
train_set.rename(columns={'Protein_ID': 'ID', 'aspect' : 'sub_ontology'}, inplace=True)

# Display the first few rows of the train set
train_set.head()

### Extracting `train_ids.txt`

In [None]:
# Extracting train_ids.txt
with open(training_data_path / 'train_ids.txt', 'r') as file:
    train_ids = file.read().splitlines()

# Display the first few IDs to verify
print(train_ids[:5])

In [None]:
if len(train_ids) == len(train_set['ID'].unique()):
    print(f"The number of IDs in train_ids.txt is equal to the number of unique IDs in the train set ({len(train_ids)}).\n"
          "Proceeding with the analysis.")

### Extracting `train.fasta`

In [None]:
train_fasta_list = list(SeqIO.parse(training_data_path / 'train.fasta', 'fasta'))

# Print the first sequence to verify
print(train_fasta_list[0])

Let's transform it into a DataFrame

In [None]:
# Extract relevant information from SeqRecord
train_fasta_dict = [{
    'ID': record.id,
    'name': record.name,
    'description': record.description,
    'num_features': len(record.features),
    'sequence': record.seq,
} for record in train_fasta_list]

# Create a DataFrame from the extracted data
train_fasta = pd.DataFrame(train_fasta_dict)

# Display the DataFrame
train_fasta.head()

Checking if `ID`, `name` and `description` have the same information

In [None]:
# Checking for differences between the ID and name columns
diff_id_name = sum(train_fasta['ID'] != train_fasta['name'])

# Checking for differences between the ID and description columns
diff_id_description = sum(train_fasta['ID'] != train_fasta['description'])

print(f"We have a total of {diff_id_name} differences between the ID and name columns.\nWe have a total of {diff_id_description} differences between the ID and description columns.")

Checking if `num_features` has value different from 0

In [None]:
num_features_values = sum(train_fasta['num_features'] != 0)

print(f"We have a total of {num_features_values} sequences with features.")

**Nice!**

We found that `ID`, `name` and `description` columns are the same. We can remove two of them (`name`, `descritpion`). 

In the same way we saw that `num_features` is not very informative since it has only 0, let's remove it as well



In [None]:
train_fasta.drop(columns=['name', 'description', 'num_features'], inplace=True)


train_fasta.head()

### Extracting `train_embeddings.h5`

In [None]:
filename = "train_embeddings.h5"

data_list = []

with h5py.File(training_data_path / filename, "r") as f:
    for dataset_name in f.keys():
        dataset = f[dataset_name][:]
        data_list.append([dataset_name, dataset])

train_embeddings = pd.DataFrame(data_list, columns=["ID", "embeddings"])

train_embeddings.head()

### Extracting `train_protein2ipr.dat`

In [None]:
train_protein2ipr = pd.read_csv(training_data_path / 'train_protein2ipr.dat', sep='\t')

# Rename Protein_ID and aspect columns
train_protein2ipr.columns = ['ID', 'ipr', 'domain', 'familyID', 'start', 'end']

# Display the first few rows of the train set
train_protein2ipr.head()

In [None]:
# Group by 'ID' and aggregate other columns into lists
train_protein2ipr_grouped = train_protein2ipr.groupby('ID').agg(lambda x: tuple(x)).reset_index()

print(f"Train protein2ipr ({train_protein2ipr.shape}):")
train_protein2ipr_grouped.head()

Still have to understand what *tizio, caio, sempronio* are.

### Extracting `go-basic.obo`

In [None]:
import re

file_path = "../data/train/go-basic.obo"  # Replace with your file path

# Step 1: Initialize storage for GO terms
go_terms = []

# Step 2: Parse the .obo file
with open(file_path, 'r') as file:
    current_term = {}
    for line in file:
        line = line.strip()
        
        # Start of a new term
        if line == "[Term]":
            if current_term:  # Save the previous term
                go_terms.append(current_term)
            current_term = {}  # Start a new term
            
        elif line.startswith("id:"):
            current_term['ID'] = line.split("id: ")[1]
            
        elif line.startswith("alt_id:"):
            alt_id = line.split("alt_id: ")[1]
            current_term.setdefault('alt_ids', []).append(alt_id)
            
        elif line.startswith("name:"):
            current_term['name'] = line.split("name: ")[1]
            
        elif line.startswith("namespace:"):
            current_term['namespace'] = line.split("namespace: ")[1]
            
        elif line.startswith("is_a"):
            match = re.search(r"GO:\d+", line)  # Search for GO ID
            if match:  # Check if a match was found
                is_a_id = match.group()
                current_term.setdefault('is_a', []).append(is_a_id)
                
        elif line.startswith("relationship: part_of"):
            match = re.search(r"GO:\d+", line)  # Search for GO ID
            if match:  # Check if a match was found
                part_of_id = match.group()
                current_term.setdefault('part_of', []).append(part_of_id)

            
    # Add the last term
    if current_term:
        go_terms.append(current_term)

# Step 3: Create a unified list of all IDs (primary and alt_ids)
expanded_terms = []
for term in go_terms:
    primary_id = term['ID']
    alt_ids = term.get('alt_ids', [])
    all_ids = [primary_id] + alt_ids
    
    for term_id in all_ids:
        expanded_terms.append({
            'ID': term_id,
            'name': term.get('name'),
            'namespace': term.get('namespace'),
            'is_a': term.get('is_a', []),
            'part_of': term.get('part_of', [])
        })

# Step 4: Convert to a DataFrame
df = pd.DataFrame(expanded_terms)

# Step 5: Filter by namespace and explode relationships
df_is_a = df.explode('is_a').dropna(subset=['is_a'])
df_part_of = df.explode('part_of').dropna(subset=['part_of'])


print("All IDs with namespaces:")
df.head()

In [None]:
print("'is_a' relationships:")
df_is_a.head()

In [None]:
print("'part_of' relationships:")
df_part_of.head()

## Merging all previous extractions

Let us collect the dataframes and check lengths. Noting that:
- _train_set.tsv_ contains the proteins, their GO annotations and their corresponding aspects which will be used to subdivde it into three separate datasets
- _train.fasta_ and _train_embeddings.h5_ can be grouped together since they refers to the whole input
- _train_protein2ipr.dat_
- _go-basic.obo_

In [None]:
def group_and_combine(df, sub_ontology_value):
    return df[df['sub_ontology'] == sub_ontology_value].groupby('ID')['GO_term'].apply(tuple).reset_index()

# Create three dataframes for each sub_ontology value
df_CC = group_and_combine(train_set, 'cellular_component')
df_MF = group_and_combine(train_set, 'molecular_function')
df_BP = group_and_combine(train_set, 'biological_process')

print(f"Shapes: CC {df_CC.shape}, MF {df_MF.shape}, BP {df_BP.shape}")

print(f"Cellular Component ({df_CC.shape[0]}):")
df_CC.head()

In [None]:
combined_train = pd.merge(train_embeddings, train_fasta, on='ID')
combined_train = pd.merge(combined_train, train_protein2ipr_grouped, on='ID', how='left')

missing_rows = combined_train[combined_train['ipr'].isna()].shape[0]
print(f"Number of rows missing from train_protein2ipr_grouped: {missing_rows}")

# combined_train = combined_train.drop('domain', axis=1)

print(f"Combined DataFrame shape: {combined_train.shape}")
combined_train.head()

In [None]:
# Merge combined_train with df_ab
df_CC_full= pd.merge(combined_train, df_CC, on='ID', how='right')
df_MF_full= pd.merge(combined_train, df_MF, on='ID', how='right')
df_BP_full= pd.merge(combined_train, df_BP, on='ID', how='right')

print(f"Shapes: CC {df_CC_full.shape}, MF {df_MF_full.shape}, BP {df_BP_full.shape}")

print(f"Full df CC {df_CC_full.shape}:")
df_CC_full.head()

In [None]:
x_df_CC = df_CC_full.iloc[:, :-1]
y_df_BP = df_CC_full.iloc[:, -1]

x_df_MF = df_MF_full.iloc[:, :-1]
y_df_MF = df_MF_full.iloc[:, -1]

X_df_BP = df_BP_full.iloc[:, :-1]
y_df_BP = df_BP_full.iloc[:, -1]

print(f"Shapes: x_df_CC {x_df_CC.shape}, y_df_CC {y_df_BP.shape}, x_df_MF {x_df_MF.shape}, y_df_MF {y_df_MF.shape}, x_df_BP {X_df_BP.shape}, y_df_BP {y_df_BP.shape}")

print("CC Input:")
x_df_CC.head()

In [None]:
print("CC Target:")
y_df_BP.head()

# Test set

In this section we are **creating the Test set**, fusing togheter the data from:
- _test_ids.txt_
- _test.fasta_
- _test_embeddings.h5_
- _test_protein2ipr.dat_
- _blast_test_results.tsv_

### Extracting `test_ids.txt`

In [None]:
# Extracting test_ids.txt
with open(test_data_path / 'test_ids.txt', 'r') as file:
    test_ids = file.read().splitlines()

# Display the first few IDs to verify
print(test_ids[:5])

### Extracting `test.fasta`

In [None]:
test_fasta_list = list(SeqIO.parse(test_data_path / 'test.fasta', 'fasta'))

# Print the first sequence to verify
print(test_fasta_list[0])

In [None]:
# Extract relevant information from SeqRecord
test_fasta_dict = [{
    'ID': record.id,
    'name': record.name,
    'description': record.description,
    'num_features': len(record.features),
    'sequence': record.seq,
} for record in test_fasta_list]

# Create a DataFrame from the extracted data
test_fasta = pd.DataFrame(test_fasta_dict)

# Display the DataFrame
test_fasta.head()

Let's check if ID, name and description are the same thing, as well as seeing if num_features has relevant information.

In [None]:
# Checking for differences between the ID and name columns
diff_id_name = sum(test_fasta['ID'] != test_fasta['name'])

# Checking for differences between the ID and description columns
diff_id_description = sum(test_fasta['ID'] != test_fasta['description'])

print(f"We have a total of {diff_id_name} differences between the ID and name columns.\nWe have a total of {diff_id_description} differences between the ID and description columns.")

In [None]:
num_features_values = sum(test_fasta['num_features'] != 0)

print(f"We have a total of {num_features_values} sequences with features.")

Let's drop the useless columns, as before

In [None]:
test_fasta.drop(columns=['name', 'description', 'num_features'], inplace=True)


test_fasta.head()

Let's also check if the IDs are the same as the ones in the txt file

In [None]:
len_ID = len(test_fasta['ID'].unique()) # assigned because gave problem on else statement print

if len(test_ids) == len_ID:
    print(f"The number of IDs in train_ids.txt is equal to the number of unique IDs in the train set ({len(test_ids)}).\n"
          "Proceeding with the analysis.")
else:
    print(f'The numbers are not the same: test_ids are {len(test_ids)}, while the length of the fasta file is {len_ID})')

### Extracting `test_embeddings.h5`

In [None]:
data_list = []

with h5py.File(test_data_path / "test_embeddings.h5", "r") as f:
    for dataset_name in f.keys():
        dataset = f[dataset_name][:]
        data_list.append([dataset_name, dataset])

test_embeddings = pd.DataFrame(data_list, columns=["ID", "embeddings"])

test_embeddings.head()

### Extracting `test_protein2ipr.dat`

In [None]:
test_protein2ipr = pd.read_csv(test_data_path / 'test_protein2ipr.dat', sep='\t')

# Rename Protein_ID and aspect columns
test_protein2ipr.columns = ['ID', 'ipr', 'domain', 'familyID', 'start', 'end']

# Remove 'domain' that is useless
test_protein2ipr.drop('domain', axis=1)

test_protein2ipr.head()

In [None]:
# Group by 'ID' and aggregate other columns into lists
test_protein2ipr_grouped = test_protein2ipr.groupby('ID').agg(lambda x: tuple(x)).reset_index()

print(f"Test protein2ipr ({test_protein2ipr.shape}):")
test_protein2ipr_grouped.head()

### Merging

In [None]:
combined_test = pd.merge(test_embeddings, test_fasta, on='ID')
combined_test = pd.merge(combined_test, test_protein2ipr_grouped, on='ID', how='left')

missing_rows = combined_test[combined_test['ipr'].isna()].shape[0]
print(f"Number of rows missing from train_protein2ipr_grouped: {missing_rows}")

print(f"Combined DataFrame shape: {combined_test.shape}")
combined_test.head()

In [None]:
# Check if combined_train and combined_test have the same columns
print("Combined_train and combined_test have the same columns:" , set(combined_train.columns) == set(combined_test.columns))

# PCA for embeddings

We'll see that considering each df separately doesn't change much.

In [None]:
def pca_col(df, name, column_name='embeddings', variance_threshold=0.90):
    embeddings = np.array(df[column_name].tolist())
    
    scaler = StandardScaler()
    embeddings_standardized = scaler.fit_transform(embeddings)
    
    pca = PCA()
    pca.fit(embeddings_standardized)
    explained_variance = pca.explained_variance_ratio_

    # Components to retain (in our case 90% of variance)
    n_components = np.argmax(np.cumsum(explained_variance) >= variance_threshold) + 1
        
    plt.figure(figsize=(8, 4.5))
    plt.axhline(y=variance_threshold, color='r', linestyle='--')
    x_value = np.argmax(np.cumsum(explained_variance) >= variance_threshold) + 1
    plt.axvline(x=x_value, color='g', linestyle='--')
    plt.text(x_value + 10, 0.025, f'{x_value}', color='green', size='large', weight='bold')
    plt.text(- 50, variance_threshold + 0.025, f'{variance_threshold:.2f}', color='red', size='large', weight='bold')

    plt.plot(np.cumsum(explained_variance), marker='o')
    plt.title(f'Cumulative Explained Variance by PCA Components for {name}')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid(True)
    plt.show()
    
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings_standardized)
    
    # Integrate reduced embeddings into the DataFrame
    df[f'reduced_{column_name}'] = reduced_embeddings.tolist()
    return df

# Apply PCA and add column for each dataset
X_df_BP = pca_col(X_df_BP, 'BP')
#x_df_CC = pca_col(x_df_CC, 'CC')
#x_df_MF = pca_col(x_df_MF, 'MF')

##### Save the datasets

In [35]:
# Ensure the directory exists
datasets_path = Path('../data/datasets')
datasets_path.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file
X_df_BP.to_csv(datasets_path / 'X_df_BP.csv', index=False)
y_df_BP.to_csv(datasets_path / 'y_df_BP.csv', index=False)

# MODELS

In [1]:
import time
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


from pathlib import Path
import os
import ast
import h5py

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
from torchsummary import summary
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from BP import load_dataset



In [None]:
sub_ontology = 'biological_process'

# Load data
training_data_path = Path('../data/train')
test_data_path = Path('../data/test')
baseline_data_path = Path('../data/baseline')
datasets_path = Path('../data/datasets')

n_test = '1'

X_path = datasets_path / f'X_df_BP{n_test}.csv'
y_path = datasets_path / f'y_df_BP{n_test}.csv'
X_test_path = datasets_path / f'X_test_df_BP{n_test}.csv'
y_test_path = datasets_path / f'y_test_df_BP{n_test}.csv'


if X_path.exists() and y_path.exists() and X_test_path.exists() and y_test_path.exists():

    columns_to_convert = ['embeddings', 'reduced_embeddings']
    
    X_df_BP = pd.read_csv(X_path)
    # for column in columns_to_convert:
    #     X_df_BP[column] = X_df_BP[column].apply(ast.literal_eval)
    
    y_df_BP = pd.read_csv(y_path)
    y_df_BP = y_df_BP.squeeze()
    y_df_BP = y_df_BP.apply(ast.literal_eval)

    
    X_test_df_BP = pd.read_csv(X_test_path)
    y_test_df_BP = pd.read_csv(y_test_path)

else:

    X_df_BP, y_df_BP, X_test_df_BP, y_test_df_BP = load_dataset(training_data_path, test_data_path, sub_ontology)
    X_df_BP.to_csv(X_path, index=False)
    y_df_BP.to_csv(y_path, index=False)
    X_test_df_BP.to_csv(X_test_path, index=False)
    y_test_df_BP.to_csv(y_test_path, index=False)


# Extract embeddings
train_embeddings_path = training_data_path / 'train_embeddings.h5'

data_list = []

with h5py.File(train_embeddings_path, "r") as f:
    for dataset_name in f.keys():
        dataset = f[dataset_name][:]
        data_list.append([dataset_name, dataset])

train_embeddings = pd.DataFrame(data_list, columns=["ID", "embeddings"])

X_df_BP = pd.merge(train_embeddings, X_df_BP["ID"], on='ID', how='right')


X = X_df_BP['embeddings']  # Extract embeddings
X = pd.DataFrame(X.tolist())  # Convert list of lists to DataFrame

# Preprocess target (MultiLabelBinarizer for multi-label classification)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_df_BP)

print(f"X shape: {X.shape} \ny shape: {y.shape}\n")

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X shape: (83064, 1024) 
y shape: (83064, 1487)

y_df_BP: 0    (GO:0072524, GO:0006796, GO:1901361, GO:000905...
1    (GO:0009891, GO:0019222, GO:0010556, GO:000989...
2    (GO:0030336, GO:0050920, GO:0048523, GO:000996...
3    (GO:1903530, GO:0051051, GO:0046883, GO:004852...
4    (GO:0008150, GO:2000027, GO:0050789, GO:002260...
Name: GO_term, dtype: object 
y.head: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

type(y_df_BP): <class 'pandas.core.series.Series'> 
type(y): <class 'numpy.ndarray'>


In [19]:
# Convert NumPy array to PyTorch tensor
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

In [20]:
X_prova = X_train_tensor[:5000]
y_prova = y_train_tensor[:5000]

X_prova_test = X_test_tensor[:5000]
y_prova_test = y_test_tensor[:5000]

X_train_tensor = X_prova
y_train_tensor = y_prova.to(device)
X_test_tensor = X_prova_test
y_test_tensor = y_prova_test.to(device)

## Models

In [21]:
class NN(nn.Module):
    def __init__(self, input_size, hidden_sizes, dropouts, output_size):
        super(NN, self).__init__()
        
        layers = []
        in_size = input_size
        
        for i, hidden_size in enumerate(hidden_sizes):
            layers.append(nn.Linear(in_size, hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropouts[i]))
            in_size = hidden_size
        
        layers.append(nn.Linear(in_size, output_size))
        layers.append(nn.Sigmoid())  # For multilabel classification
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

##### Creating/Extracting the models DataFrame

In [22]:
# Check if the file exists and create it if it doesn't

filename_df = f'NN_BP{n_test}.csv'
filename_probs = f'BP_prob{n_test}.csv'
filename_results = f'BP_results{n_test}.csv'
filename_params = f'BP_params{n_test}.csv'

nn_df_path = datasets_path / filename_df

if nn_df_path.exists():
    
    # If path exists, load the DataFrame
    NN_BP = pd.read_csv(nn_df_path)
    for c_list, c_int in zip(['hidden_sizes', 'dropouts'], ['batch_size', 'epochs']):
        NN_BP[c_list] = NN_BP[c_list].apply(ast.literal_eval)
        NN_BP[c_int] = NN_BP[c_int].astype(int)
    
    NN_BP['lr'] = NN_BP['lr'].astype(float)

else:
    if not datasets_path.exists():
        datasets_path.mkdir(parents=True, exist_ok=True)
    
    NN_BP = {
        'name': [],
        'hidden_sizes': [],
        'dropouts': [],
        'lr': [],
        'batch_size': [],
        'epochs': [],
        'macro_P': [],
        'macro_R': [],
        'macro_F1': [],
        'weighted_P': [],
        'weighted_R': [],
        'weighted_F1': [],
        'samples_P': [],
        'samples_R': [],
        'samples_F1': [],
        'training_time': [],
    }
    NN_BP = pd.DataFrame(NN_BP)


NN_BP.head()

Unnamed: 0,name,hidden_sizes,dropouts,lr,batch_size,epochs,macro_P,macro_R,macro_F1,weighted_P,weighted_R,weighted_F1,samples_P,samples_R,samples_F1,training_time
0,NN1,"[4096, 2048]","[0.1, 0.1]",0.001,1024,25,0.007,0.004,0.004,0.122,0.095,0.094,0.748,0.144,0.218,0.266525
1,NN2,"[4096, 2048]","[0.2, 0.2]",0.003,2048,25,0.006,0.004,0.005,0.122,0.101,0.103,0.744,0.151,0.226,0.244709
2,NN3,"[4096, 2048]","[0.4, 0.4]",0.005,1024,25,0.032,0.009,0.012,0.236,0.124,0.138,0.738,0.174,0.249,0.25916


##### Filling NN_BP with model parameters

In [23]:
def insert_parameters(df, names, hidden_sizes_list, dropouts_list, lrs, batch_sizes, epochs_list):
    for name, hidden_sizes, dropouts, lr, batch_size, epochs in zip(names, hidden_sizes_list, dropouts_list, lrs, batch_sizes, epochs_list):
        exists = name in df['name'].values
        
        if not exists:
            new_row = {
                'name': name,
                'hidden_sizes': [hidden_sizes],
                'dropouts': [dropouts],
                'lr': lr,
                'batch_size': int(batch_size),
                'epochs': int(epochs),
            }
            new_row_df = pd.DataFrame(new_row)
            df = pd.concat([df, new_row_df], ignore_index=True)
        else:
            print(f"{name} already exists in the DataFrame.")
    
    return df

In [24]:
def create_parameters(df, hidden_sizes, dropouts, lrs, batch_sizes, epochs):
    for hidden_size in hidden_sizes:
        
        
        
        new_row = {
             'hidden_sizes': [hidden_size],
             'dropouts': [dropout],
             'lr': lr,
             'batch_size': int(batch_size),
             'epochs': int(epoch),
             }
        new_row_df = pd.DataFrame(new_row)
        
        df = pd.concat([df, new_row_df], ignore_index=True)
         


def insert_parameters1(df, hidden_sizes_list, dropouts_list, lr_list, batch_sizes_list, epochs_list):
    
    create_paremeters(df, hidden_sizes_list, dropouts_list, lr_list, batch_sizes_list, epochs_list)
    
    
    for hidden_sizes, dropouts, lr, batch_size, epochs in zip(hidden_sizes_list, dropouts_list, lrs, batch_sizes, epochs_list):
        exists = name in df['name'].values
        
        if not exists:
            new_row = {
                'hidden_sizes': [hidden_sizes],
                'dropouts': [dropouts],
                'lr': lr,
                'batch_size': int(batch_size),
                'epochs': int(epochs),
            }
            new_row_df = pd.DataFrame(new_row)
            df = pd.concat([df, new_row_df], ignore_index=True)
        else:
            print(f"{name} already exists in the DataFrame.")
    
    return df

In [25]:
# hidden_sizes = [[4096, 2048],   #NN1
#                 [4096, 2048],   #NN2
#                 [4096, 2048],   #NN3
#                 [2048, 1024],   #NN4
#                 [2048, 1024],   #NN5
#                 [2048, 1024],   #NN6
#                 [4096, 2048, 1024],   #NN7
#                 [4096, 2048, 2048],   #NN8
#                 [2048, 2048, 2048],   #NN9
#                 [2048, 4096, 2048],   #NN10
#                 [4096, 4096, 2048],   #NN11        
#                 [4096, 4096, 2048]]   #NN12    


# dropouts = [0.1, 0.2, 0.3]


# lrs = [0.1, 0.01, 0.001, 0.03, 0.003, 0.05, 0.005]


# batch_sizes = [512, 1024, 2048]

# epochs = [75, 100, 150]

names = ['NN1', 'NN2', 'NN3', 'NN4', 'NN5', 'NN6', 'NN7', 'NN8', 'NN9', 'NN10']


hidden_sizes = [[4096, 2048],   #NN1
                [4096, 2048],   #NN2
                [4096, 2048],   #NN3
                [2048, 1024],   #NN4
                [2048, 1024],   #NN5
                [2048, 1024],   #NN6
                [4096, 2048, 1024],   #NN7
                [4096, 2048, 2048],   #NN8
                [2048, 2048, 2048],   #NN9
                [2048, 4096, 2048],   #NN10
                [4096, 4096, 2048],   #NN11        
                [4096, 4096, 2048]]   #NN12    


dropouts = [[0.1, 0.1],   #NN1 
            [0.2, 0.2],   #NN2
            [0.4, 0.4]]#,   #NN3
       #      [0.1, 0.1],   #NN4
       #      [0.2, 0.2],   #NN5
       #      [0.4, 0.4],   #NN6
       #      [0.1, 0.2, 0.3],  #NN7
       #      [0.2, 0.2, 0.2],  #NN8
       #      [0.2, 0.2, 0.2],  #NN9
       #      [0.3, 0.3, 0.3],  #NN10
       #      [0.1, 0.2, 0.3],  #NN11        
       #      [0.1, 0.2, 0.3]]  #NN12 


lrs = [0.001, #NN1 
       0.003, #NN2
       0.005]#, #NN3
       # 0.001, #NN4
       # 0.003, #NN5
       # 0.005, #NN6
       # 0.005, #NN7
       # 0.003, #NN8
       # 0.003, #NN9
       # 0.005, #NN10
       # 0.003, #NN11        
       # 0.001] #NN12 


batch_sizes = [1024, #NN1
               2048, #NN2
               1024]#, #NN3
              #  2048, #NN4
              #  1024, #NN5
              #  1024, #NN6
              #  2048, #NN7
              #  1024, #NN8
              #  1024, #NN9
              #  2048, #NN10
              #  2048, #N11        
              #  1024] #N12 


# epochs = [120,  #NN1 
#           75,  #NN2
#           100, #NN3
#           75,  #NN4
#           75,  #NN5
#           100, #NN6
#           75,  #NN7
#           75,  #NN8
#           75,  #NN9
#           75,  #NN10
#           75,  #NN11
#           75]  #NN12

epochs = [25,  #NN1 
          25,  #NN2
          25]#, #NN3
       #    25,  #NN4
       #    25,  #NN5
       #    25, #NN6
       #    25,  #NN7
       #    25,  #NN8
       #    25,  #NN9
       #    25,  #NN10
       #    25,  #NN11
       #    25]  #NN12



NN_BP = insert_parameters(df = NN_BP,
                          names=names, 
                          hidden_sizes_list=hidden_sizes, 
                          dropouts_list=dropouts, 
                          lrs=lrs, 
                          batch_sizes=batch_sizes, 
                          epochs_list=epochs)

NN_BP.head()

NN1 already exists in the DataFrame.
NN2 already exists in the DataFrame.
NN3 already exists in the DataFrame.


Unnamed: 0,name,hidden_sizes,dropouts,lr,batch_size,epochs,macro_P,macro_R,macro_F1,weighted_P,weighted_R,weighted_F1,samples_P,samples_R,samples_F1,training_time
0,NN1,"[4096, 2048]","[0.1, 0.1]",0.001,1024,25,0.007,0.004,0.004,0.122,0.095,0.094,0.748,0.144,0.218,0.266525
1,NN2,"[4096, 2048]","[0.2, 0.2]",0.003,2048,25,0.006,0.004,0.005,0.122,0.101,0.103,0.744,0.151,0.226,0.244709
2,NN3,"[4096, 2048]","[0.4, 0.4]",0.005,1024,25,0.032,0.009,0.012,0.236,0.124,0.138,0.738,0.174,0.249,0.25916


##### Training the models

In [26]:
def create_model(df, name, input_size, output_size):

    hidd_s = df[df['name'] == name]['hidden_sizes'].tolist()
    dropouts = df[df['name'] == name]['dropouts'].tolist()
    # Initialize the model
    model = NN(
        input_size = input_size,
        hidden_sizes = hidd_s,
        dropouts = dropouts, 
        output_size = output_size
        ).to(device)
    
    # Loss function and optimizer
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel
    optimizer = optim.Adam(model.parameters(), lr=df[df['name'] == name]['lr'])

    return model, optimizer, criterion

In [27]:
def train_model(model, name, X_train, y_train, optimizer, criterion, batch_size, epochs):
    # Determine the device (model should already be on this device)
    device = next(model.parameters()).device

    print(f"\nTraining {name} model:")

    # Move training data to the device
    X_train = X_train.to(device)
    y_train = y_train.to(device)

    model.train()
    start_time = time.time()  # Record the start time

    for epoch in range(epochs):
        for i in range(0, len(X_train), batch_size):
            # Get the current batch
            X_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        
        t = time.time() - start_time

        if epoch == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f} - time: {(t/60):.2f}min")
        elif epoch % 10 == 0:
            print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f} - time: {(t/60):.2f}min")

    end_time = time.time()  # Record the end time

    training_time = end_time - start_time  # Calculate the elapsed time
    
    return training_time / 60

In [28]:
def model_evaluation(df, name, model, X, y, target_names, training_time):

    model.eval()
    with torch.no_grad():
        y_pred = model(X)
        probabilities = torch.sigmoid(y_pred)  # Assuming multilabel classification with sigmoid activation
        prob = probabilities[probabilities > 0.20].float()
        
        y_pred = (y_pred.cpu().numpy() > 0.5).astype(int)

    exists = any(df[df['name'] == name])
    
    if exists:
        
        report = classification_report(y.cpu().numpy(), y_pred, target_names=target_names, output_dict=True)
        # Convert the classification report to a DataFrame
        report = pd.DataFrame(report).transpose().drop(columns=['support'])
        
        report.loc['macro avg'] = report.loc['macro avg'].apply(lambda x: round(x, 3))
        report.loc['weighted avg'] = report.loc['weighted avg'].apply(lambda x: round(x, 3))
        report.loc['samples avg'] = report.loc['samples avg'].apply(lambda x: round(x, 3))

        # Update the DataFrame with the classification report information
        df.loc[df['name'] == name, 'macro_P'] = report.loc['macro avg']['precision']
        df.loc[df['name'] == name, 'macro_R'] = report.loc['macro avg']['recall']
        df.loc[df['name'] == name, 'macro_F1'] = report.loc['macro avg']['f1-score']

        df.loc[df['name'] == name, 'weighted_P'] = report.loc['weighted avg']['precision']
        df.loc[df['name'] == name, 'weighted_R'] = report.loc['weighted avg']['recall']
        df.loc[df['name'] == name, 'weighted_F1'] = report.loc['weighted avg']['f1-score']
        
        df.loc[df['name'] == name, 'samples_P'] = report.loc['samples avg']['precision']
        df.loc[df['name'] == name, 'samples_R'] = report.loc['samples avg']['recall']
        df.loc[df['name'] == name, 'samples_F1'] = report.loc['samples avg']['f1-score']

        df.loc[df['name'] == name, 'training_time'] = training_time

        return df, prob
    else:
        print(f"{name} already exists in the DataFrame.")
        return df, None, prob

In [29]:
target_names = [str(cls) for cls in mlb.classes_]

probs_path = datasets_path / filename_probs


if probs_path.exists():
    probs = pd.read_csv(datasets_path / filename_probs)
else:
    probs = {}

for name in NN_BP['name']:

    # Define model parameters
    input_size = X_train_tensor.shape[1] 
    hidden_sizes = NN_BP[NN_BP['name'] == name]['hidden_sizes'].values[0]
    dropouts = NN_BP[NN_BP['name'] == name]['dropouts'].values[0]
    output_size = y_train_tensor.shape[1]
    lr = float(NN_BP[NN_BP['name'] == name]['lr'].values[0])
    batch_size = int(NN_BP[NN_BP['name'] == name]['batch_size'].values[0])
    epochs = int(NN_BP[NN_BP['name'] == name]['epochs'].values[0])

    # Initialize the model
    model = NN(
        input_size = input_size,
        hidden_sizes = hidden_sizes,
        dropouts = dropouts, 
        output_size = output_size
        ).to(device)

    # Loss function and optimizer
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel
    optimizer = optim.Adam(model.parameters(), lr=lr)


    # model, optimizer, criterion = create_model(NN_BP, name, X_train_tensor.shape[1] , y_train_tensor.shape[1])
    if np.isnan(NN_BP[NN_BP['name'] == name]['training_time'].values[0]):
        # Train the model
        training_time = train_model(model, name, X_train_tensor, y_train_tensor, optimizer, criterion, batch_size, epochs)

        # Ensure X_test_tensor is on the same device as the model
        device = next(model.parameters()).device
        X_test_tensor = X_test_tensor.to(device)

        # Evaluation
        
        NN_BP, probs[name] = model_evaluation(NN_BP, name, model, X_test_tensor, y_test_tensor, target_names, training_time)

    # Save the DataFrame to a CSV file
    NN_BP.to_csv(nn_df_path, index=False)
    

# probs = pd.DataFrame(probs)




        
NN_BP.head()

Unnamed: 0,name,hidden_sizes,dropouts,lr,batch_size,epochs,macro_P,macro_R,macro_F1,weighted_P,weighted_R,weighted_F1,samples_P,samples_R,samples_F1,training_time
0,NN1,"[4096, 2048]","[0.1, 0.1]",0.001,1024,25,0.007,0.004,0.004,0.122,0.095,0.094,0.748,0.144,0.218,0.266525
1,NN2,"[4096, 2048]","[0.2, 0.2]",0.003,2048,25,0.006,0.004,0.005,0.122,0.101,0.103,0.744,0.151,0.226,0.244709
2,NN3,"[4096, 2048]","[0.4, 0.4]",0.005,1024,25,0.032,0.009,0.012,0.236,0.124,0.138,0.738,0.174,0.249,0.25916


##### Extracting results

In [30]:
probs1 = pd.DataFrame(probs)
probs1.shape

(0, 0)

In [31]:
metrics = ['weighted_F1', 'weighted_P', 'weighted_R', 
           'macro_F1', 'macro_P', 'macro_R',  
           'samples_F1', 'samples_P', 'samples_R']



BP_results = NN_BP.sort_values(by=metrics, ascending=False)[['weighted_F1', 'weighted_P', 'weighted_R', 
           'macro_F1', 'macro_P', 'macro_R',  
           'samples_F1', 'samples_P', 'samples_R']][:5]

BP_params = NN_BP.sort_values(by=metrics, ascending=False)[['name', 'hidden_sizes', 'dropouts', 'lr', 
                                                'batch_size', 'epochs', 'training_time']][:5]

BP_params.head(), BP_results.head()

(  name  hidden_sizes    dropouts     lr  batch_size  epochs  training_time
 2  NN3  [4096, 2048]  [0.4, 0.4]  0.005        1024      25       0.259160
 1  NN2  [4096, 2048]  [0.2, 0.2]  0.003        2048      25       0.244709
 0  NN1  [4096, 2048]  [0.1, 0.1]  0.001        1024      25       0.266525,
    weighted_F1  weighted_P  weighted_R  macro_F1  macro_P  macro_R  \
 2        0.138       0.236       0.124     0.012    0.032    0.009   
 1        0.103       0.122       0.101     0.005    0.006    0.004   
 0        0.094       0.122       0.095     0.004    0.007    0.004   
 
    samples_F1  samples_P  samples_R  
 2       0.249      0.738      0.174  
 1       0.226      0.744      0.151  
 0       0.218      0.748      0.144  )

In [32]:
row_names = [f'NN{i}' for i in range(1, BP_results.shape[0] + 1)]

# Add the row means to the DataFrame
BP_results['name'] = row_names
BP_params['name'] = row_names

# Save the DataFrame to a CSV file
BP_results.to_csv(datasets_path / filename_results, index=False)
BP_params.to_csv(datasets_path / filename_params, index=False)

# Print the updated DataFrame
BP_results.head(), BP_params.head()

(   weighted_F1  weighted_P  weighted_R  macro_F1  macro_P  macro_R  \
 2        0.138       0.236       0.124     0.012    0.032    0.009   
 1        0.103       0.122       0.101     0.005    0.006    0.004   
 0        0.094       0.122       0.095     0.004    0.007    0.004   
 
    samples_F1  samples_P  samples_R name  
 2       0.249      0.738      0.174  NN1  
 1       0.226      0.744      0.151  NN2  
 0       0.218      0.748      0.144  NN3  ,
   name  hidden_sizes    dropouts     lr  batch_size  epochs  training_time
 2  NN1  [4096, 2048]  [0.4, 0.4]  0.005        1024      25       0.259160
 1  NN2  [4096, 2048]  [0.2, 0.2]  0.003        2048      25       0.244709
 0  NN3  [4096, 2048]  [0.1, 0.1]  0.001        1024      25       0.266525)

##### LaTex tables

In [33]:
def generate_latex_table(df):
    metrics = ['macro_P', 'macro_R', 'macro_F1', 'weighted_P', 'weighted_R', 'weighted_F1', 'samples_P', 'samples_R', 'samples_F1']
    metric_names = ['Macro P', 'Macro R', 'Macro F1', 'Weighted P', 'Weighted R', 'Weighted F1', 'Samples P', 'Samples R', 'Samples F1']
    
    # Initialize the LaTeX table
    latex_table = "\\begin{tabular}{l" + "c" * len(df['name']) + "}\n"
    latex_table += "    \\toprule\n"
    latex_table += "    \\textit{Metric} & " + " & ".join([f"\\textit{{{name}}}" for name in df['name']]) + " \\\\\n"
    latex_table += "    \\midrule\n"
    
    for metric, metric_name in zip(metrics, metric_names):
        values = df[metric]
        formatted_values = [f"{value:.2f}" for value in values]
        latex_table += f"    {metric_name} & " + " & ".join(formatted_values) + " \\\\\n"
    
    latex_table += "    \\bottomrule\n"
    latex_table += "\\end{tabular}"
    
    return latex_table


def generate_parameters_latex_table(df):
    # Initialize the LaTeX table
    latex_table = "\\begin{table}[H]\n"
    latex_table += "    \\centering\n"
    latex_table += "    \\begin{tabular}{lcccccc}\n"
    latex_table += "        \\toprule\n"
    latex_table += "        \\textit{Model} & \\textit{Hidden Layer Sizes} & \\textit{Dropout} & \\textit{Learning Rate} & \\textit{Batch Size} & \\textit{Epochs} & \\textit{Regularization} \\\\\n"
    latex_table += "        \\midrule\n"
    
    for index, row in df.iterrows():
        model_name = row['name']
        hidden_sizes = row['hidden_sizes']
        dropouts = row['dropouts']
        lr = row['lr']
        batch_size = row['batch_size']
        epochs = row['epochs']
        regularization = 'None'  # Assuming regularization is not provided in the DataFrame
        
        latex_table += f"        {model_name} & {hidden_sizes} & {dropouts} & {lr} & {batch_size} & {epochs} & {regularization} \\\\\n"
    
    latex_table += "        \\bottomrule\n"
    latex_table += "    \\end{tabular}\n"
    latex_table += "    \\caption{Model Parameters.}\n"
    latex_table += "\\end{table}"
    
    return latex_table

# Generate the LaTeX table
latex_table = generate_latex_table(BP_results)
print(latex_table)

latex_table = generate_parameters_latex_table(BP_params)
print(latex_table)


\begin{tabular}{lccc}
    \toprule
    \textit{Metric} & \textit{NN1} & \textit{NN2} & \textit{NN3} \\
    \midrule
    Macro P & 0.03 & 0.01 & 0.01 \\
    Macro R & 0.01 & 0.00 & 0.00 \\
    Macro F1 & 0.01 & 0.01 & 0.00 \\
    Weighted P & 0.24 & 0.12 & 0.12 \\
    Weighted R & 0.12 & 0.10 & 0.10 \\
    Weighted F1 & 0.14 & 0.10 & 0.09 \\
    Samples P & 0.74 & 0.74 & 0.75 \\
    Samples R & 0.17 & 0.15 & 0.14 \\
    Samples F1 & 0.25 & 0.23 & 0.22 \\
    \bottomrule
\end{tabular}
\begin{table}[H]
    \centering
    \begin{tabular}{lcccccc}
        \toprule
        \textit{Model} & \textit{Hidden Layer Sizes} & \textit{Dropout} & \textit{Learning Rate} & \textit{Batch Size} & \textit{Epochs} & \textit{Regularization} \\
        \midrule
        NN1 & [4096, 2048] & [0.4, 0.4] & 0.005 & 1024 & 25 & None \\
        NN2 & [4096, 2048] & [0.2, 0.2] & 0.003 & 2048 & 25 & None \\
        NN3 & [4096, 2048] & [0.1, 0.1] & 0.001 & 1024 & 25 & None \\
        \bottomrule
    \end{tabular}
   