# Model Training and Evaluation
## 1. DNN model with binary output
---
The notebook provides a machine learning model with binary output which is then used to predict each target class separately. In the first part we will prepare the data (dataset with new features described in detail in the notebook "new_features.ipynb"), while the second part will consist of training and validation of the model.

# Requirements

In [3]:
!pip install imbalanced-learn



In [4]:
import numpy as np
import torch
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB, ComplementNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from timeit import default_timer as timer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, average_precision_score, roc_auc_score


In [6]:
# Select the torch device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Download updated dataset and define required directories

In [7]:
# Mount the user's Google Drive so the project files under drive/MyDrive/ become readable.
from google.colab import drive

drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point)

Mounted at /content/drive/


In [8]:
# Locations of the project artefacts, relative to the working directory of the notebook.
project_root = 'drive/MyDrive/SB_project/'
path_df = project_root + 'df_data.pkl'  # .pkl dataset file (not read in the cells shown here)
path_pdb = project_root + 'pdb_files/'  # Folder with pdb files matching "pdb_id" column of original dataset

In [9]:
# Load the feature table from its CSV export on Drive and display it.
# NOTE(review): the loaded frame contains an "Unnamed: 0" column — presumably the index written
# by an earlier to_csv call; it is kept as a regular column here. Confirm this is intended.
csv_file_path = 'drive/MyDrive/SB_project/df_new.csv'
df = pd.read_csv(csv_file_path)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn,s_chain_length,t_chain_length
0,3m7l,A,123,,R,H,0.032,19.0,10.0,-1.747,...,1.266,-0.912,VDW,6.722287,LEU,VAL,GLU,TRP,422,422
1,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,1.266,-0.912,HBOND,6.391156,LEU,ALA,VAL,ILE,422,422
2,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,1.266,-0.912,VDW,6.391156,LEU,ALA,VAL,ILE,277,277
3,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.397,0.412,HBOND,5.929388,THR,PRO,SER,TYR,422,422
4,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.397,0.412,VDW,5.929388,THR,PRO,SER,TYR,422,422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458551,4a6r,B,400,,I,H,0.101,20.0,13.0,-1.236,...,-0.397,0.412,VDW,9.463818,GLU,GLY,GLU,GLU,422,422
1458552,4a6r,A,382,,F,E,0.000,16.0,23.0,-2.681,...,0.393,0.816,HBOND,5.446033,ALA,THR,HIS,VAL,423,423
1458553,4a6r,A,266,,G,T,0.000,26.0,24.0,1.706,...,1.045,2.064,HBOND,5.035146,PHE,ARG,THR,GLU,423,423
1458554,4a6r,B,303,,G,-,0.083,14.0,19.0,-1.338,...,1.570,-0.146,HBOND,5.847882,VAL,LYS,VAL,GLU,422,422


In [10]:
# Discard every row that contains at least one missing value. The result is rebound to `df`
# instead of using inplace=True, so the frame object produced by read_csv is not mutated.
df = df.dropna()
# Interaction labels of the remaining rows as a pandas categorical.
y = df['Interaction'].astype('category')

# Data preparation: Scaling and Encoding

In [11]:
# Label encoding: transform the data so that all features are numerical
label_encoder = LabelEncoder()

def encode_object_columns(df, encoders=None):
    """Return a copy of ``df`` in which every object-dtype column holds integer codes.

    Each object column is cast to ``str`` and encoded with its own ``LabelEncoder``.
    The frame passed in is not modified; callers use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.
    encoders : dict, optional
        When given, the fitted ``LabelEncoder`` of every encoded column is stored in
        this dict under the column name, so that codes can later be mapped back to
        the original strings with ``inverse_transform``.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``; columns that are not of object dtype are unchanged.
    """
    encoded = df.copy()
    for column in encoded.columns:
        if encoded[column].dtype == 'object':
            # One encoder per column: a single shared encoder is refitted on every
            # column and therefore only remembers the mapping of the last one.
            column_encoder = LabelEncoder()
            encoded[column] = column_encoder.fit_transform(encoded[column].astype(str))
            if encoders is not None:
                encoders[column] = column_encoder
    return encoded

In [12]:
# Replace every object-dtype (string) column of the dataset by integer codes and show the result.
# This includes the 'Interaction' target column, which is numeric from here on.
df = encode_object_columns(df)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn,s_chain_length,t_chain_length
0,1016,0,123,0,14,4,0.032,19.0,10.0,-1.747,...,1.266,-0.912,6,6.722287,11,21,6,19,422,422
1,1016,0,104,0,7,4,0.485,9.0,11.0,-1.124,...,1.266,-0.912,0,6.391156,11,0,20,11,422,422
2,1016,0,104,0,7,4,0.485,9.0,11.0,-1.124,...,1.266,-0.912,6,6.391156,11,0,20,11,277,277
3,1016,0,139,0,6,0,0.049,20.0,18.0,-2.085,...,-0.397,0.412,0,5.929388,17,16,16,20,422,422
4,1016,0,139,0,6,0,0.049,20.0,18.0,-2.085,...,-0.397,0.412,6,5.929388,17,16,16,20,422,422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458551,1265,1,400,0,7,4,0.101,20.0,13.0,-1.236,...,-0.397,0.412,6,9.463818,6,7,6,7,422,422
1458552,1265,0,382,0,4,2,0.000,16.0,23.0,-2.681,...,0.393,0.816,0,5.446033,0,18,8,21,423,423
1458553,1265,0,266,0,5,7,0.000,26.0,24.0,1.706,...,1.045,2.064,0,5.035146,14,1,17,7,423,423
1458554,1265,1,303,0,5,0,0.083,14.0,19.0,-1.338,...,1.570,-0.146,0,5.847882,20,12,20,7,422,422


# Test dataset "df_mini"
This optional cell builds a reduced dataset, `df_mini`, which can be used for quick testing (its size can be changed as required via `desired_rows`).

In [13]:
import pandas as pd

# Share of each 'Interaction' value in the full dataset
type_proportions = df['Interaction'].value_counts(normalize=True)

# Desired number of rows
desired_rows = 8000

# Number of rows to sample from each type. The conversion to int truncates, so the
# total number of sampled rows can end up slightly below desired_rows.
sample_sizes = (type_proportions * desired_rows).astype(int)

# Sample the rows of each type (same order and seed for every type) and concatenate all
# parts in a single call; growing a DataFrame with pd.concat inside the loop copies the
# accumulated rows on every iteration and starts from an empty, column-less frame.
sampled_parts = [
    df[df['Interaction'] == type_value].sample(n=sample_size, random_state=42)
    for type_value, sample_size in sample_sizes.items()
]
# pd.concat rejects an empty list, so fall back to an empty frame when there is nothing to sample
df_mini = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

# Shuffle the rows and renumber them from 0
df_mini = df_mini.sample(frac=1, random_state=42).reset_index(drop=True)
df_mini


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn,s_chain_length,t_chain_length
0,1673,0,763,0,6,4,0.185,18.0,17.0,-1.043,...,-1.474,-0.078,6,6.459500,1,17,10,13,422,422
1,2334,0,26,0,18,4,0.013,18.0,16.0,-1.118,...,-0.259,-3.242,0,8.021832,0,21,2,3,124,124
2,155,0,323,0,2,4,0.399,10.0,18.0,-0.953,...,0.440,2.897,0,9.067962,17,21,11,13,423,423
3,1009,1,142,0,7,6,0.012,27.0,18.0,-1.826,...,-0.397,0.412,6,9.460128,0,7,20,8,422,422
4,211,0,112,0,9,3,0.000,26.0,23.0,-1.096,...,1.266,-0.912,6,6.009008,15,3,11,3,277,277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7992,2332,1,48,0,13,2,0.146,14.0,13.0,-2.647,...,-0.503,-1.853,6,9.904935,0,10,0,11,224,224
7993,540,7,32,0,7,2,0.012,26.0,10.0,-2.111,...,-0.169,0.933,6,6.519481,1,21,11,17,83,83
7994,2859,1,163,0,9,4,0.000,19.0,25.0,-0.979,...,0.393,0.816,0,5.259478,10,6,6,11,356,356
7995,2528,0,9,0,16,4,0.007,21.0,14.0,-1.089,...,1.242,-1.262,6,4.999138,4,17,20,12,423,423


In [14]:
# Train and Test Splitting
# Columns removed from the feature matrix: the target plus four columns that are not used as features.
# NOTE(review): 'Unnamed: 0' and 'pdb_id' are not in this list and therefore remain among the
# features — confirm this is intended.
excluded_columns = ['Interaction', 't_ss3', "s_ss3", "t_ins", "s_ins"]
X = df.drop(columns=excluded_columns)
y = to_categorical(df['Interaction'], num_classes=7)  # One-hot encode the labels (7 classes)

# Hold out 20% of the rows as test set (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling: standardizes features by removing the mean and scaling to unit variance.
# The scaler is fitted on the training part only and then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Handling imbalance
As a first method of dealing with the imbalance of the dataset, the `compute_class_weight` function is used. It calculates a weight for each class to compensate for the class imbalance.

In [15]:
from sklearn.utils.class_weight import compute_class_weight

# Recover the integer class id of every training sample from its one-hot row
y_train_labels = np.argmax(y_train, axis=1)
# Class ids that actually occur in the training labels (sorted)
present_classes = np.unique(y_train_labels)
class_weights = compute_class_weight('balanced', classes=present_classes, y=y_train_labels)
# Key every weight by its real class id. Keying by position (enumerate) is only correct
# when the present ids are exactly 0..k-1; if a class is missing from the training
# labels, positional keys would assign weights to the wrong classes.
class_weight_dict = {int(class_id): weight for class_id, weight in zip(present_classes, class_weights)}

## Model Definition and Training

The binary classification DNN model is composed of:
* Input Layer: accepts input features with dimensionality corresponding to the number of features in the dataset
* 1st Dense Layer: a fully connected layer with 64 neurons, ReLU activation, followed by Dropout (30%) and Batch Normalization to stabilize and regularize the learning process
* 2nd Dense Layer: another fully connected layer with 32 neurons, ReLU activation, followed by Dropout (30%) for regularization
* 3rd Dense Layer: a fully connected layer with 16 neurons, ReLU activation, followed by Dropout (30%) for further regularization
* Output Layer: a final dense layer with 1 neuron and sigmoid activation, which outputs a probability score for binary classification (One-vs-Rest approach for each class)

Optimization and Loss:
* Optimizer: Adam optimizer with a learning rate of 0.001
* Loss Function: binary cross-entropy loss

Additional Techniques:
* Learning Rate Scheduler reduces learning rate when validation loss plateaus

Early Stopping:
* A callback to stop training early if the validation loss doesn’t improve for 5 consecutive epochs (`patience=5` in the training code below), preventing overfitting and unnecessary computation

K-Fold Cross-Validation:
* This model applies a One-vs-Rest (OvR) approach for multiclass classification using deep neural networks (DNN) and K-Fold Cross-Validation
---
~ Instructions~

As can be seen below the training part of the model is commented out and pre-stored weights are loaded for the tests. This part of code can be used to train the model if desired.

The weight files are located here: "drive/MyDrive/SB_project/model_weights/binary_model_weights/"


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import label_binarize
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
import numpy as np
import os

# Names of the interaction classes and their count.
# NOTE(review): the list is presumably ordered like the encoded label ids (0..6) — confirm.
class_labels = ['HBOND', 'IONIC', 'PICATION', 'PIHBOND', 'PIPISTACK', 'SSBOND', 'VDW']
n_classes = 7  # Number of classes

# Directories for saving checkpoints and logs; both are created when missing
checkpoint_dir = 'drive/MyDrive/SB_project/model_weights/'
log_dir = 'drive/MyDrive/SB_project/model_weights/logs/'
for output_dir in (checkpoint_dir, log_dir):
    os.makedirs(output_dir, exist_ok=True)

# CSVLogger to save training logs (append=True: rows are added to an existing log file)
binary_log_path = os.path.join(log_dir, 'binary_training_log.csv')
csv_logger = CSVLogger(binary_log_path, append=True)

# Best fold tracking: index and accuracy of the best fold seen so far, and where its weights go
best_fold, best_fold_accuracy = -1, 0
best_model_path = os.path.join(checkpoint_dir, 'best_binary_model_fold.weights.h5')

# Define K-Fold cross-validator: 10 shuffled folds with a fixed seed
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Function to create a binary classification model with 3 hidden layers
def create_binary_model(input_dim, learning_rate=0.001, dropout_rate=0.3):
    """Build and compile the fully connected network used for one One-vs-Rest task.

    Architecture: Dense(64, relu) -> Dropout -> BatchNormalization ->
    Dense(32, relu) -> Dropout -> Dense(16, relu) -> Dropout -> Dense(1, sigmoid).
    The model is compiled with Adam, binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer (default 0.001).
    dropout_rate : float, optional
        Rate used by all three Dropout layers (default 0.3).

    Returns
    -------
    Sequential
        The compiled, untrained model.
    """
    # Local import: Input is not part of the notebook's import cells.
    from tensorflow.keras.layers import Input

    model = Sequential([
        # Declare the input shape with an explicit Input object; passing `input_dim`
        # to the first Dense layer is deprecated and makes Keras emit a UserWarning.
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),  # Layer 1 with 64 neurons
        Dropout(dropout_rate),
        BatchNormalization(),
        Dense(32, activation='relu'),  # Layer 2 with 32 neurons
        Dropout(dropout_rate),
        Dense(16, activation='relu'),  # Layer 3 with 16 neurons
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),  # Binary output: one probability per sample
    ])
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Learning rate scheduler: multiply the learning rate by 0.5 when val_loss has not
# improved for 3 epochs, never going below 1e-6
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    verbose=1,
    min_lr=1e-6,
)

# Result accumulators: one overall accuracy per fold, and for every class
# ("Class 0" ... "Class n_classes-1") one list of per-fold accuracies
overall_accuracies = []
fold_accuracies = dict((f"Class {class_idx}", []) for class_idx in range(n_classes))

# # Progress bar for cross-validation with percentages
# with tqdm(total=kf.get_n_splits(), desc="K-Fold Cross Validation", unit="fold") as pbar:
#     for fold_idx, (train_index, val_index) in enumerate(kf.split(X_train)):
#         try:
#             # Prepare train and validation data for the fold
#             X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
#             y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

#             # Binarize labels for each class (One-vs-Rest)
#             y_train_binary = label_binarize(y_train_fold, classes=np.arange(n_classes))
#             y_val_binary = label_binarize(y_val_fold, classes=np.arange(n_classes))

#             # Store predictions from all binary models for combining later
#             val_predictions = np.zeros((X_val_fold.shape[0], n_classes))

#             for class_idx in range(n_classes):
#                 # Binary labels for the current class (OvR)
#                 y_train_class = y_train_binary[:, class_idx]
#                 y_val_class = y_val_binary[:, class_idx]

#                 # Reset the model for each binary classifier
#                 model = create_binary_model(input_dim=X_train_fold.shape[1])

#                 # Early stopping callback
#                 early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

#                 # Model checkpoint strategy: Save the best model for each class
#                 class_model_path = os.path.join(checkpoint_dir, f'best_binary_model_class_{class_idx}_fold_{fold_idx}.weights.h5')
#                 checkpoint_callback = ModelCheckpoint(
#                     filepath=class_model_path,
#                     save_weights_only=True,
#                     monitor='val_loss',
#                     mode='min',
#                     save_best_only=True,
#                     verbose=0
#                 )

#                 # Train the model
#                 class_weights = {0: 1, 1: np.sum(y_train_class == 0) / np.sum(y_train_class == 1)}
#                 model.fit(X_train_fold, y_train_class,
#                           validation_data=(X_val_fold, y_val_class),
#                           epochs=50,
#                           batch_size=32,
#                           class_weight=class_weights,
#                           callbacks=[early_stopping, lr_scheduler, checkpoint_callback, csv_logger],
#                           verbose=0)

#                 # Predict on validation fold for this binary task
#                 val_predictions[:, class_idx] = model.predict(X_val_fold).squeeze()

#                 # Compute balanced accuracy for the current binary classifier
#                 binary_acc = balanced_accuracy_score(y_val_class, (val_predictions[:, class_idx] > 0.5).astype(int))
#                 print(f"Balanced accuracy for Class {class_idx} in fold {fold_idx}: {binary_acc:.4f}")
#                 fold_accuracies[f"Class {class_idx}"].append(binary_acc)

#             # Multiclass decision: take the class with the highest prediction score
#             y_val_pred = np.argmax(val_predictions, axis=1)
#             y_val_true = np.argmax(y_val_binary, axis=1)

#             # Compute overall accuracy for the current fold (multiclass)
#             overall_acc = accuracy_score(y_val_true, y_val_pred)
#             print(f"Overall accuracy for fold {fold_idx}: {overall_acc:.4f}")
#             overall_accuracies.append(overall_acc)

#             # Check if this fold has the best accuracy so far
#             if overall_acc > best_fold_accuracy:
#                 print(f"New best fold: {fold_idx} with accuracy {overall_acc:.4f}")
#                 best_fold_accuracy = overall_acc
#                 best_fold = fold_idx

#                 # Save weights for the best fold
#                 model.save_weights(best_model_path)

#         except Exception as e:
#             print(f"Error encountered in fold {fold_idx}: {e}. Skipping this fold.")

#         # Update progress bar
#         pbar.set_postfix({'Best Accuracy': f"{best_fold_accuracy:.4f}"})
#         pbar.update(1)

# # Compute average balanced accuracy per class
# average_per_class_acc = {f"Class {cls}": np.mean(fold_accuracies[f"Class {cls}"]) for cls in range(n_classes)}
# print(f"Average balanced accuracy per class across all folds: {average_per_class_acc}")

# # Compute average overall accuracy across all folds
# average_overall_accuracy = np.mean(overall_accuracies)
# print(f"Average overall accuracy across all folds: {average_overall_accuracy:.4f}")

# # Create a table (dictionary format) for accuracies
# accuracy_table = {
#     "Overall Accuracy": average_overall_accuracy,
#     "Per-Class Accuracy": average_per_class_acc
# }

# print("\nFinal Accuracy Table:")
# for key, value in accuracy_table.items():
#     print(f"{key}: {value}")

# print(f"\nBest fold was fold {best_fold} with accuracy {best_fold_accuracy:.4f}")

# Model Testing and Performance Metrics

In [24]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report, confusion_matrix

# Per-class results on the test set, filled in the order of class_labels
class_accuracies_test = []
balanced_accuracies_test = []

# Iterate over each class and corresponding binary model
for i, label in enumerate(class_labels):

    # Rebuild the binary architecture and load the stored weights of the current class
    # (the fold index 6 is hard-coded in the file name)
    binary_model = create_binary_model(input_dim=X_train.shape[1])
    binary_model.load_weights(f'drive/MyDrive/SB_project/model_weights/binary_model_weights/best_binary_model_class_{i}_fold_6.weights.h5')

    # The model ends in a single sigmoid unit, so predict() yields one score per row in an
    # (n, 1) array; round to 0/1 and flatten to the 1-D shape the metric functions expect
    y_test_pred = np.round(binary_model.predict(X_test)).ravel()
    y_test_true = y_test[:, i]  # True labels for the current class (column i of the one-hot matrix)

    # Confusion matrix for the current class. labels=[0, 1] guarantees a 2x2 matrix even
    # when only one of the two values occurs, so the unpacking below cannot fail.
    cm_test = confusion_matrix(y_test_true, y_test_pred, labels=[0, 1])

    # Calculate the balanced accuracy score for the current class
    balanced_accuracy = balanced_accuracy_score(y_test_true, y_test_pred) * 100
    balanced_accuracies_test.append(balanced_accuracy)
    print(f"\nBalanced accuracy for class '{label}': {balanced_accuracy:.4f}%")

    # Calculate class-wise accuracy (standard accuracy for the binary case).
    # Row-major order of the 2x2 matrix with labels [0, 1] is TN, FP, FN, TP.
    TN, FP, FN, TP = cm_test.ravel()
    class_accuracy = (TP + TN) / (TP + TN + FP + FN) * 100
    class_accuracies_test.append(class_accuracy)

    print(f"Accuracy for class '{label}': {class_accuracy:.4f}%")

# Create a DataFrame to display the class-wise accuracies (values are percentages)
results_df_test = pd.DataFrame({
    'Class': class_labels,
    'Balanced Accuracy': balanced_accuracies_test,
    'Accuracy': class_accuracies_test
})

# Display the class-wise results in a table
print("\nClass-wise accuracy for the test set:")
print(results_df_test)


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step

Balanced accuracy for class 'HBOND': 65.0514%
Accuracy for class 'HBOND': 66.7124%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step

Balanced accuracy for class 'IONIC': 95.6820%
Accuracy for class 'IONIC': 91.5276%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step

Balanced accuracy for class 'PICATION': 98.4961%
Accuracy for class 'PICATION': 97.0810%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step

Balanced accuracy for class 'PIHBOND': 84.3033%
Accuracy for class 'PIHBOND': 82.8523%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step

Balanced accuracy for class 'PIPISTACK': 99.0102%
Accuracy for class 'PIPISTACK': 98.0604%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step

Balanced accuracy for class 'SSBOND': 99.9272%
Accuracy for class 'SSBOND': 99.8547%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step

Balanced accuracy for class 'VDW': 62.2055%
Accuracy for class 'VDW': 65.5784%

Class-wise accuracy for the test set:
       Class  Balanced Accuracy   Accuracy
0      HBOND          65.051428  66.712374
1      IONIC          95.681959  91.527603
2   PICATION          98.496148  97.081025
3    PIHBOND          84.303275  82.852265
4  PIPISTACK          99.010247  98.060416
5     SSBOND          99.927243  99.854651
6        VDW          62.205460  65.578379
