# Model Training and Evaluation
## 1. Multi-class DNN model
---
This notebook provides a multiclass machine learning model. In the first part we will prepare the data (dataset with new features described in detail in the notebook "new_features.ipynb"), while the second part will consist of training and validation of the model.

# Requirements

In [5]:
!pip install imbalanced-learn



In [6]:
import numpy as np
import torch
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB, ComplementNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from timeit import default_timer as timer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, average_precision_score, roc_auc_score


In [8]:
# Select the PyTorch device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Download updated dataset and define required directories

In [9]:
# Mount Google Drive under /content/drive/ so that the project files referenced
# below (drive/MyDrive/SB_project/...) can be read. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [10]:
# Project locations inside the mounted Drive (relative paths — presumably resolved
# from /content, where the Drive is mounted; verify if the working directory changes).
# NOTE(review): path_df (pickle) is not read in the cells shown here; the CSV file is loaded instead.
path_df = 'drive/MyDrive/SB_project/df_new.pkl'
path_pdb = 'drive/MyDrive/SB_project/pdb_files/' # Folder with pdb files matching "pdb_id" column of original dataset

In [11]:
# Load the dataset with the additional features from CSV and display it.
# NOTE(review): the file carries its saved row index as an "Unnamed: 0" column (see the
# output below). It is not dropped later, so it becomes one of the model features.
# Removing it would change the number of inputs the stored weights were trained with —
# confirm before changing.
csv_file_path = 'drive/MyDrive/SB_project/df_new.csv'
df = pd.read_csv(csv_file_path)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn,s_chain_length,t_chain_length
0,3m7l,A,123,,R,H,0.032,19.0,10.0,-1.747,...,1.266,-0.912,VDW,6.722287,LEU,VAL,GLU,TRP,422,422
1,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,1.266,-0.912,HBOND,6.391156,LEU,ALA,VAL,ILE,422,422
2,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,1.266,-0.912,VDW,6.391156,LEU,ALA,VAL,ILE,277,277
3,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.397,0.412,HBOND,5.929388,THR,PRO,SER,TYR,422,422
4,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.397,0.412,VDW,5.929388,THR,PRO,SER,TYR,422,422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458551,4a6r,B,400,,I,H,0.101,20.0,13.0,-1.236,...,-0.397,0.412,VDW,9.463818,GLU,GLY,GLU,GLU,422,422
1458552,4a6r,A,382,,F,E,0.000,16.0,23.0,-2.681,...,0.393,0.816,HBOND,5.446033,ALA,THR,HIS,VAL,423,423
1458553,4a6r,A,266,,G,T,0.000,26.0,24.0,1.706,...,1.045,2.064,HBOND,5.035146,PHE,ARG,THR,GLU,423,423
1458554,4a6r,B,303,,G,-,0.083,14.0,19.0,-1.338,...,1.570,-0.146,HBOND,5.847882,VAL,LYS,VAL,GLU,422,422


In [12]:
# Drop every row that contains at least one missing value (modifies df in place,
# so the frame displayed by the previous cell is no longer the current one).
df.dropna(inplace=True)
# Interaction type as a categorical Series.
# NOTE(review): y is reassigned in the train/test splitting cell before it is used.
y = df['Interaction'].astype('category')

# Data preparation: Scaling and Encoding

In [13]:
#Label Encoding: the data transormation, s.t. all features are numerical
label_encoder = LabelEncoder()

def encode_object_columns(df):
    label_encoder = LabelEncoder()
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].astype(str)
            df[column] = label_encoder.fit_transform(df[column])
    return df

In [14]:
# Encode all string columns as integers; as the output below shows, this includes
# the 'Interaction' target (e.g. HBOND -> 0, VDW -> 6).
df = encode_object_columns(df)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn,s_chain_length,t_chain_length
0,1016,0,123,0,14,4,0.032,19.0,10.0,-1.747,...,1.266,-0.912,6,6.722287,11,21,6,19,422,422
1,1016,0,104,0,7,4,0.485,9.0,11.0,-1.124,...,1.266,-0.912,0,6.391156,11,0,20,11,422,422
2,1016,0,104,0,7,4,0.485,9.0,11.0,-1.124,...,1.266,-0.912,6,6.391156,11,0,20,11,277,277
3,1016,0,139,0,6,0,0.049,20.0,18.0,-2.085,...,-0.397,0.412,0,5.929388,17,16,16,20,422,422
4,1016,0,139,0,6,0,0.049,20.0,18.0,-2.085,...,-0.397,0.412,6,5.929388,17,16,16,20,422,422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458551,1265,1,400,0,7,4,0.101,20.0,13.0,-1.236,...,-0.397,0.412,6,9.463818,6,7,6,7,422,422
1458552,1265,0,382,0,4,2,0.000,16.0,23.0,-2.681,...,0.393,0.816,0,5.446033,0,18,8,21,423,423
1458553,1265,0,266,0,5,7,0.000,26.0,24.0,1.706,...,1.045,2.064,0,5.035146,14,1,17,7,423,423
1458554,1265,1,303,0,5,0,0.083,14.0,19.0,-1.338,...,1.570,-0.146,0,5.847882,20,12,20,7,422,422


# Test dataset "df_mini"
This optional cell builds a reduced dataset that keeps the class proportions of the full one; it can be used for quick tests (the size can be changed as required).

In [None]:
import pandas as pd

# Relative frequency of each interaction class in the full dataset.
type_proportions = df['Interaction'].value_counts(normalize=True)

# Desired number of rows
desired_rows = 8000

# Number of rows to sample from each class. The conversion to int truncates,
# so the total can end up slightly below desired_rows.
sample_sizes = (type_proportions * desired_rows).astype(int)

# Sample every class separately and concatenate once. Growing df_mini with
# pd.concat inside the loop re-copies all rows collected so far on each
# iteration and makes the result depend on an empty placeholder frame.
sampled_parts = [
    df[df['Interaction'] == type_value].sample(n=sample_size, random_state=42)
    for type_value, sample_size in sample_sizes.items()
]
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
df_mini = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

# Shuffle so the classes are interleaved, then renumber the rows from 0.
df_mini = df_mini.sample(frac=1, random_state=42).reset_index(drop=True)
df_mini


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a2,t_a3,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn
0,1673,0,763,0,6,4,0.185,18.0,17.0,-1.043,...,-0.417,-1.673,-1.474,-0.078,6,6.459500,1,17,10,13
1,2334,0,26,0,18,4,0.013,18.0,16.0,-1.118,...,0.302,-3.656,-0.259,-3.242,0,8.021832,0,21,2,3
2,155,0,323,0,2,4,0.399,10.0,18.0,-0.953,...,-0.055,1.502,0.440,2.897,0,9.067962,17,21,11,13
3,1009,1,142,0,7,6,0.012,27.0,18.0,-1.826,...,-0.590,1.891,-0.397,0.412,6,9.460128,0,7,20,8
4,211,0,112,0,9,3,0.000,26.0,23.0,-1.096,...,-0.987,-1.505,1.266,-0.912,6,6.009008,15,3,11,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7992,2332,1,48,0,13,2,0.146,14.0,13.0,-2.647,...,-0.179,-3.005,-0.503,-1.853,6,9.904935,0,10,0,11
7993,540,7,32,0,7,2,0.012,26.0,10.0,-2.111,...,0.828,1.299,-0.169,0.933,6,6.519481,1,21,11,17
7994,2859,1,163,0,9,4,0.000,19.0,25.0,-0.979,...,-0.547,2.131,0.393,0.816,0,5.259478,10,6,6,11
7995,2528,0,9,0,16,4,0.007,21.0,14.0,-1.089,...,-0.279,-0.544,1.242,-1.262,6,4.999138,4,17,20,12


In [15]:

# Train and Test Splitting
# Features: every column except the target and the four excluded columns.
X = df.drop(columns=['Interaction', 't_ss3', "s_ss3", "t_ins", "s_ins" ])
# Target: integer class ids expanded to one-hot rows with 7 columns.
y = to_categorical(df['Interaction'], num_classes=7)

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling: estimate mean and variance on the training rows only, then apply
# the same standardization to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Handling imbalance
As a first measure against the class imbalance of the dataset, the `compute_class_weight` function is used. It calculates one weight per class so that under-represented classes can be given more influence.

In [16]:
from sklearn.utils.class_weight import compute_class_weight

# Recover the integer class ids from the one-hot encoded training targets.
y_train_labels = np.argmax(y_train, axis=1)

# Only classes that actually occur in the training targets receive a weight.
present_classes = np.unique(y_train_labels)
class_weights = compute_class_weight('balanced', classes=present_classes, y=y_train_labels)

# Key each weight by its real class id. Keying by position (enumerate) silently
# assigns weights to the wrong classes as soon as one class is absent from
# y_train, e.g. when working with a small subsample.
# NOTE(review): this dictionary is not passed to model.fit in the training code
# shown in this notebook — confirm whether it is meant to be used.
class_weight_dict = {
    int(class_id): weight
    for class_id, weight in zip(present_classes, class_weights)
}

## Model Definition and Training

The DNN model is composed of:
* Input Layer: accepts input features with dimensionality corresponding to the number of features in the dataset
* 1st Dense Layer: a fully connected layer with 128 neurons, ReLU activation, followed by Dropout (20%) and Batch Normalization.
* 2nd Dense Layer: another fully connected layer with 64 neurons, ReLU activation, followed by Dropout (20%)
* 3rd Dense Layer: another fully connected layer with 32 neurons, ReLU activation, followed by Dropout (30%)
* 4th Dense Layer: another fully connected layer with 16 neurons, ReLU activation, followed by Dropout (30%)
* Output Layer: a final dense layer with 7 neurons and a softmax activation function, which outputs class probabilities for a multi-class classification problem

Optimization and Loss:
* Optimizer: Adam optimizer with a learning rate of 0.0001 to ensure smooth convergence
* Loss Function: Categorical Crossentropy, appropriate for multi-class classification problems where the output is one-hot encoded

Additional Techniques:
* ADASYN (Adaptive Synthetic Sampling): Used to oversample the minority classes in the training data to improve model performance on underrepresented classes
* Early Stopping:
A callback to stop training early if the validation loss doesn’t improve after 3 consecutive epochs, preventing overfitting and unnecessary computation

K-Fold Cross-Validation:
* The model is trained using 10-fold cross-validation: the training data is split into 10 equal parts and, in each iteration, 9 folds are used for training and 1 fold for validation. This process is repeated 10 times, so every sample is used for validation exactly once, which reduces the risk of overfitting to any specific training/validation split.
---
~ Instructions~

As can be seen below, the training part of the code is commented out and pre-stored weights are loaded for the tests. The commented-out code can be re-enabled to train the model if desired.

The file with the weights is located at `drive/MyDrive/SB_project/model_weights/best_multimodel.weights.h5`.



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import ADASYN
from tqdm import tqdm
import numpy as np
import pandas as pd

# Classes and labels
# The list is in alphabetical order, so position i is used as the name of integer
# class i (the encoded data above shows HBOND as 0 and VDW as 6). Presumably this
# mirrors the order LabelEncoder assigned — verify if the set of classes changes.
class_labels = ['HBOND', 'IONIC', 'PICATION', 'PIHBOND', 'PIPISTACK', 'SSBOND', 'VDW']

# Create the model structure
def create_model(input_dim, num_classes=None, learning_rate=0.0001):
    """Build and compile the fully connected multi-class classifier.

    Layers, in order: Dense(128) -> Dropout(0.2) -> BatchNormalization ->
    Dense(64) -> Dropout(0.2) -> Dense(32) -> Dropout(0.3) -> Dense(16) ->
    Dropout(0.3) -> Dense(num_classes, softmax). All hidden Dense layers use ReLU.

    Args:
        input_dim: Number of input features (columns of the feature matrix).
        num_classes: Size of the softmax output. Defaults to the number of
            entries in the module-level ``class_labels`` list.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss and
        the accuracy metric.
    """
    # Local import: Input is not among the names imported at the top of this cell.
    from tensorflow.keras.layers import Input

    if num_classes is None:
        num_classes = len(class_labels)

    # The input shape is declared with an explicit Input object instead of the
    # input_dim keyword on the first Dense layer, which newer Keras versions
    # warn about on every model creation.
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.2),
        BatchNormalization(),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(16, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax'),
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Implement early stopping: stop once val_loss has not improved for 3 epochs
# and restore the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define K-Fold cross-validator (shuffled, fixed seed)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Store scores for each fold and class accuracies (one list per class label)
fold_accuracies = []
class_accuracies = {label: [] for label in class_labels}

# Track the best model weights
best_accuracy = 0
best_model_weights = None

# ADASYN for oversampling minority classes with reduced neighbors.
# Seeded like the other random components of this notebook so that the
# synthetic samples are the same on every run.
adasyn = ADASYN(n_neighbors=2, random_state=42)

# Model Training
# # K-Fold Cross Validation with Progress Bar
# for fold, (train_index, val_index) in enumerate(tqdm(kf.split(X_train), desc="Cross-validation Progress", total=kf.get_n_splits())):
#     X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
#     y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

#     # Apply ADASYN only if there are enough samples
#     if len(y_train_fold) > adasyn.n_neighbors:
#         X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_fold, y_train_fold)
#     else:
#         X_train_resampled, y_train_resampled = X_train_fold, y_train_fold

    # Create a new model instance for each fold
# This statement stays active while the surrounding training loop is commented out:
# it builds a freshly initialised (untrained) network whose input width equals the
# number of columns of X_train, so the stored weights can be loaded into it in the
# evaluation cell.
model = create_model(input_dim=X_train.shape[1])

#     # Train the model
#     history = model.fit(X_train_resampled, y_train_resampled,
#                         validation_data=(X_val_fold, y_val_fold),
#                         epochs=50,
#                         batch_size=32,
#                         callbacks=[early_stopping],
#                         verbose=0)

#     # Predict the labels for the validation fold
#     y_val_pred = np.argmax(model.predict(X_val_fold), axis=1)
#     y_val_true = np.argmax(y_val_fold, axis=1)

#     # Compute confusion matrix for the current fold
#     cm = confusion_matrix(y_val_true, y_val_pred, labels=np.arange(len(class_labels)))

#     # Calculate accuracy for each class
#     for i, label in enumerate(class_labels):
#         TP = cm[i, i]
#         TN = np.sum(cm) - (np.sum(cm[i, :]) + np.sum(cm[:, i]) - TP)
#         FP = np.sum(cm[:, i]) - TP
#         FN = np.sum(cm[i, :]) - TP
#         accuracy = (TP + TN) / (TP + TN + FP + FN) * 100  # Accuracy formula
#         class_accuracies[label].append(accuracy)

#     # Calculate fold accuracy
#     fold_accuracy = np.mean([class_accuracies[label][-1] for label in class_labels])
#     fold_accuracies.append(fold_accuracy)

#     print(f"Fold {fold+1} accuracy: {fold_accuracy:.4f}")

#     # Track the best model weights
#     if fold_accuracy > best_accuracy:
#         best_accuracy = fold_accuracy
#         best_model_weights = model.get_weights()  # Save the best model weights

# # After K-Fold, set the model to the best weights and save them
# if best_model_weights is not None:
#     model.set_weights(best_model_weights)
#     model.save_weights('drive/MyDrive/SB_project/model_weights/best_multimodel.weights.h5')  # Save the best model weights

# # Calculate and print the average accuracy across all folds
# average_accuracy = np.mean(fold_accuracies)
# print(f"\nAverage accuracy across all folds: {average_accuracy:.4f}")

# # Create a DataFrame to display the class-wise accuracies across folds
# results_df = pd.DataFrame({
#     'Class': class_labels,
#     'Mean Accuracy': [np.mean(class_accuracies[label]) for label in class_labels],
# })

# # Display the results in a table
# print("\nClass-wise accuracy across all folds:")
# print(results_df)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 286us/step


Cross-validation Progress:  10%|█         | 1/10 [15:41<2:21:11, 941.32s/it]

Fold 1 accuracy: 76.4969


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 280us/step


Cross-validation Progress:  20%|██        | 2/10 [24:16<1:32:07, 690.90s/it]

Fold 2 accuracy: 72.7831


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 282us/step


Cross-validation Progress:  30%|███       | 3/10 [32:55<1:11:23, 611.99s/it]

Fold 3 accuracy: 72.7085


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 280us/step


Cross-validation Progress:  40%|████      | 4/10 [41:25<57:10, 571.80s/it]  

Fold 4 accuracy: 72.7550


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 285us/step


Cross-validation Progress:  50%|█████     | 5/10 [49:49<45:38, 547.60s/it]

Fold 5 accuracy: 72.6737


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 282us/step


Cross-validation Progress:  60%|██████    | 6/10 [58:15<35:32, 533.22s/it]

Fold 6 accuracy: 72.6404


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 283us/step


Cross-validation Progress:  70%|███████   | 7/10 [1:06:42<26:14, 524.82s/it]

Fold 7 accuracy: 72.6646


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 283us/step


Cross-validation Progress:  80%|████████  | 8/10 [1:15:07<17:16, 518.39s/it]

Fold 8 accuracy: 72.7258


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 283us/step


Cross-validation Progress:  90%|█████████ | 9/10 [1:23:37<08:35, 515.86s/it]

Fold 9 accuracy: 72.6965


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3647/3647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 287us/step


Cross-validation Progress: 100%|██████████| 10/10 [1:32:00<00:00, 552.04s/it]

Fold 10 accuracy: 72.8480

Average accuracy across all folds: 73.0993

Class-wise accuracy across all folds:
       Class  Mean Accuracy
0      HBOND      44.266414
1      IONIC      59.315796
2   PICATION      93.011405
3    PIHBOND      60.209076
4  PIPISTACK      96.210290
5     SSBOND      99.127133
6        VDW      59.554665





# Model Testing and Performance Metrics

In [18]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)

# Load best model weights (if they have already been saved)
model.load_weights('drive/MyDrive/SB_project/model_weights/best_multimodel.weights.h5')

# Predict the labels for the test set (argmax turns probabilities / one-hot rows into class ids)
y_test_pred = np.argmax(model.predict(X_test), axis=1)
y_test_true = np.argmax(y_test, axis=1)

# Use the full, fixed set of class ids everywhere so that the confusion matrix
# and the report keep one row per class even if a class is missing from the
# test targets or from the predictions.
label_ids = np.arange(len(class_labels))

# Calculate the confusion matrix for the test set (rows: true class, columns: predicted class)
cm_test = confusion_matrix(y_test_true, y_test_pred, labels=label_ids)

# Calculate the balanced accuracy score for the entire test set
balanced_accuracy = balanced_accuracy_score(y_test_true, y_test_pred) * 100
print(f"\nBalanced accuracy for the entire test set: {balanced_accuracy:.4f}%")

# Calculate and print the class-wise accuracies.
# These are one-vs-rest accuracies: true negatives count as correct, so a rare
# class can score high here even when its precision is low.
class_accuracies_test = []
for i, label in enumerate(class_labels):
    TP = cm_test[i, i]
    TN = np.sum(cm_test) - (np.sum(cm_test[i, :]) + np.sum(cm_test[:, i]) - TP)
    FP = np.sum(cm_test[:, i]) - TP
    FN = np.sum(cm_test[i, :]) - TP

    # Accuracy for each class
    class_accuracy = (TP + TN) / (TP + TN + FP + FN) * 100  # Accuracy formula
    class_accuracies_test.append(class_accuracy)
    # The label is written with a Latin "C"; the original string used a
    # look-alike Cyrillic letter, which breaks text search on the output.
    print(f"Class '{label}': {class_accuracy:.4f}%")

# Create a DataFrame to display the class-wise accuracies
results_df_test = pd.DataFrame({
    'Class': class_labels,
    'Accuracy': class_accuracies_test
})

# Display the class-wise results in a table
print("\nClass-wise accuracy for the test set:")
print(results_df_test)

# Classification report for a more detailed breakdown.
# Passing labels keeps target_names aligned with the class ids; zero_division=0
# reports 0 without a warning for a class that is never predicted.
print("\nClassification report for the test set:")
print(classification_report(
    y_test_true,
    y_test_pred,
    labels=label_ids,
    target_names=class_labels,
    zero_division=0,
))


  saveable.load_own_variables(weights_store.get(inner_path))


[1m9116/9116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step

Balanced accuracy for the entire test set: 77.2622%
Сlass 'HBOND': 58.4638%
Сlass 'IONIC': 91.5564%
Сlass 'PICATION': 97.2929%
Сlass 'PIHBOND': 96.5452%
Сlass 'PIPISTACK': 98.0498%
Сlass 'SSBOND': 99.8502%
Сlass 'VDW': 60.3698%

Class-wise accuracy for the test set:
       Class   Accuracy
0      HBOND  58.463827
1      IONIC  91.556398
2   PICATION  97.292878
3    PIHBOND  96.545223
4  PIPISTACK  98.049789
5     SSBOND  99.850195
6        VDW  60.369817

Classification report for the test set:
              precision    recall  f1-score   support

       HBOND       0.59      0.83      0.69    162727
       IONIC       0.18      1.00      0.31      5529
    PICATION       0.14      0.97      0.24      1319
     PIHBOND       0.02      0.62      0.04       330
   PIPISTACK       0.51      0.99      0.67      5883
      SSBOND       0.43      1.00      0.60       330
         VDW       0.49      0.00      0.0