In [11]:
pip install pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import (
    GaussianNB,
    MultinomialNB,
    CategoricalNB,
    ComplementNB,
    BernoulliNB,
)
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder


#import torch
#import torch.nn as nn
#import torch.optim as optim
#import torch.nn.functional as F
#from torch.utils.data import DataLoader, TensorDataset
#from sb_project.NeuralNetworks import *


In [13]:
random.seed(42)

In [14]:
# Combine all PDBs into a single dataframe
ring_path = Path("C:\\Users\\emmal\\Desktop\\StructuralBioinformatics\\PROJECT\\features_ring")
dfs = []
for filename in os.listdir(ring_path):
    dfs.append(pd.read_csv(ring_path / filename, sep="\t"))
df = pd.concat(dfs)
df


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_down,t_phi,t_psi,t_ss3,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction
0,1aba,A,50,,E,H,0.304,7.0,10.0,-1.107,...,6.0,-1.102,-0.657,H,1.831,-0.561,0.533,-0.277,1.648,HBOND
1,1aba,A,67,,Q,E,0.086,14.0,24.0,-2.214,...,16.0,-1.350,2.242,H,0.336,-0.417,-1.673,-1.474,-0.078,HBOND
2,1aba,A,17,,C,H,0.000,17.0,17.0,-1.093,...,23.0,-1.043,-0.817,H,1.831,-0.561,0.533,-0.277,1.648,HBOND
3,1aba,A,45,,D,H,0.742,1.0,13.0,-1.182,...,16.0,-1.034,-0.718,H,-0.591,-1.302,-0.733,1.570,-0.146,HBOND
4,1aba,A,39,,E,B,0.634,0.0,15.0,-2.661,...,12.0,-2.073,2.008,H,-1.337,-0.279,-0.544,1.242,-1.262,HBOND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,9f5c,A,105,,K,S,0.498,9.0,21.0,-1.949,...,18.0,-1.056,-0.761,H,-1.337,-0.279,-0.544,1.242,-1.262,
240,9f5c,A,69,,D,H,0.270,7.0,21.0,-0.984,...,14.0,-1.161,-0.599,H,-0.663,-1.524,2.219,-1.005,1.212,
241,9f5c,A,169,,T,E,0.577,1.0,22.0,-1.955,...,5.0,-1.681,2.414,H,0.336,-0.417,-1.673,-1.474,-0.078,
242,9f5c,A,106,,K,E,0.322,8.0,26.0,-2.203,...,10.0,-1.991,2.367,H,-0.591,-1.302,-0.733,1.570,-0.146,HBOND


In [15]:
new_df = pd.DataFrame(df.iloc[:1000000, :])
new_df.dropna(inplace=True)

y = new_df["Interaction"].astype("category")

# Define training features
X = new_df[
    [
        "s_rsa",
        "s_up",
        "s_down",
        "s_phi",
        "s_psi",
        "s_a1",
        "s_a2",
        "s_a3",
        "s_a4",
        "s_a5",
        "t_rsa",
        "t_up",
        "t_down",
        "t_phi",
        "t_psi",
        "t_a1",
        "t_a2",
        "t_a3",
        "t_a4",
        "t_a5",
    ]
]

# Calculate percentiles and transform into categories
X = X.rank(pct=True).round(1).astype("category")

# Split the dataset to define training and testing examples
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)



In [16]:
bonds = ["HBOND", "IONIC", "PICATION", "PIHBOND", "PIPISTACK", "SSBOND", "VDW"]

for bond in bonds:
    print(f"this is new_df {bond}: {len(new_df[new_df["Interaction"] == bond])}")
    print(f"this is df{bond}: {len(df[df["Interaction"] == bond])}\n")

this is new_df HBOND: 367794
this is dfHBOND: 901814

this is new_df IONIC: 12522
this is dfIONIC: 30355

this is new_df PICATION: 3016
this is dfPICATION: 7623

this is new_df PIHBOND: 656
this is dfPIHBOND: 1836

this is new_df PIPISTACK: 11399
this is dfPIPISTACK: 32965

this is new_df SSBOND: 863
this is dfSSBOND: 1792

this is new_df VDW: 253627
this is dfVDW: 640469



In [17]:
# Remove all rows with NaN in at least one column
# including rows with missing class (they could be false negatives)
df.dropna(inplace=True)

# Define ground truth values
y = df["Interaction"].astype("category")
y

0          HBOND
1          HBOND
2          HBOND
3          HBOND
4          HBOND
         ...    
234          VDW
237    PIPISTACK
238          VDW
242        HBOND
243          VDW
Name: Interaction, Length: 1471380, dtype: category
Categories (7, object): ['HBOND', 'IONIC', 'PICATION', 'PIHBOND', 'PIPISTACK', 'SSBOND', 'VDW']

In [18]:
# Define training features
X = df[
    [
        "s_rsa",
        "s_up",
        "s_down",
        "s_phi",
        "s_psi",
        "s_a1",
        "s_a2",
        "s_a3",
        "s_a4",
        "s_a5",
        "t_rsa",
        "t_up",
        "t_down",
        "t_phi",
        "t_psi",
        "t_a1",
        "t_a2",
        "t_a3",
        "t_a4",
        "t_a5",
    ]
]

# Calculate percentiles and transform into categories
X = X.rank(pct=True).round(1).astype("category")
X 

Unnamed: 0,s_rsa,s_up,s_down,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_rsa,t_up,t_down,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5
0,0.7,0.1,0.1,0.7,0.3,0.8,0.1,0.7,0.4,0.4,0.8,0.5,0.0,0.7,0.3,1.0,0.3,0.5,0.3,0.8
1,0.5,0.4,0.9,0.1,0.7,0.7,0.5,0.2,0.2,0.2,0.9,0.2,0.5,0.4,0.8,0.6,0.5,0.2,0.0,0.5
2,0.1,0.5,0.5,0.7,0.1,0.0,0.8,0.3,0.1,0.4,0.7,0.3,0.9,0.8,0.1,1.0,0.3,0.5,0.3,0.8
3,1.0,0.0,0.2,0.5,0.5,0.8,0.7,0.1,0.3,0.0,0.9,0.0,0.5,0.8,0.3,0.4,0.1,0.4,1.0,0.5
4,1.0,0.0,0.3,0.0,1.0,0.8,0.1,0.7,0.4,0.4,0.9,0.0,0.2,0.2,0.7,0.0,0.5,0.4,0.8,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,0.1,0.9,0.0,0.2,0.7,0.0,0.5,0.4,0.8,0.2,0.9,0.3,0.0,0.1,0.8,0.9,0.6,0.8,0.6,1.0
237,0.7,0.6,0.0,0.2,0.6,0.3,0.6,0.5,0.0,0.4,0.3,0.7,0.0,0.2,0.7,0.6,0.5,0.2,0.0,0.5
238,0.7,0.6,0.0,0.2,0.6,0.3,0.6,0.5,0.0,0.4,0.3,0.7,0.0,0.2,0.7,0.6,0.5,0.2,0.0,0.5
242,0.8,0.2,0.9,0.1,0.8,1.0,0.3,0.5,0.2,0.9,0.4,0.3,0.1,0.2,0.8,0.4,0.1,0.4,1.0,0.5


In [19]:
# Split the dataset to define training and testing examples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

### Test different versions of Naive Bayes

In [20]:
nb = GaussianNB()
y_new_pred = nb.fit(X_new_train, y_new_train).predict(X_new_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_new_test.shape[0], (y_new_test != y_new_pred).sum())
)

Number of mislabeled points out of a total 64988 points : 31581


In [21]:
nb = MultinomialNB()
y_new_pred = nb.fit(X_new_train, y_new_train).predict(X_new_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_new_test.shape[0], (y_new_test != y_new_pred).sum())
)

Number of mislabeled points out of a total 64988 points : 28391


In [22]:
nb = ComplementNB()
y_new_pred = nb.fit(X_new_train, y_new_train).predict(X_new_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_new_test.shape[0], (y_new_test != y_new_pred).sum())
)

Number of mislabeled points out of a total 64988 points : 40318


In [23]:
nb = BernoulliNB()
y_new_pred = nb.fit(X_new_train, y_new_train).predict(X_new_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_new_test.shape[0], (y_new_test != y_new_pred).sum())
)

Number of mislabeled points out of a total 64988 points : 28271


In [24]:
nb = CategoricalNB()
y_new_pred = nb.fit(X_new_train, y_new_train).predict(X_new_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_new_test.shape[0], (y_new_test != y_new_pred).sum())
)

Number of mislabeled points out of a total 64988 points : 27982


# Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)

# Validation
lr.fit(X_new_train, y_new_train)

# Prediction
y_pred_lr = lr.predict(X_new_test)
print("Logistic Regression Classification Report:")

# Accuracy
accuracy = accuracy_score(y_new_test, y_new_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report 
print("Classification Report:")
print(classification_report(y_new_test, y_new_pred))


Logistic Regression Classification Report:
Accuracy: 0.57
Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       HBOND       0.57      0.96      0.72     36597
       IONIC       0.00      0.00      0.00      1247
    PICATION       0.00      0.00      0.00       324
     PIHBOND       0.00      0.00      0.00        58
   PIPISTACK       0.51      0.10      0.17      1129
      SSBOND       0.00      0.00      0.00        87
         VDW       0.50      0.06      0.11     25546

    accuracy                           0.57     64988
   macro avg       0.23      0.16      0.14     64988
weighted avg       0.53      0.57      0.45     64988



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
