In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import (
    GaussianNB,
    MultinomialNB,
    CategoricalNB,
    ComplementNB,
    BernoulliNB,
)
from pathlib import Path

In [10]:
# Combine all PDBs into a single dataframe
ring_path = Path(
    "/Users/mattiapiazza/Documents/University/structural_bioinformatics/project/sb_project/features_ring"
)
dfs = []
for filename in os.listdir(ring_path):
    dfs.append(pd.read_csv(ring_path / filename, sep="\t"))
df = pd.concat(dfs)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_down,t_phi,t_psi,t_ss3,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction
0,1u9c,A,32,,Y,H,0.149,22.0,24.0,-0.971,...,11.0,-1.163,-0.725,H,-1.006,-0.590,1.891,-0.397,0.412,
1,1u9c,A,112,,L,H,0.000,25.0,25.0,-1.135,...,16.0,-1.136,-0.853,H,-0.591,-1.302,-0.733,1.570,-0.146,HBOND
2,1u9c,A,107,,T,H,0.162,11.0,16.0,-1.096,...,21.0,-1.128,-0.789,H,0.260,0.830,3.097,-0.838,1.512,
3,1u9c,A,147,,K,-,0.190,18.0,8.0,-1.795,...,17.0,-1.631,1.771,H,0.945,0.828,1.299,-0.169,0.933,HBOND
4,1u9c,A,109,,Q,H,0.263,13.0,21.0,-1.056,...,25.0,-1.135,-0.671,H,-1.019,-0.987,-1.505,1.266,-0.912,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,3vrd,B,231,,D,E,0.466,9.0,16.0,-2.257,...,16.0,1.057,0.914,L,-0.663,-1.524,2.219,-1.005,1.212,HBOND
1033,3vrd,A,44,,F,H,0.162,18.0,11.0,-1.214,...,21.0,1.497,0.054,L,-0.384,1.652,1.330,1.045,2.064,HBOND
1034,3vrd,A,13,,G,H,0.000,16.0,30.0,-1.022,...,24.0,-1.093,-0.704,H,0.260,0.830,3.097,-0.838,1.512,
1035,3vrd,A,163,,L,H,0.024,13.0,23.0,-1.092,...,22.0,-1.010,-0.806,H,-1.019,-0.987,-1.505,1.266,-0.912,VDW


In [15]:
# Remove all rows with NaN in at least one column
# including rows with missing class (they could be false negatives)
df.dropna(inplace=True)

# Define ground truth values
y = df["Interaction"].astype("category")
y

1       HBOND
3       HBOND
6       HBOND
7         VDW
9       HBOND
        ...  
1030    HBOND
1031      VDW
1032    HBOND
1033    HBOND
1035      VDW
Name: Interaction, Length: 1471380, dtype: category
Categories (7, object): ['HBOND', 'IONIC', 'PICATION', 'PIHBOND', 'PIPISTACK', 'SSBOND', 'VDW']

In [18]:
df.shape

(1471380, 34)

In [16]:
# Define training features
X = df[
    [
        "s_rsa",
        "s_up",
        "s_down",
        "s_phi",
        "s_psi",
        "s_a1",
        "s_a2",
        "s_a3",
        "s_a4",
        "s_a5",
        "t_rsa",
        "t_up",
        "t_down",
        "t_phi",
        "t_psi",
        "t_a1",
        "t_a2",
        "t_a3",
        "t_a4",
        "t_a5",
    ]
]

# Calculate percentiles and transform into categories
X = X.rank(pct=True).round(1).astype("category")
X

Unnamed: 0,s_rsa,s_up,s_down,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_rsa,t_up,t_down,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5
1,0.1,0.9,0.9,0.6,0.4,0.2,0.2,0.3,0.9,0.3,0.5,0.5,0.5,0.6,0.1,0.4,0.1,0.4,1.0,0.5
3,0.6,0.6,0.0,0.2,0.9,1.0,0.3,0.5,0.2,0.9,0.9,0.1,0.5,0.3,0.7,0.7,0.8,0.6,0.4,0.7
6,0.5,0.5,0.4,0.6,0.1,0.3,0.1,0.4,1.0,0.5,0.8,0.0,0.8,1.0,0.6,0.4,1.0,0.6,0.7,0.9
7,0.5,0.5,0.4,0.6,0.1,0.3,0.1,0.4,1.0,0.5,0.8,0.0,0.8,1.0,0.6,0.4,1.0,0.6,0.7,0.9
9,0.7,0.4,0.5,0.1,0.7,0.2,0.3,0.8,0.2,0.6,0.1,0.6,0.9,0.4,0.4,0.6,0.6,0.1,0.2,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030,0.5,0.6,0.0,0.0,0.6,1.0,0.3,0.5,0.2,0.9,0.3,0.7,0.2,0.3,1.0,0.5,0.9,0.0,0.7,0.1
1031,0.5,0.6,0.0,0.0,0.6,1.0,0.3,0.5,0.2,0.9,0.3,0.7,0.2,0.3,1.0,0.5,0.9,0.0,0.7,0.1
1032,0.9,0.2,0.4,0.1,0.7,0.8,0.7,0.1,0.3,0.0,0.5,0.2,0.5,1.0,0.6,0.3,0.0,1.0,0.1,0.7
1033,0.6,0.6,0.1,0.5,0.3,0.2,0.3,0.8,0.2,0.6,0.9,0.0,0.8,1.0,0.6,0.4,1.0,0.6,0.7,0.9


In [17]:
X.shape

(1471380, 20)

In [None]:
# Split the dataset to define training and testing examples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

### Test different versions of Naive Bayes

In [None]:
nb = GaussianNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

Number of mislabeled points out of a total 45420 points : 17865


In [None]:
nb = MultinomialNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

Number of mislabeled points out of a total 45420 points : 15631


In [None]:
nb = ComplementNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

Number of mislabeled points out of a total 45420 points : 16950


In [None]:
nb = BernoulliNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

Number of mislabeled points out of a total 45420 points : 15454


In [None]:
nb = CategoricalNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X_test.shape[0], (y_test != y_pred).sum())
)

Number of mislabeled points out of a total 45420 points : 15460
