In [17]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB, ComplementNB, BernoulliNB

In [18]:
# Combine all PDBs into a single dataframe
dfs = []
for filename in os.listdir('data/features_ring'):
    dfs.append(pd.read_csv('data/features_ring/' + filename, sep='\t'))
df = pd.concat(dfs)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_down,t_phi,t_psi,t_ss3,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction
0,3gpi,A,17,,A,H,0.085,22.0,20.0,-1.017,...,8.0,-1.170,-0.674,H,-1.019,-0.987,-1.505,1.266,-0.912,HBOND
1,3gpi,A,198,,Q,H,0.293,12.0,8.0,-1.453,...,6.0,-1.886,1.715,H,-0.591,-1.302,-0.733,1.570,-0.146,HBOND
2,3gpi,A,198,,Q,H,0.293,12.0,8.0,-1.453,...,6.0,-1.886,1.715,H,-0.591,-1.302,-0.733,1.570,-0.146,VDW
3,3gpi,A,32,,R,-,0.581,8.0,14.0,-1.534,...,4.0,-1.780,1.838,H,0.931,-0.179,-3.005,-0.503,-1.853,HBOND
4,3gpi,A,32,,R,-,0.581,8.0,14.0,-1.534,...,4.0,-1.780,1.838,H,0.931,-0.179,-3.005,-0.503,-1.853,VDW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,1z2o,X,108,,I,,,21.0,13.0,-2.388,...,18.0,-1.299,2.161,H,-1.337,-0.279,-0.544,1.242,-1.262,
458,1z2o,X,12,,W,,,21.0,13.0,-1.903,...,15.0,-2.604,2.738,H,-0.228,1.399,-4.760,0.670,-2.647,
459,1z2o,X,87,,I,,,19.0,12.0,-1.170,...,15.0,-1.099,-0.719,H,-0.663,-1.524,2.219,-1.005,1.212,HBOND
460,1z2o,X,7,,V,,,19.0,10.0,-2.260,...,11.0,-1.810,2.169,H,-1.006,-0.590,1.891,-0.397,0.412,VDW


In [19]:
# Remove all rows with NaN in at least one column
# including rows with missing class (they could be false negatives)
df.dropna(inplace=True)

# Define ground truth values
y = df['Interaction'].astype('category')
y

0      HBOND
1      HBOND
2        VDW
3      HBOND
4        VDW
       ...  
315      VDW
317    HBOND
319    HBOND
320    HBOND
322      VDW
Name: Interaction, Length: 454193, dtype: category
Categories (6, object): ['HBOND', 'IONIC', 'PICATION', 'PIPISTACK', 'SSBOND', 'VDW']

In [20]:
# Define training features
X = df[['s_rsa', 's_up', 's_down', 's_phi', 's_psi', 's_a1', 's_a2', 's_a3', 's_a4', 's_a5', 
        't_rsa', 't_up', 't_down', 't_phi', 't_psi', 't_a1', 't_a2', 't_a3', 't_a4', 't_a5']]

# Calculate percentiles and transform into categories
X = X.rank(pct=True).round(1).astype('category') 
X

Unnamed: 0,s_rsa,s_up,s_down,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_rsa,t_up,t_down,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5
0,0.5,0.8,0.7,0.9,0.3,0.4,0.1,0.4,1.0,0.5,0.1,1.0,0.1,0.6,0.3,0.2,0.2,0.2,0.9,0.3
1,0.7,0.3,0.0,0.3,0.5,0.7,0.5,0.1,0.2,0.1,0.8,0.2,0.0,0.2,0.7,0.4,0.1,0.3,1.0,0.5
2,0.7,0.3,0.0,0.3,0.5,0.7,0.5,0.1,0.2,0.1,0.8,0.2,0.0,0.2,0.7,0.4,0.1,0.3,1.0,0.5
3,0.9,0.2,0.3,0.3,0.9,0.9,0.6,0.7,0.6,1.0,0.5,0.4,0.0,0.2,0.7,0.7,0.6,0.1,0.2,0.1
4,0.9,0.2,0.3,0.3,0.9,0.9,0.6,0.7,0.6,1.0,0.5,0.4,0.0,0.2,0.7,0.7,0.6,0.1,0.2,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,1.0,0.1,0.5,0.4,0.8,0.4,0.1,0.4,1.0,0.5,0.6,0.2,1.0,0.0,1.0,0.7,0.6,0.1,0.2,0.1
317,0.1,0.7,0.1,0.9,0.5,0.2,0.2,0.3,0.9,0.3,0.6,0.4,0.2,0.2,0.8,0.3,0.3,0.8,0.2,0.6
319,0.4,0.7,0.8,0.8,0.8,0.9,0.1,0.7,0.5,0.4,0.1,0.8,0.9,0.3,0.9,0.3,0.3,0.8,0.2,0.6
320,0.1,1.0,0.7,0.3,0.8,0.2,0.2,0.3,0.9,0.3,0.1,0.9,1.0,0.0,0.8,0.4,0.1,0.3,1.0,0.5


In [21]:
# Split the dataset to define training and testing examples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

### Test different versions of Naive Bayes

In [22]:
nb = GaussianNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 17865


In [23]:
nb = MultinomialNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 15631


In [24]:
nb = ComplementNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 16950


In [25]:
nb = BernoulliNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 15454


In [26]:
nb = CategoricalNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 15460
