# Set up Environment

In [14]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

# Common imports
import numpy as np
import os
import pandas as pd
import plotly.express as px
import time

# Import custom utility functions
import glycan_bionames

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

# Define custom functions
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
def restrict_RBD_window(df,nm):
    '''Function to drop features of dataframe that correspond to glycans which are outside a given RBD neighborhood (in nm)'''
    #Get list of glycans
    glycans = list(np.unique([x.replace('RBD__2__','') for x in df.keys().to_list() if 'RBD__2__GLY' in x]))
    
    for g in glycans:
        if df['RBD__2__' + g].mean() > nm:
            df.drop(['RBD__2__'+g,g+':ROF',g+':RMSD',g+'_x',g+'_y',g+'_z'],axis=1,inplace=True)    
    return df

def overlapping_hist(open_df,closed_df,feat):
    '''Plot overlapping histograms for a given feature of all datasets'''
    open_df[feat].hist(bins=50)
    closed_df[feat].hist(bins=50)
    mutant_df[feat].hist(bins=50)
    plt.legend(['Open','Closed','Mutant (open)'])
    plt.title(feat)
    if 'RBD__2__' in feat:
        plt.xlabel('nm')
        
def drop_feats(df,flag):
    '''Drops all features in df containing flag'''
    for f in df.keys().to_list():
        if flag in f:
            df.drop(f,axis=1,inplace=True)
    return df

def read_n_filter_dfs(fname,num_reps,RBD_wind,val_reps_open,val_reps_closed,label_val,dfs_train=[],dfs_val=[]):
    '''Reads data and filters columns, then places in either train or validation dataframe list'''
    for i in range(1,num_reps+1):
        df = pd.read_csv(fname+'_'+str(i)+'.csv').assign(label=label_val).iloc[:,1:]
        # Only use glycans within certain range of the RBD
        df = restrict_RBD_window(df,RBD_wind)
        # Drop _x, _y, and _z features
        df = drop_feats(df,'_x')
        df = drop_feats(df,'_y')
        df = drop_feats(df,'_z')
        # Withold some replicants for use in a separate validation set
        if (label_val==1) & (i in val_reps_open):
            dfs_val.append(df)
        elif (label_val==0) & (i in val_reps_closed):
            dfs_val.append(df)
        else:
            dfs_train.append(df)
            
    return dfs_train, dfs_val

def remove_corr_feats(full_df,corr_thresh= 0.65):
    '''Remove highly correlated features'''
    corr_matrix = full_df.corr()
    final_features = corr_matrix['RBD_CA0:RMSD'][(corr_matrix['RBD_CA0:RMSD'] < corr_thresh) & (corr_matrix['RBD_CA0:RMSD'] > -corr_thresh)].reset_index().loc[:,'index'].to_list()
    if 'label' not in final_features:
        final_features.append('label')
    clf_df = full_df.loc[:,final_features]
    return clf_df

def prep_ML_data(clf_df,ts,rs,labelnames):
    '''Prepare data for use in training machine learning algorithm'''
    # Split training and testing data
    train_set, test_set = train_test_split(clf_df,test_size=ts, random_state=rs,stratify=labelnames)
    print(f'Train set : {train_set.shape}, Test set : {test_set.shape}')

    # Split data and labels
    train_X = train_set.drop("label", axis=1) # drop labels for training set
    train_labels = train_set["label"].copy()
    test_X = test_set.drop("label", axis=1) # drop labels for training set
    test_labels = test_set["label"].copy()

    return train_X, test_X, train_labels, test_labels
   

# Load data

### All as one dataframe

In [None]:
# Open dataset
fname = '/net/jam-amaro-shared/dse_project/Spike_Dataset/TRAJECTORIES_spike_open_prot_glyc_amarolab/results/FinalExtractedFeature_open.csv'
open_df = pd.read_csv(fname).assign(label = 1).iloc[:,1:]

# Closed dataset
fname = '/net/jam-amaro-shared/dse_project/Spike_Dataset/TRAJECTORIES_spike_closed_prot_glyc_amarolab/results/FinalExtractedFeature_closed.csv'
closed_df = pd.read_csv(fname).assign(label = 0).iloc[:,1:]

# Mutant dataset
fname = '/net/jam-amaro-shared/dse_project/Spike_Dataset/TRAJECTORIES_spike_mutant_prot_glyc_amarolab.tar.gz/results/FinalExtractedFeature_mutant.csv'
#mutant_df = pd.read_csv(fname).assign(label=1)

# Filter out features

In [None]:
# Only use glycans within 10 nm of the RBD
open_df = restrict_RBD_window(open_df,8)
closed_df = restrict_RBD_window(closed_df,8)
#mutant_df = restrict_RBD_window(mutant_df,8)
print(open_df.shape)

# Drop _x, _y, _z features
open_df = drop_feats(open_df,'_x')
open_df = drop_feats(open_df,'_y')
open_df = drop_feats(open_df,'_z')

closed_df = drop_feats(closed_df,'_x')
closed_df = drop_feats(closed_df,'_y')
closed_df = drop_feats(closed_df,'_z')

# Only use columns that exist in all datasets
common_cols = set(open_df.columns.to_list()).intersection(closed_df.columns.to_list())
full_df = open_df.loc[:,common_cols].append(closed_df.loc[:,common_cols]).drop(['frame','Frame Num'],axis=1)
full_df.shape

In [10]:
clf_df = remove_corr_feats(full_df,0.5)
clf_df.shape

(23526, 63)

# Prepare the Data for Machine Learning Algorithms

In [13]:
# Split train/test data
train_X, test_X, train_labels, test_labels = prep_ML_data(clf_df,0.3,42,full_df.label)


# Normalize data
num_pipeline = Pipeline([
       ('std_scaler', StandardScaler()),
    ])

train_X_prepared = num_pipeline.fit_transform(train_X)
test_X_prepared = num_pipeline.transform(test_X)

Train set : (16468, 63), Test set : (7058, 63)


# Train and Test Model

In [27]:
# Initialize classifier
sgd_clf = SGDClassifier(max_iter=100, tol=1e-3, random_state=42)

# Perform 10-fold cross-validation on training data
y_train_pred = cross_val_predict(sgd_clf,train_X_prepared, train_labels, cv=10)
t = time.time()
print(cross_val_score(sgd_clf, train_X_prepared, train_labels, cv=10, scoring="accuracy"))
print(str(time.time()-t) + ' sec elapsed')

# Get overall precision and recall for training data
confusion_matrix(train_labels, y_train_pred)
print(f' Train precison : {precision_score(train_labels, y_train_pred)}, train recall {recall_score(train_labels, y_train_pred)}')

# Get overall precision and recall for testing data
sgd_clf.fit(train_X_prepared,train_labels)
y_test_pred = sgd_clf.predict(test_X_prepared)
print(f' Test precison : {precision_score(test_labels, y_test_pred)}, Test recall {recall_score(test_labels, y_test_pred)}')



[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
8.309174060821533 sec elapsed
 Train precison : 1.0, train recall 1.0
 Test precison : 1.0, Test recall 1.0


In [None]:
#mutant_df.loc[:,common_cols].drop(["frame","Frame Num"],axis=1).head()

In [None]:
#val_X = num_pipeline.transform(mutant_df.loc[:,train_set.keys().to_list()].drop(["label"],axis=1))
#val_labels = mutant_df["label"].copy()
#y_val_pred = sgd_clf.predict(val_X)
#print(f' Val precison : {precision_score(val_labels, y_val_pred)}, Val recall {recall_score(val_labels, y_val_pred)}')


# Iterative Replicant Analysis

Run iterative leave-one-out analysis wherein 1/3 of the replicants are withheld from the training/testing dataset and used as a separate "validation" dataset afterwards. The idea is to implement the trained model on a completely "new" dataset and see if the model's performance holds up.

In [None]:
RBD_wind = 8
leftouts = []
train_precs = np.zeros([6,3])
train_recalls = np.zeros([6,3])
test_precs = np.zeros([6,3])
test_recalls = np.zeros([6,3])
val_precs = np.zeros([6,3])
val_recalls = np.zeros([6,3])
top_feats = []
for i in range(1,7):
    for j in range(1,4):
        val_reps_closed = [j]
        if i == 6:
            val_reps_open = [1,6];
        else:
            val_reps_open = [i,i+1];

        # Read open data
        fname = '/net/jam-amaro-shared/dse_project/Spike_Dataset/TRAJECTORIES_spike_open_prot_glyc_amarolab/results/FinalExtractedFeature'
        dfs_train, dfs_val = read_n_filter_dfs(fname,6,RBD_wind,val_reps_open,val_reps_closed,1)

        # Read closed data
        fname = fname = '/net/jam-amaro-shared/dse_project/Spike_Dataset/TRAJECTORIES_spike_closed_prot_glyc_amarolab/results/FinalExtractedFeature'
        dfs_train, dfs_val = read_n_filter_dfs(fname,3,RBD_wind,val_reps_open,val_reps_closed,0,dfs_train,dfs_val)
        
        # Only use columns that exist in all datasets
        common_cols = list(set.intersection(*map(set,dfs_train+dfs_val)))
        full_df = pd.concat(dfs_train).loc[:,common_cols].drop(['frame'],axis = 1)
        full_df.shape
        
        # Remove highly correlated columns
        clf_df = remove_corr_feats(full_df,0.5)

        # Split train/test data
        train_X, test_X, train_labels, test_labels = prep_ML_data(clf_df,0.3,42,full_df.label)


        # Normalize data
        num_pipeline = Pipeline([
               ('std_scaler', StandardScaler()),
            ])
        train_X_prepared = num_pipeline.fit_transform(train_X)
        test_X_prepared = num_pipeline.transform(test_X)
        
        # Initialize classifier
        sgd_clf = SGDClassifier(max_iter=100, tol=1e-3, random_state=42)

        # Perform 10-fold cross-validation on training data
        y_train_pred = cross_val_predict(sgd_clf,train_X_prepared, train_labels, cv=10)
        t = time.time()
        print(cross_val_score(sgd_clf, train_X_prepared, train_labels, cv=10, scoring="accuracy"))
        print(str(time.time()-t) + ' sec elapsed')

        # Get overall precision and recall for training data
        confusion_matrix(train_labels, y_train_pred)
        print(f' Train precison : {precision_score(train_labels, y_train_pred)}, train recall {recall_score(train_labels, y_train_pred)}')

        # Get overall precision and recall for testing data
        sgd_clf.fit(train_X_prepared,train_labels)
        y_test_pred = sgd_clf.predict(test_X_prepared)
        print(f' Test precison : {precision_score(test_labels, y_test_pred)}, Test recall {recall_score(test_labels, y_test_pred)}')

        # Prep data
        val_X = pd.concat(dfs_val).loc[:,train_X.keys()]
        val_labels = pd.concat(dfs_val).label
        val_X_prepared = num_pipeline.transform(val_X)

        # Get testing results on unseen replicant(s)
        y_val_pred = sgd_clf.predict(val_X_prepared)
        print(f' Val precison : {precision_score(val_labels, y_val_pred)}, Val recall {recall_score(val_labels, y_val_pred)}')
        
        # Save results
        leftouts.append(['open '+ str(x) +' ' for x in val_reps_open] + ['closed ' + str(x) + ' ' for x in val_reps_closed])
        train_precs[i-1,j-1] = precision_score(train_labels, y_train_pred)
        train_recalls[i-1,j-1] = recall_score(train_labels,y_train_pred)
        test_precs[i-1,j-1] = precision_score(test_labels, y_test_pred)
        test_recalls[i-1,j-1] = recall_score(test_labels, y_test_pred)
        val_precs[i-1,j-1] = precision_score(val_labels, y_val_pred)
        val_recalls[i-1,j-1] = recall_score(val_labels, y_val_pred)
        a = list(np.abs(sgd_clf.coef_[0]))
        idx = sorted(range(len(a)), key = lambda k: a[k])[-5:]
        x_vals = [glycan_bionames.get_elem(k,'feat') for k in train_X.columns.to_list()]
        top_feats.append(list(np.array(x_vals)[idx]))


Train set : (934017, 79), Test set : (400293, 79)
[1.         1.         1.         1.         1.         0.99998929
 0.99997859 1.         0.99998929 1.        ]
22.714370489120483 sec elapsed
 Train precison : 0.9999939197713834, train recall 1.0
 Test precison : 0.999964532970151, Test recall 1.0
 Val precison : 0.9999738670494853, Val recall 1.0
Train set : (957670, 79), Test set : (410430, 79)


# Present Results Graphically

In [18]:
a = list(np.abs(sgd_clf.coef_[0]))
idx = sorted(range(len(a)), key = lambda k: a[k])[-5:]
x_vals = [glycan_bionames.get_elem(i,'feat') for i in train_X.columns.to_list()]

In [None]:
x_vals = [glycan_bionames.get_elem(i,'feat') for i in train_X.columns.to_list()]
#x_vals = train_X.columns.to_list()
y_vals = np.abs(sgd_clf.coef_[0])
col_vals = [glycan_bionames.get_elem(i,'chain') for i in train_X.columns.to_list()]

px.bar(x=x_vals,y=y_vals,color=col_vals,labels={'x':'Feature','y':'Importance','color':'Importance'}).update_xaxes(categoryorder='total ascending')

In [None]:
# Plot overlapping histograms for some features

feats = list(np.unique([x for x in train_set.keys().to_list() if 'RBD__2__GLY' in x]))        
#feats = ['RBD__2__GLY51','RBD__2__GLY24','RBD__2__GLY14','RBD__2__GLY3']   
#feats = ['GLY22:ROF','RBD__2__CH_CA0','RBD__2__backbone0','RBD__2__GLY32','RBD__2__GLY51']
plt.figure(figsize=(30,18))
for i in range(len(feats)):
    plt.subplot(3,5,i+1)
    overlapping_hist(open_df,closed_df,feats[i])

In [None]:
# Plot histograms of all features in full training dataset
train_X.hist(bins=50, figsize=(20,15))