In [8]:
# Import custom libraries
import classification_utils as clu
import md_utils as mdu
import glycan_bionames
# Import common libraries
import time
import plotly_express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import os, glob
import pandas as pd


In [58]:
# Set input variables
# Every feature CSV lives at <dataset dir>/results/FinalExtractedFeature_<n>.csv, so the
# paths are generated from the shared pieces instead of being spelled out one by one.
DATA_ROOT = '/net/jam-amaro-shared/dse_project/Spike_Dataset/'
OPEN_DIR = DATA_ROOT + 'TRAJECTORIES_spike_open_prot_glyc_amarolab/'
CLOSED_DIR = DATA_ROOT + 'TRAJECTORIES_spike_closed_prot_glyc_amarolab/'
FEATURE_CSV = 'results/FinalExtractedFeature_{}.csv'
N_OPEN = 6    # feature files available for the open dataset
N_CLOSED = 3  # feature files available for the closed dataset

open_fnames = [OPEN_DIR + FEATURE_CSV.format(n) for n in range(1, N_OPEN + 1)]
closed_fnames = [CLOSED_DIR + FEATURE_CSV.format(n) for n in range(1, N_CLOSED + 1)]

# fnames and is_open are parallel lists (1 = open, 0 = closed); deriving the labels from
# the two path lists keeps them in sync when a file is added or removed.
fnames = open_fnames + closed_fnames
is_open = [1] * len(open_fnames) + [0] * len(closed_fnames)

# Directory of the open-spike trajectories (passed to mdu.load_traj further down)
traj_dir = OPEN_DIR

In [None]:
def load_data(fnames, is_open):
    '''Load and concatenate all datasets.

    Parameters
    ----------
    fnames : list of str
        Files corresponding to the featuresets to use in training. Should include full path.
    is_open : list of int
        Labels for the corresponding entries of ``fnames``. 1 is open & 0 is closed.

    Returns
    -------
    pandas.DataFrame
        The rows of every file stacked in input order, restricted to the columns that
        all files share (``join='inner'``). Each file's own row index is kept, so index
        values repeat across files. Three columns are added: ``label`` and ``isopen``
        (both hold the file's label) and ``Replicant`` (identifies the source file).

    Raises
    ------
    ValueError
        If ``fnames`` and ``is_open`` differ in length, or (raised by ``pd.concat``)
        if ``fnames`` is empty.
    '''
    if len(fnames) != len(is_open):
        raise ValueError(
            'fnames and is_open must have the same length, got '
            + str(len(fnames)) + ' and ' + str(len(is_open)))

    dfs = []
    for fname, flag in zip(fnames, is_open):
        # The first column of the CSV is dropped by position (label is appended last,
        # so it survives the slice).
        df = pd.read_csv(fname).assign(label=flag).iloc[:, 1:]
        # Use the last three path components as the replicate id: the basename alone
        # leaves duplicates because CSVs in different datasets share the same names.
        df['Replicant'] = '/'.join(fname.split('/')[-3:])
        df['isopen'] = flag
        dfs.append(df)
    return pd.concat(dfs, join='inner')

In [76]:
# Test loading of data
# Step-by-step timing of the same operations load_data performs per file. After every
# step the seconds elapsed since the previous step are printed; the final print is the
# total since t0.
# Load data
t0 = time.time()
openlabels = ['Closed','Open']
dfs = []
# Only the first entry of fnames is read here (range(1)); the commented-out range would
# loop over all of them.
for f in range(1):# range(len(fnames)):
    # Read the CSV, append the label column, then drop the first column by position
    df = pd.read_csv(fnames[f]).assign(label = is_open[f]).iloc[:,1:]
    t1 = time.time(); print(t1-t0)
    # Replicate id = last three path components of the file name
    df['Replicant'] = '/'.join(fnames[f].split('/')[-3:])
    t2 = time.time(); print(t2-t1)
    # Same value as the 'label' column assigned above
    df['isopen'] = is_open[f]
    t3 = time.time(); print(t3-t2)
    dfs.append(df)
    t4 = time.time(); print(t4-t3)
    
# Keep only the columns shared by every loaded frame; df is reused by the cells below
df = pd.concat(dfs,join='inner')
    
# df = clu.load_data(fnames,is_open)
print(time.time()-t0)
#print(df['Replicant'].unique())

0.7967977523803711
0.001234292984008789
0.0004851818084716797
3.5762786865234375e-05
0.8084406852722168


In [60]:
# Test curation of features
# Replaces df with the result of clu.curate_feats(df) and prints the elapsed seconds.
# NOTE(review): df is overwritten, so re-running this cell passes the already-curated
# frame back into curate_feats — confirm that is harmless or reload df first.
t0 = time.time()
df = clu.curate_feats(df)
print(time.time()-t0)

16.641210079193115


In [74]:
def restrict_RBD_window(df,nm):
    '''Drop features of glycans which are outside a given RBD neighborhood (in nm).

    A glycan is identified by a column whose name contains 'RBD__2__GLY'. When the mean
    of its 'RBD__2__<glycan>' column is greater than ``nm``, that column is dropped
    together with the glycan's ':ROF', ':RMSD', '_x', '_y' and '_z' columns (whichever
    of them exist).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is modified in place.
    nm : float
        Cutoff on the mean of the 'RBD__2__<glycan>' column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the drop.
    '''
    prefix = 'RBD__2__'
    # Build the column lookup once; a set makes the membership checks below O(1).
    columns = set(df.columns)

    # Sorted unique glycan names (plain Python, so numpy is not required here)
    glycans = sorted({c.replace(prefix, '') for c in columns if 'RBD__2__GLY' in c})

    flist = []
    for g in glycans:
        if df[prefix + g].mean() > nm:
            candidates = [prefix + g, g + ':ROF', g + ':RMSD', g + '_x', g + '_y', g + '_z']
            # Not every glycan has every feature family; drop only what is present
            flist.extend(c for c in candidates if c in columns)
    df.drop(flist, axis=1, inplace=True)
    return df

In [75]:
# Parameters of the feature-selection steps
nm = 4                                                 # cutoff passed to restrict_RBD_window (nm)
feat_incl = ['_x','_y','_z','RBD__2__','ROF','RMSD']   # feature families to keep
corr_thresh = 0.5                                      # threshold passed to clu.remove_corr_feats

t0 = time.time()

# Limit RBD_window
# Uses the nm cutoff defined above; the name previously passed here was never assigned
# in this notebook, so the cell failed on a fresh kernel.
df = restrict_RBD_window(df,nm)

all_feats = ['_x','_y','_z','RBD__2__','ROF','RMSD']

t1 = time.time()
print(t1-t0)

# Remove highly correlated features
df = clu.remove_corr_feats(df,corr_thresh)

t2 = time.time()
print(t2-t1)

# Drop features user selected not to include
for f in all_feats:
    if f not in feat_incl:
        df=clu.drop_feats(df,f)

t3 = time.time()
print(t3-t2)

0.027315616607666016
0.4967517852783203
0.0007059574127197266


In [61]:
# Test model training
# Calls clu.train_sgd_model_new on the current df (its return value is discarded) and
# prints the elapsed seconds.
t0 = time.time()
clu.train_sgd_model_new(df)
print(time.time()-t0)

Train set : (41675, 199), Test set : (17861, 199)
0.6037869453430176


In [None]:
# Test model training
# Calls clu.train_sgd_model restricted to the coordinate, RMSD and ROF feature families,
# reports the four returned scores and then the elapsed seconds. df_feat (the fifth
# return value) is reused by the plotting cells below.
t0 = time.time()
tr_p, tr_r, ts_p, ts_r, df_feat = clu.train_sgd_model(
    df,
    feat_incl=['_x','_y','_z','RMSD','ROF'],
)
print(f'Train precision: {tr_p}, Train recall: {tr_r}')
print(f'Test precision: {ts_p}, Test recall: {ts_r}')
elapsed = time.time() - t0
print(elapsed)

In [None]:
# Trace of a single feature: takes the 'feats' value in the first row of df_feat, looks up
# its 'feat' element via glycan_bionames.get_elem and passes that to clu.trace_single_feat
# with the colour 'blue'.
fig = clu.trace_single_feat(df,glycan_bionames.get_elem(df_feat.iloc[0]['feats'],'feat'),'blue')
fig.show()

In [None]:
# Colour per replicate: one entry for each distinct (Replicant, isopen) pair, where
# isopen indexes into colors (0 -> 'red', 1 -> 'blue').
df_r = df[['Replicant','isopen']].drop_duplicates()
colors = ['red','blue']
cmap = {
    replicant: colors[flag]
    for replicant, flag in zip(df_r['Replicant'], df_r['isopen'])
}
cmap

In [None]:
# Empty frame that a later cell fills with one column per replicate
df_f = pd.DataFrame()

# Replicate -> colour lookup (isopen 0 -> 'red', 1 -> 'blue')
df_r = df[['Replicant','isopen']].drop_duplicates()
colors = ['red','blue']
cmap = {}
for replicant, flag in df_r.itertuples(index=False):
    cmap[replicant] = colors[flag]
    

In [None]:
# Feature to plot: the 'feats' entry in the first row of df_feat.
# NOTE(review): presumably the top-ranked feature — that depends on df_feat's current row
# order; confirm.
f = df_feat.iloc[0]['feats']

In [None]:
# Put the selected feature into df_f as one column per replicate
for replicant in df['Replicant'].unique():
    in_replicant = df['Replicant'] == replicant
    df_f[replicant] = df.loc[in_replicant, f]


# One line per replicate, coloured through cmap
fig = px.line(df_f, title=f, color_discrete_map=cmap)
fig.show()
#fig.update_layout(template='simple_white')

In [None]:
def plot_feature_importances(df_feats):
    '''Plot bar chart of feature importances.

    df_feats needs a 'feats' column and an 'importance' column. Each entry of 'feats' is
    passed to glycan_bionames.get_elem twice: with 'feat' for the x-axis value and with
    'chain' for the bar colour group. Returns the plotly figure with the x categories
    ordered by total ascending.
    '''
    feat_ids = df_feats['feats'].to_list()
    feat_names = [glycan_bionames.get_elem(fid,'feat') for fid in feat_ids]
    importances = df_feats['importance'].to_list()
    substructures = [glycan_bionames.get_elem(fid,'chain') for fid in feat_ids]

    # Fixed colour for each colour group
    palette = {
        'Monomer A':'royalblue',
        'Monomer B':'indianred',
        'Monomer C':'forestgreen',
        'Core':'orange',
        'RBD_CA0:ROF':'mediumpurple',
    }
    axis_labels = {'x':'Feature','y':'Importance','color':'Substructure'}

    bar_fig = px.bar(
        x=feat_names,
        y=importances,
        color=substructures,
        title='Important Features',
        color_discrete_map=palette,
        labels=axis_labels,
    )
    return bar_fig.update_xaxes(categoryorder='total ascending')
# Display the chart built by the notebook-local plot_feature_importances for df_feat
plot_feature_importances(df_feat)

In [None]:
# Test bar chart
# Uses the plot_feature_importances implementation from classification_utils (clu), not
# the function of the same name defined in this notebook.
fig1 = clu.plot_feature_importances(df_feat)
fig1.show()

In [None]:
# Test spike viz
# Runs the three md_utils steps (load, parse, visualise) and prints how long each took.

# Load
t0 = time.time()
traj = mdu.load_traj(traj_dir)
t1 = time.time()
load_secs = t1 - t0
print(f'{load_secs} sec to load')

# Parse
atom_id = mdu.parse_traj(traj)
t2 = time.time()
parse_secs = t2 - t1
print(f'{parse_secs} sec to parse')

# Viz
fig2 = mdu.viz_traj(traj, atom_id, df_feat, 'Open Spike', 'blue')
t3 = time.time()
viz_secs = t3 - t2
print(f'{viz_secs} sec to viz')
fig2.show()

In [None]:
# List the distinct Substructure labels in coord_df.
# NOTE(review): coord_df is only assigned in a later cell of this notebook, so this cell
# fails on a fresh top-to-bottom run — run it after that cell or move it below.
coord_df['Substructure'].unique()

In [None]:
# Manual step-through of the spike visualisation with the top features highlighted.
# NOTE: dfFeats and atom_id_LUP are aliases (not copies) of df_feat and atom_id, so the
# in-place sort and the selections added below also change those objects.
dfFeats = df_feat
atom_id_LUP = atom_id
title_str = 'Open Spike'
title_clr = 'blue'


# Get names of substructures
dfFeats.sort_values(by='importance',axis=0,inplace=True,ascending=False)
feats = []
bionames = {'sidechain':'Sidechain','RBD_CA':'RBD','CH_CA':'Central Helix','GLY':'Glycans','backbone':'Backbone'}
for i in dfFeats['feats']:
    try:
        # Segment names are G<n+1> while the bioname lookup uses GLY<n>
        gly_idx = int(gly_4m_featname(i))
        featname = f'G{gly_idx+1}'
        glyname = f'GLY{gly_idx}'
        bioname = glycan_bionames.get_elem(glyname,'position') + '_' + glycan_bionames.get_elem(glyname,'chain')
    except NameError:
        # gly_4m_featname is not defined in the visible cells; if it is missing, fail
        # loudly instead of silently producing a figure with no highlighted glycans.
        raise
    except Exception:
        # Best effort: skip features that cannot be resolved to a glycan
        continue
    # Register the feature only once its display name is known, so every entry of feats
    # is guaranteed to have a bionames entry for the renaming step below.
    feats.append(featname)
    bionames[featname] = bioname

# Atom selections for the five most important glycan features
for j in feats[:5]:
    name = 'segname ' + j
    atom_id_LUP[j] = traj.top.select(name)


# Prep most important features for visualization
keyNames =['sidechain','RBD_CA', 'CH_CA', 'GLY','backbone']+feats[:5]
coord_df = mdu.gen_xyz_Table_4_LUP(LUP=atom_id_LUP,traj=traj, keyNames =keyNames)

# Rename features to use bionames
coord_df['Substructure'] = coord_df.apply(lambda row: bionames[row['type']],axis=1)

# Add viz info - size and color
coord_df['marker_size'] = coord_df.apply(lambda row: mdu.assign_marker(row['type'],'size'),axis=1)
coord_df['marker_clr'] = coord_df.apply(lambda row: mdu.assign_marker(row['type'],'color'),axis=1)

# color_discrete_map has to be keyed by the values of the colour column (Substructure).
# Series.to_dict() is keyed by row index, which never matches, so the assigned colours
# were ignored.
substructure_clr = dict(zip(coord_df['Substructure'], coord_df['marker_clr']))

# Display most important features
fig1 = px.scatter_3d(coord_df, title=title_str, x='x', y='y', z='z',
          color='Substructure',width=800,height=800,opacity=0.5, template='simple_white',
                     size = 'marker_size', color_discrete_map = substructure_clr
            )
# Remove tick labels on all 3 axes
fig1.update_layout(scene=dict(xaxis=dict(showticklabels=False),
                         yaxis = dict(showticklabels=False),
                         zaxis = dict(showticklabels=False)),
                   title={'font':{'color':title_clr}})
fig1.show()

In [None]:
# Create custom colormap: background substructures in light grey, everything else in red
# TODO: pick the highlight colours from a plotly palette instead of a single 'red'
bkg = ['Backbone','Sidechain','Glycans','RBD','Central Helix']
cmap = {
    s: ('#d3d3d3' if s in bkg else 'red')
    for s in coord_df['Substructure'].unique()
}
cmap

In [None]:
# List the distinct replicate ids (unique must be called; without the parentheses the
# cell only shows the bound method)
df['Replicant'].unique()

In [None]:
# Reload the temporary dump (this replaces df) and show one glycan's RBD__2__ column for
# a single closed replicate.
# NOTE(review): the Replicant value below is in '<label>_<basename>' form, not the
# path-based form load_data writes — presumably the dump predates that change; confirm.
df = pd.read_csv('./current_tmp_df.csv')
closed_rep2 = df['Replicant'] == 'Closed_FinalExtractedFeature_2.csv'
df.loc[closed_rep2, 'RBD__2__GLY32']