### The purpose of this notebook is to complete a data cleaning workflow from start to finish in order to validate the core functionality of our package

#### TO DO:
- Add in complete PubChem data
- Write PubChem function
- Organize code modules & tests
- Clean up/finish writing tests
- Write main script wrapper function


In [2]:
# imports

from core import *
from cpd_info import *
from mol_sim_copy import *
from pubchem_client import *

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

## Step 1

#### Read in master dataframe



In [3]:
# Read the master table of enzyme-compound pairs from a path relative to this notebook.
# Per the preview below, it already carries the 'reacts' label, 'dist', the
# enzyme_class_* dummy columns and the compound descriptor columns.

master_df = pd.read_csv('../datasets/MASTER_DF.csv') 
# Optional filters, currently disabled:
# master_df = master_df[master_df['reacts'] == 1.0]    # select only the reactive enzyme - compound pairs
# master_df = master_df[['entry', 'PubChem', 'SMILES']]
print(master_df.shape)
master_df.head()

(6879, 24)


Unnamed: 0,entry,product,reacts,PubChem,SMILES,Mol,Fingerprint,dist,enzyme_class_1,enzyme_class_2,...,enzyme_class_7,n_C,n_H,n_O,n_N,n_P,n_S,n_X,DoU,MW
0,1.8.99.5,C00094,1.0,3394,OS(=O)O,<rdkit.Chem.rdchem.Mol object at 0x1ac9b8a210>,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.0,1,0,...,0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,82.08
1,1.13.11.18,C00094,1.0,3394,OS(=O)O,<rdkit.Chem.rdchem.Mol object at 0x1ac9b8a580>,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.511007,1,0,...,0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,82.08
2,1.8.99.5,C00283,1.0,3578,S,<rdkit.Chem.rdchem.Mol object at 0x1ac9b8ac10>,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.0,1,0,...,0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,34.083
3,2.8.1.2,C00283,1.0,3578,S,<rdkit.Chem.rdchem.Mol object at 0x1ac9b8a2b0>,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.241667,0,1,...,0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,34.083
4,4.4.1.28,C00283,1.0,3578,S,<rdkit.Chem.rdchem.Mol object at 0x1ac9b8a120>,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0.294605,0,0,...,0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,34.083


## Step 2 
#### Get query SMILES string & pair query compound with each unique enzyme in the master DataFrame
Example: PubChem SID 3480

In [4]:
# function to query the SMILES string and append new pairs to the master dataframe

def pair_query_compound(master_df, enzyme_col, pubchem_col, smiles_col, pubchem_sid):
    """
    Pair a query compound with every unique enzyme in the master dataframe.

    The SMILES string for ``pubchem_sid`` is looked up with ``sid_to_smiles``;
    one new row per unique enzyme id is then appended below the existing
    enzyme / compound pairs.

    Args:
        master_df (pandas.DataFrame): master dataframe containing enzyme ids
        enzyme_col (str): column containing enzyme id
        pubchem_col (str): column containing pubchem sid
        smiles_col (str): column containing SMILES string
        pubchem_sid (str): query PubChem sid

    Returns:
        pandas.DataFrame: the three selected columns of ``master_df`` followed
            by one row per unique enzyme for the query compound. Original index
            labels are kept, so the appended rows restart at 0.

    Raises:
        ValueError: if no SMILES string could be retrieved for ``pubchem_sid``
    """
    known_pairs = master_df[[enzyme_col, pubchem_col, smiles_col]]

    smiles, _ = sid_to_smiles(pubchem_sid)
    # `not smiles` covers both an empty result and None; a plain string cannot
    # be raised in Python 3, so signal the failure with a real exception type.
    if not smiles:
        raise ValueError('query compound SMILES string could not be retrieved')

    new_pairs = [
        {enzyme_col: enzyme, pubchem_col: pubchem_sid, smiles_col: smiles}
        for enzyme in known_pairs[enzyme_col].unique().tolist()
    ]
    new_pairs_df = pd.DataFrame(new_pairs)
    return pd.concat((known_pairs, new_pairs_df), axis=0, sort=False)


In [5]:
# pair_query_compound(): append one row per unique enzyme for the query compound.
# The query SID is passed as the string '3480'; the later filter on 'PubChem'
# compares against the same string.

updated_df = pair_query_compound(master_df, 'entry', 'PubChem', 'SMILES', '3480')
print(updated_df.shape)
updated_df.head()

(7395, 3)


Unnamed: 0,entry,PubChem,SMILES
0,1.8.99.5,3394,OS(=O)O
1,1.13.11.18,3394,OS(=O)O
2,1.8.99.5,3578,S
3,2.8.1.2,3578,S
4,4.4.1.28,3578,S


## Step 3
#### Calculate molecular distances between products of the same enzyme
Positive and negative data must be processed separately because the calculated molecular distance is the average of the distances between all products of a given enzyme. The model rests on the expectation that these average distances are smaller for reactive enzyme-product pairs than for non-reactive enzyme-product pairs.

In [None]:
# calculate_dist(): compute the molecular distance column for every row of updated_df
# NOTE(review): this cell was labelled "positive data", but the reacts == 1.0 filter in
# the loading cell is commented out, so updated_df presumably holds all pairs plus the
# query rows — confirm which subset is intended here.

distance_df = calculate_dist(updated_df)
print(distance_df.shape)
distance_df.head()

## Step 4
#### Get dummy variables to represent enzyme class
We expect that many enzyme properties could be predictive features for this model. Enzyme class should encapsulate many of these features at a high level.

In [None]:
# Keep only the rows belonging to the query compound.
# The comparison is against the string '3480', i.e. the same value that was passed
# to pair_query_compound() when the query rows were created.

reduced_df = distance_df[distance_df['PubChem'] == '3480']

In [None]:
# binarize_enzyme_class(): presumably adds the enzyme_class_* dummy columns derived
# from the 'entry' column — confirm against the helper's definition.

query_df = binarize_enzyme_class(reduced_df, 'entry')
# Relabel rows 0..n-1; the index-based merge with the prediction table further down
# relies on this positional index.
query_df = query_df.reset_index(drop=True)
print(query_df.shape)
query_df.head()

## Step 5
#### Add in compound features with RDKit
This step uses the RDKit package to generate descriptive features of the reaction product compounds

In [None]:
# create_cpd_info(): presumably appends the compound descriptor columns
# (n_C ... MW) that are selected as model features below — confirm against the helper.
query_df = create_cpd_info(query_df)
print(query_df.shape)
query_df.head()

## Step 6
#### Re-Instantiate model

In [None]:
# Model inputs: average molecular distance, enzyme-class dummies and compound descriptors.
# The same columns, in the same order, are selected from query_df at prediction time.
feature_df = master_df[['dist', 'enzyme_class_1', 'enzyme_class_2', 'enzyme_class_3',
       'enzyme_class_4', 'enzyme_class_5', 'enzyme_class_6', 'enzyme_class_7',
       'n_C', 'n_H', 'n_O', 'n_N', 'n_P', 'n_S', 'n_X', 'DoU', 'MW']]
# Convert with pandas itself: numpy is never imported in this notebook, so `np` was
# only available if a wildcard import happened to export it.
features = feature_df.values
reactions = list(master_df['reacts'])
# The held-out 20% (feature_test / reaction_test) is not used in this cell.
feature_train, feature_test, reaction_train, reaction_test = train_test_split(features, reactions,
                                                  test_size=0.20, random_state=42)
model_1 = linear_model.LogisticRegression(solver='liblinear', penalty='l1', random_state=1)
# reaction_train is already a flat list of labels, so no ravel is needed.
model_1.fit(feature_train, reaction_train)

## Step 7 
#### Use model to predict reactivity of pairs

In [None]:
# Select the query features: the same columns, in the same order, as the
# feature_df used to fit model_1.

query_feat_df = query_df[['dist', 'enzyme_class_1', 'enzyme_class_2', 'enzyme_class_3',
       'enzyme_class_4', 'enzyme_class_5', 'enzyme_class_6', 'enzyme_class_7',
       'n_C', 'n_H', 'n_O', 'n_N', 'n_P', 'n_S', 'n_X', 'DoU', 'MW']]

In [None]:
# Hard class label for every enzyme / query-compound pair.
# Swap query_feat_df for whichever feature table you want to score.
predictions = model_1.predict(query_feat_df)

# Per-class probabilities, one row per pair; column order follows model_1.classes_.
pred = model_1.predict_proba(query_feat_df)
prediction_values = pd.DataFrame(pred)

# Expose the first two probability columns under the string labels '0' and '1'.
model_descriptive_df = pd.DataFrame({
    '0': prediction_values[0],
    '1': prediction_values[1],
})

In [None]:
# Join the probability columns to the query rows by index label.
# NOTE(review): assumes query_df still carries the 0..n-1 index set by reset_index()
# earlier — confirm that create_cpd_info() preserves it.
prediction_df = pd.merge(model_descriptive_df, query_df, left_index=True, right_index=True) 
print(prediction_df.shape)
prediction_df.head()

In [None]:
# Rank the enzymes by the probability stored in column '1', highest first.
prediction_df = prediction_df.sort_values(by=['1'], ascending=False)
prediction_df.head()

## Out of curiosity:
#### Examine average molecular distance distributions for negative and positive data
At first glance, it appears that our hypothesis is correct: the distributions of average molecular distance are qualitatively different between the positive and negative datasets

In [None]:
# Compare the distribution of average molecular distance for positive (reactive)
# and negative (non-reactive) enzyme-compound pairs.
# Both panels previously plotted the identical series, so they could never differ.
# master_df is used here because it carries both the 'reacts' label and 'dist'.
# NOTE(review): every row whose 'reacts' is not 1.0 is treated as negative — confirm.

is_reactive = master_df['reacts'] == 1.0
positive_dist = master_df.loc[is_reactive, 'dist']
negative_dist = master_df.loc[~is_reactive, 'dist']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# sns.distplot is kept for compatibility with the seaborn version this notebook was
# written against; newer seaborn releases deprecate it in favour of histplot.
pos = sns.distplot(positive_dist, bins=50, kde=False, ax=axes[0])
axes[0].set_title('positive data avg. mol. dist.')

neg = sns.distplot(negative_dist, bins=50, kde=False, ax=axes[1])
axes[1].set_title('negative data avg. mol. dist.')

# Same x-range on both panels so the two histograms are directly comparable.
for axis in axes:
    axis.set_xlim([0.0, 1.0])