In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt


In [10]:
# Load the gzipped MetaCyc master table into a DataFrame.
master_df = pd.read_csv(
    '../../../big-datasets/master_dataframe_metacyc.csv.gz',
    compression='gzip',
)

In [11]:
# Remove the 'Fingerprint' column from the working frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
master_df = master_df.drop(columns='Fingerprint', errors='ignore')

In [12]:
# Remove the 'Promiscuous' column from the working frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
master_df = master_df.drop(columns='Promiscuous', errors='ignore')

In [13]:
# Preview the first rows of the cleaned frame. 'PubChemID' is intentionally
# left as an ordinary column here; it is only promoted to the index on the
# derived feature_df further down.
master_df.head()

Unnamed: 0,Enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,n_O,n_N,n_P,...,n_DoU,MW,Dist,enzyme_class_1,enzyme_class_2,enzyme_class_3,enzyme_class_4,enzyme_class_5,enzyme_class_6,enzyme_class_7
0,1.14.14.80,CPD-10515,1.0,25201835,CCCCCCCCC(C(CCCCCCCC(=O)[O-])O)O,18,35,4,0,0,...,1,315.474,0.714718,1,0,0,0,0,0,0
1,1.14.14.80,PALMITATE,1.0,504166,CCCCCCCCCCCCCCCC(=O)[O-],16,31,2,0,0,...,1,255.422,0.714718,1,0,0,0,0,0,0
2,1.14.14.80,OLEATE-CPD,1.0,5460221,CCCCCCCCC=CCCCCCCCC(=O)[O-],18,33,2,0,0,...,2,281.46,0.714718,1,0,0,0,0,0,0
3,1.14.14.80,STEARIC_ACID,1.0,3033836,CCCCCCCCCCCCCCCCCC(=O)[O-],18,35,2,0,0,...,1,283.476,0.714718,1,0,0,0,0,0,0
4,1.14.14.80,CPD-10514,1.0,19746553,CCCCCCCCC1C(O1)CCCCCCCC(=O)[O-],18,33,3,0,0,...,2,297.459,0.714718,1,0,0,0,0,0,0


In [16]:
# Reduced feature table: 'Dist', the seven enzyme_class_* columns and the
# n_O / n_N / n_P / n_S / n_X / n_DoU columns, indexed by PubChemID.
feature_columns = [
    'PubChemID',
    'Dist',
    'enzyme_class_1', 'enzyme_class_2', 'enzyme_class_3', 'enzyme_class_4',
    'enzyme_class_5', 'enzyme_class_6', 'enzyme_class_7',
    'n_O', 'n_N', 'n_P', 'n_S', 'n_X', 'n_DoU',
]
# Selecting with a list of labels yields a new frame, so chaining set_index
# gives the same result as mutating that frame in place afterwards.
feature_df = master_df[feature_columns].set_index(keys=['PubChemID'])
feature_df.head()

Unnamed: 0_level_0,Dist,enzyme_class_1,enzyme_class_2,enzyme_class_3,enzyme_class_4,enzyme_class_5,enzyme_class_6,enzyme_class_7,n_O,n_N,n_P,n_S,n_X,n_DoU
PubChemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
25201835,0.714718,1,0,0,0,0,0,0,4,0,0,0,0,1
504166,0.714718,1,0,0,0,0,0,0,2,0,0,0,0,1
5460221,0.714718,1,0,0,0,0,0,0,2,0,0,0,0,2
3033836,0.714718,1,0,0,0,0,0,0,2,0,0,0,0,1
19746553,0.714718,1,0,0,0,0,0,0,3,0,0,0,0,2


# Wider feature table: everything in the reduced set plus 'n_C', 'n_H' and 'MW'.
# The unsaturation column is selected as 'n_DoU', the label shown in the
# master_df preview and used for feature_df; the former 'DoU' label does not
# appear among the visible columns and would raise KeyError.
# NOTE(review): confirm master_df has no separate 'DoU' column.
full_feature = master_df[['Dist', 'enzyme_class_1', 'enzyme_class_2', 'enzyme_class_3',
       'enzyme_class_4', 'enzyme_class_5', 'enzyme_class_6', 'enzyme_class_7',
       'n_C', 'n_H', 'n_O', 'n_N', 'n_P', 'n_S', 'n_X', 'n_DoU', 'MW']]
full_feature.head()

# Plain ndarray of the feature values and a list of the 'reacts' labels,
# taken in the same row order as master_df.
full_features = np.array(full_feature)
full_reactions = list(master_df['reacts'])

# 80/20 train/test split with a fixed seed so the partition is reproducible.
full_feature_train, full_feature_test, full_reaction_train, full_reaction_test = train_test_split(full_features, full_reactions,
                                                    test_size=0.20, random_state=42)

In [17]:
# Feature matrix (values of feature_df, index dropped) and the matching
# 'reacts' labels, both in master_df row order.
features = np.array(feature_df)
reactions = list(master_df['reacts'])

# Hold out 20% of the rows; the seed is fixed so the split is repeatable.
feature_train, feature_test, reaction_train, reaction_test = train_test_split(
    features,
    reactions,
    test_size=0.20,
    random_state=42,
)

In [18]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Standardise the reduced feature matrix.
# The scaler is fitted on the training split only and then *applied* to the
# test split. Re-fitting on the test split (as fit_transform would) scales the
# two splits with different statistics and leaks test-set information.
feature_scaler = StandardScaler()
full_feature_train = feature_scaler.fit_transform(feature_train)
full_feature_test = feature_scaler.transform(feature_test)
# NOTE(review): the inputs are the *reduced* feature splits, yet the results
# are stored under the full_feature_* names, replacing any earlier values held
# there. The names are kept unchanged for compatibility; confirm the intent.
fft = full_feature_test  # scratch name retained; it ended on the scaled test split before too

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Candidate solvers for a logistic-regression grid search.
grid_param1 = dict(solver=['liblinear', 'saga'])

In [34]:
# Base estimator for the solver grid search: L2-penalised logistic regression
# with balanced class weights, a raised iteration cap and a fixed seed.
logistic = linear_model.LogisticRegression(
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=1,
)

In [35]:
# Solver candidates for this search. Defined in this cell so that the grid
# exists before GridSearchCV is constructed when the notebook is run top to
# bottom on a fresh kernel.
grid_param2 = {
    'solver': ['newton-cg', 'lbfgs', 'sag'],
}
# 5-fold cross-validated search over the solver choice, using all CPU cores.
gd_sr = GridSearchCV(logistic, grid_param2, cv=5, n_jobs=-1)

In [36]:
# Run the grid search on the complete, unscaled feature matrix and its labels.
# GridSearchCV.fit returns the search object itself, so best_model refers to
# the same object as gd_sr (the refitted estimator is gd_sr.best_estimator_).
# NOTE(review): the train/test split and the scaled arrays prepared earlier are
# not used here; confirm that searching on all rows is intended.
best_model = gd_sr.fit(features, reactions)

In [37]:
# Show the parameter combination with the best mean cross-validation score.
print(gd_sr.best_params_)

{'solver': 'newton-cg'}


In [31]:
# Solver candidates for the logistic-regression grid search.
grid_param2 = dict(solver=['newton-cg', 'lbfgs', 'sag'])

_____

In [38]:
from sklearn.svm import SVC ##### do we want svr or svc?

In [39]:
# Support-vector classifier with library-default settings; kernel and degree
# are supplied by the grid search that wraps it.
svclassifier = SVC()

In [40]:
# Search space for the SVC grid search, expressed as a list of sub-grids.
# 'degree' only affects the polynomial kernel, so it is varied for 'poly'
# alone. Crossing it with every kernel would train each of rbf / linear /
# sigmoid three times with identical results (8 redundant candidates, each
# fitted once per CV fold).
grid_param3 = [
    {'kernel': ['rbf', 'linear', 'sigmoid']},
    {'kernel': ['poly'], 'degree': [3, 7, 10]},
]

In [41]:
# 5-fold cross-validated search over the SVC grid, using all CPU cores.
# Note that this rebinds gd_sr, replacing the logistic-regression search.
gd_sr = GridSearchCV(
    estimator=svclassifier,
    param_grid=grid_param3,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the SVC grid search on the complete, unscaled feature matrix and labels.
# GridSearchCV.fit returns the search object itself, so best_model refers to
# the same object as gd_sr.
# NOTE(review): `features` is the unscaled array built from feature_df; confirm
# whether the standardised features were meant to be used for the SVC.
best_model = gd_sr.fit(features, reactions)

In [None]:
# Show the SVC parameter combination with the best mean cross-validation score.
print(gd_sr.best_params_)

_______

In [None]:
# Base random-forest estimator for the grid search; the seed is fixed so that
# repeated runs build the same forests.
rfc=RandomForestClassifier(random_state=9)

In [None]:
# Search space for the random-forest grid search.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated candidates), and it is deprecated and
# later rejected by newer scikit-learn releases.
param_grid4 = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# 5-fold cross-validated search over param_grid4 for the random forest.
# Fitted on the training split produced by train_test_split
# (feature_train / reaction_train); the former x_train / y_train names are not
# defined anywhere in this notebook. n_jobs=-1 matches the other grid searches.
# NOTE(review): confirm the reduced-feature training split is the intended input.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid4, cv=5, n_jobs=-1)
CV_rfc.fit(feature_train, reaction_train)

In [None]:
# Display the random-forest parameter combination with the best mean CV score.
CV_rfc.best_params_

In [None]:
# Random forest configured with hand-entered hyper-parameters.
# max_features='sqrt' replaces the former 'auto', which meant the same thing
# for classifiers but is deprecated and later rejected by newer scikit-learn.
# NOTE(review): these values are hard-coded rather than read from
# CV_rfc.best_params_, and the seed (42) differs from the one used for the
# searched estimator (9); confirm both are intended.
rfc1 = RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',
    max_depth=8,
    criterion='gini',
    random_state=42,
)

In [None]:
# Train the configured forest before predicting: rfc1 is only constructed
# above, and calling predict() on an unfitted estimator raises NotFittedError.
# The train/test arrays are the ones produced by train_test_split
# (feature_* / reaction_*); x_test is not defined in this notebook.
# NOTE(review): confirm the reduced-feature split is the intended data.
rfc1.fit(feature_train, reaction_train)
pred = rfc1.predict(feature_test)

In [None]:
# Report accuracy of the predictions against the held-out labels.
# reaction_test is the label list produced by train_test_split; y_test is not
# defined in this notebook.
# NOTE(review): the message says "CV data" although the labels compared here
# are the held-out split; wording left unchanged.
print("Accuracy for Random Forest on CV data: ", accuracy_score(reaction_test, pred))