## Explanation

After exploring the sklearn logistic regression options, the **final model was selected as: liblinear solver, L1 penalty, 13 features (left out n_C, n_H, n_P, MW), balanced class weights.**

Explorations in this notebook include: solver, penalty, and recursive feature elimination (RFE). Features were eliminated based on their score in RFE; lower scores mean the feature was more important to model accuracy.

**Future work:** A comparison to SVM would be useful. A gridsearch parameter exploration would also be useful.

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt


In [2]:
master_df = pd.read_csv('../datasets/MASTER_DF.csv')

In [3]:
# Reduced feature table: the selected feature columns of master_df, indexed by
# the 'PubChem' column. set_index() is chained onto the column selection so a
# new frame is produced instead of mutating a selection of master_df in place
# (inplace=True on a slice is discouraged by pandas and makes re-running the
# cell fragile).
feature_df = master_df[
    ['PubChem', 'dist',
     'enzyme_class_1', 'enzyme_class_2', 'enzyme_class_3', 'enzyme_class_4',
     'enzyme_class_5', 'enzyme_class_6', 'enzyme_class_7',
     'n_O', 'n_N', 'n_P', 'n_S', 'n_X', 'DoU']
].set_index('PubChem')
feature_df.head()

Unnamed: 0_level_0,dist,enzyme_class_1,enzyme_class_2,enzyme_class_3,enzyme_class_4,enzyme_class_5,enzyme_class_6,enzyme_class_7,n_O,n_N,n_P,n_S,n_X,DoU
PubChem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3394,0.0,1,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0
3394,0.511007,1,0,0,0,0,0,0,3.0,0.0,0.0,1.0,0.0,0.0
3578,0.0,1,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
3578,0.241667,0,1,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
3578,0.294605,0,0,0,1,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
# All 17 candidate feature columns, before any feature elimination.
full_feature = master_df[[
    'dist',
    'enzyme_class_1', 'enzyme_class_2', 'enzyme_class_3', 'enzyme_class_4',
    'enzyme_class_5', 'enzyme_class_6', 'enzyme_class_7',
    'n_C', 'n_H', 'n_O', 'n_N', 'n_P', 'n_S', 'n_X',
    'DoU', 'MW',
]]
full_feature.head()  # not the last expression of the cell, so nothing is displayed

# Feature matrix as a NumPy array and the 'reacts' labels as a plain list.
full_features = full_feature.to_numpy()
full_reactions = master_df['reacts'].tolist()

# 80/20 train/test split with a fixed seed so the split is reproducible.
(full_feature_train, full_feature_test,
 full_reaction_train, full_reaction_test) = train_test_split(
    full_features, full_reactions, test_size=0.20, random_state=42)

In [5]:
# Reduced feature matrix (from feature_df) and the matching 'reacts' labels.
features = feature_df.to_numpy()
reactions = master_df['reacts'].tolist()

# Same 80/20 split and seed as the full-feature split.
(feature_train, feature_test,
 reaction_train, reaction_test) = train_test_split(
    features, reactions, test_size=0.20, random_state=42)

In [7]:
from sklearn.preprocessing import StandardScaler

In [10]:
# Standardise the full feature set.
# The scaler's statistics are learned from the training split only and then
# re-applied, unchanged, to the test split. Calling fit_transform() on the test
# split (as before) re-fits the scaler, so train and test would be scaled with
# different means/variances and test-set statistics would leak into evaluation.
feature_scaler = StandardScaler()
full_feature_train = feature_scaler.fit_transform(full_feature_train)
full_feature_test = feature_scaler.transform(full_feature_test)

In [13]:
from sklearn.model_selection import GridSearchCV

In [22]:
# First solver grid: the liblinear and saga solvers.
grid_param1 = dict(solver=['liblinear', 'saga'])

In [28]:
# Base estimator for the solver search: L2-penalised logistic regression with
# balanced class weights, a fixed seed and a raised iteration cap.
logistic = linear_model.LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    penalty='l2',
    random_state=1,
)

In [29]:
# Solver candidates searched for the L2-penalised model. This grid is also
# assigned in a later cell of the notebook (which was executed earlier, as
# In [27]); defining it here as well keeps Restart & Run All from failing with
# a NameError on the GridSearchCV call below.
grid_param2 = {
    'solver': ['newton-cg', 'lbfgs', 'sag'],
}

# 5-fold cross-validated search over the solvers, using all available cores.
gd_sr = GridSearchCV(logistic, grid_param2, cv=5, n_jobs=-1)

In [30]:
# Run the solver search on the complete (unsplit, unscaled) feature matrix.
# NOTE(review): fit() presumably returns the fitted search object itself, so
# `best_model` is the GridSearchCV instance rather than the winning estimator
# - confirm before using it as a model.
best_model = gd_sr.fit(X=full_features, y=full_reactions)

In [31]:
# Report the parameter combination chosen by the search.
best_search_params = gd_sr.best_params_
print(best_search_params)

{'solver': 'newton-cg'}


In [27]:
# Second solver grid: the newton-cg, lbfgs and sag solvers.
grid_param2 = dict(solver=['newton-cg', 'lbfgs', 'sag'])

_____

In [38]:
from sklearn.svm import SVR ##### do we want svr or svc?

In [39]:
# 'reacts' is handled as a class label throughout this notebook (balanced class
# weights, classifiers, accuracy), so the support-vector *classifier* is the
# matching estimator. SVR is a regressor and the grid search would rank its
# candidates with a regression score. SVC is imported in the first cell.
svclassifier = SVC()

In [40]:
# Kernel search space for the support-vector model, as a list of grids.
# 'degree' is only used by the polynomial kernel, so it is varied for 'poly'
# alone. Crossing it with every kernel (as a single dict does) fits each of the
# rbf / linear / sigmoid models three times with identical results.
grid_param3 = [
    {'kernel': ['rbf', 'linear', 'sigmoid']},
    {'kernel': ['poly'], 'degree': [3, 7, 10]},
]

In [43]:
# 5-fold cross-validated kernel search for the support-vector model, on all
# cores. This rebinds `gd_sr`, replacing the earlier logistic-regression search.
gd_sr = GridSearchCV(
    estimator=svclassifier,
    param_grid=grid_param3,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the kernel search on the complete (unsplit, unscaled) feature matrix.
# NOTE(review): fit() presumably returns the fitted search object itself, so
# `best_model` is the GridSearchCV instance rather than the winning estimator
# - confirm before using it as a model.
best_model = gd_sr.fit(X=full_features, y=full_reactions)

In [None]:
# Report the parameter combination chosen by the kernel search.
best_kernel_params = gd_sr.best_params_
print(best_kernel_params)

_______

In [None]:
# Base random-forest estimator for the grid search; the seed is fixed so
# repeated runs build the same forest.
rfc = RandomForestClassifier(
    random_state=9,
)

In [None]:
# Random-forest search space.
# 'auto' is intentionally absent from max_features: for classifiers it is an
# alias of 'sqrt' (so it only duplicated those candidates), and recent
# scikit-learn releases reject it.
param_grid4 = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# 5-fold cross-validated grid search over param_grid4 for the random forest.
# The previous x_train / y_train names are not defined anywhere in this
# notebook; the search is now fitted on the training split created earlier.
# NOTE(review): assumes the reduced-feature split (feature_train /
# reaction_train) is the intended training data - confirm.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid4, cv=5)
CV_rfc.fit(feature_train, reaction_train)

In [None]:
# Display the parameter combination selected by the random-forest grid search.
CV_rfc.best_params_

In [None]:
# Random forest built with hand-entered hyper-parameters.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by recent scikit-learn releases.
rfc1 = RandomForestClassifier(
    random_state=42,
    max_features='sqrt',
    n_estimators=200,
    max_depth=8,
    criterion='gini',
)

In [None]:
# rfc1 has to be fitted before it can predict; train it on the training split,
# then predict labels for the held-out test split. The previous x_test name is
# not defined anywhere in this notebook.
# NOTE(review): assumes the reduced-feature split (feature_train /
# feature_test) is the intended data - confirm.
rfc1.fit(feature_train, reaction_train)
pred = rfc1.predict(feature_test)

In [None]:
# Accuracy of the random-forest predictions against the held-out test labels.
# The previous y_test name is not defined anywhere in this notebook;
# reaction_test holds the labels of the test split. The message now says
# "test data" because the score is computed on the test split, not on CV folds.
print("Accuracy for Random Forest on test data: ", accuracy_score(reaction_test, pred))