In [82]:
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn import feature_selection
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler, RobustScaler, Imputer, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from IPython.core.pylabtools import figsize 
%matplotlib inline

In [2]:
RANDOM_STATE = 42

In [13]:
hum = pd.read_csv("data/humsavar_gt.tab.gz", sep="\t")
hum.rename(columns={"Type of variant HumsaVar": "TYPE"}, inplace=True)
hum["MUTANT"] = hum.MUTANT.str.rstrip()
hum.drop_duplicates(subset="MUTANT", inplace=True)

In [14]:
hum.TYPE.value_counts()

Disease         4661
Polymorphism    1983
Unclassified     607
Name: TYPE, dtype: int64

In [86]:
model_dict = {
    'lr':{'pipe':('lr', LogisticRegression(random_state=RANDOM_STATE)),
          'params':[{'lr__C' : [.001, .01, .1, 1, 10, 100, 1000], 'lr__class_weight':[None, 'balanced']}],
          'name':'LogisticRegression'
    },
    'rf':{'pipe':('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
          'params':[{'rf__max_depth':[3,5,7], 'rf__n_estimators':[10,50,100], 'rf__max_features':[4,'sqrt',0.2]}],
          'name':'Random Forest'
    },
    'svc':{'pipe':('svc', SVC(kernel='rbf', random_state=RANDOM_STATE)),
           'params':[{'svc__C':[0.001, 0.01, 0.1, 1, 10], 'svc__gamma':[0.001, 0.01, 0.1, 1]}],
           'name': 'Support Vector Classifier'
    }
}

In [16]:
dataset = hum.drop(['3DID', 'PDB'], 1) \
    .replace({"ACTIVE_SITE": {"BINDING": 1, np.nan: 0}}) \
    .set_index("MUTANT")

unclassified_index = dataset[dataset.TYPE == "Unclassified"].index
dataset_disease_index = dataset[dataset.TYPE == "Disease"].index
dataset_poly_index = dataset[dataset.TYPE == "Polymorphism"].index

#### 50-50% en el set de entrenamiento y 60-40% en set de evaluación respetando humsavar

In [64]:
train = pd.concat([
    dataset.loc[dataset_disease_index].sample(n=1500, random_state=RANDOM_STATE),
    dataset.loc[dataset_poly_index].sample(n=1500, random_state=RANDOM_STATE)
])

test = pd.concat([
    dataset.loc[dataset_disease_index.difference(train.index)].sample(n=323, random_state=RANDOM_STATE),
    dataset.loc[dataset_poly_index.difference(train.index)].sample(n=483, random_state=RANDOM_STATE)
])

In [65]:
X_train = train.drop("TYPE", 1)
y_train = train.TYPE
X_test = test.drop("TYPE", 1)
y_test = test.TYPE

In [66]:
y_test.value_counts()

Polymorphism    483
Disease         323
Name: TYPE, dtype: int64

In [67]:
y_train.value_counts()

Disease         1500
Polymorphism    1500
Name: TYPE, dtype: int64

#### Random Forest

In [94]:
algorithm = 'rf'
model = Pipeline([('imputer', Imputer(missing_values="NaN", strategy="median")), 
                  model_dict[algorithm]['pipe']])
param_list = [model_dict[algorithm]['params']][0]
gs = GridSearchCV(model, param_list, cv=3, n_jobs=2, scoring='roc_auc', verbose=1, refit=True)
le = LabelEncoder().fit(y_train)

In [69]:
gs.fit(X_train, le.transform(y_train))

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=2)]: Done  81 out of  81 | elapsed:    9.1s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impu...timators=10, n_jobs=1,
            oob_score=False, random_state=100, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid=[{'rf__max_depth': [3, 5, 7], 'rf__max_features': [4, 'sqrt', 0.2], 'rf__n_estimators': [10, 50, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [70]:
gs.best_score_

0.73360333333333327

In [71]:
rf = gs.best_estimator_

In [72]:
y_pred = rf.predict(X_test)

In [73]:
roc_auc_score(le.transform(y_test), y_pred)

0.68321699389137802

In [81]:
roc_curve(le.transform(y_test), y_pred)

(array([ 0.       ,  0.3250774,  1.       ]),
 array([ 0.        ,  0.69151139,  1.        ]),
 array([2, 1, 0]))

#### Y los unclassified?

In [80]:
pd.Series(le.inverse_transform(rf.predict(dataset.loc[unclassified_index].drop("TYPE", 1)))).value_counts()

Polymorphism    338
Disease         269
dtype: int64

#### Support Vector Classifier

In [100]:
algorithm = 'svc'
model = Pipeline([('imputer', Imputer(missing_values="NaN", strategy="median")), 
                  ('scale', RobustScaler()), 
                  model_dict[algorithm]['pipe']])
param_list = [model_dict[algorithm]['params']][0]
gs = GridSearchCV(model, param_list, cv=3, n_jobs=2, scoring='roc_auc', verbose=1, refit=True)
le = LabelEncoder().fit(y_train)

In [101]:
gs.fit(X_train, le.transform(y_train))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.3s
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   14.0s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid=[{'svc__gamma': [0.001, 0.01, 0.1, 1], 'svc__C': [0.001, 0.01, 0.1, 1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [103]:
gs.best_score_

0.6982693333333333

In [104]:
svc = gs.best_estimator_

In [105]:
y_pred = svc.predict(X_test)

In [106]:
roc_auc_score(le.transform(y_test), y_pred)

0.67905697748206828