In [2]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Imputer, LabelEncoder
from datetime import datetime

def timer(start_time=None):
    """Simple stopwatch helper.

    Call with no argument (or any falsy value) to start timing: the current
    ``datetime`` is returned. Call again with that value to print the elapsed
    wall-clock time as hours / minutes / seconds; in that case nothing is
    returned.
    """
    # Start mode: hand back the reference timestamp.
    if not start_time:
        return datetime.now()

    # Stop mode: break the elapsed seconds down into h / m / s and report them.
    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    report = '\n Time taken: %i hours %i minutes and %s seconds.'
    print(report % (hours, minutes, round(seconds, 2)))


In [3]:
# Seed handed to RandomizedSearchCV so the sampled parameter combinations are repeatable.
RANDOM_STATE = 42
# Relative paths to the project's data folders. Only DATA_PROCESSED is read by
# the cells visible here (the per-split train/test CSVs live under it).
DATA_EXTERNAL = "../data/external/"
DATA_PROCESSED = "../data/processed/"
DATA_INTERIM = "../data/interim/"

In [4]:
# Registry of candidate models, keyed by a short algorithm identifier.
# Each entry carries:
#   'pipe'   - a (step_name, estimator) tuple
#   'params' - a one-element list wrapping the hyperparameter space to search
#   'name'   - a human-readable label
# NOTE(review): `silent` and `nthread` are legacy XGBoost argument names; newer
# releases spell them `verbosity` and `n_jobs` - confirm against the installed version.
model_dict = {
    'xgb': {
        'pipe': (
            'xgb',
            XGBClassifier(
                learning_rate=0.01,
                n_estimators=600,
                objective='binary:logistic',
                silent=True,
                nthread=1,
            ),
        ),
        'params': [
            {
                'min_child_weight': [1, 5, 10],
                'gamma': [0.5, 1, 1.5, 2, 5],
                'subsample': [0.6, 0.8, 1.0],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [3, 4, 5],
            },
        ],
        'name': 'Xtreme Gradient Boosting',
    },
}

In [5]:
# Search budget: number of CV folds and number of sampled parameter combinations.
folds, param_comb = 5, 20

# Look up the estimator and its parameter space for the selected algorithm.
algorithm = 'xgb'
model = model_dict[algorithm]['pipe'][1]        # second item of the (name, estimator) tuple
param_list = model_dict[algorithm]['params'][0]  # the single parameter dict inside the list

# Randomized hyperparameter search scored by ROC AUC; seeded so the sampled
# candidates are the same on every run.
rs = RandomizedSearchCV(
    model,
    param_distributions=param_list,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=3,
    cv=folds,
    verbose=1,
    random_state=RANDOM_STATE,
)

In [None]:
# Run the randomized search on every pre-generated train/test split and record,
# per split, the best cross-validated ROC AUC and the ROC AUC on the held-out
# test file.
#
# Changes versus the earlier version of this cell:
#   * the best estimator is bound to `best_model` instead of `xgb`, so the
#     `xgb` module imported at the top of the notebook is no longer overwritten;
#   * `drop(columns="TYPE")` replaces `drop("TYPE", 1)`, because passing `axis`
#     positionally is rejected by pandas 2.x;
#   * the number of splits is named instead of being a bare literal.
n_splits = 20  # files train_0..train_19 / test_0..test_19 are read below

xgb_train_scores = []  # rs.best_score_ per split (mean CV score of the best candidate)
xgb_test_scores = []   # ROC AUC of the refit best estimator on the split's test file
for i in range(n_splits):
    train = pd.read_csv(DATA_PROCESSED + "train_test_sets/train_{}.csv".format(i), index_col="MUTANT")
    test = pd.read_csv(DATA_PROCESSED + "train_test_sets/test_{}.csv".format(i), index_col="MUTANT")

    # "TYPE" is the target column; everything else is used as features.
    X_train = train.drop(columns="TYPE")
    y_train = train.TYPE
    X_test = test.drop(columns="TYPE")
    y_test = test.TYPE

    # Encode labels using the training targets only, then reuse the same
    # mapping for the test targets.
    le = LabelEncoder().fit(y_train)
    rs.fit(X_train, le.transform(y_train))
    print("Parameters ({}): {}".format(i, rs.best_params_))
    # The message says "train", but rs.best_score_ is the mean cross-validated
    # score of the best parameter setting, not a score on the full training set.
    print("Score train ({}): {}".format(i, rs.best_score_))

    best_model = rs.best_estimator_
    y_pred = best_model.predict_proba(X_test)
    # Column 1 holds the probability of the class encoded as 1.
    test_score = roc_auc_score(le.transform(y_test), y_pred[:, 1])
    print("Score test ({}): {}".format(i, test_score))

    xgb_train_scores.append(rs.best_score_)
    xgb_test_scores.append(test_score)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
