## GBMs for t=1 to t=20
* GBM grid search using tsfresh time-series features computed in 'LNP_tsfresh_efficient_extract_select_PCA_regression.py'
* fit best model from grid search
* separate model for each prediction task (classification or regression) and each cross-validation fold
* save out predictions on the test data

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2
import xgboost as xgb

from scipy.stats import uniform, randint

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

### Settings

In [3]:
# ---------------------------------------------------------------------------
# Run configuration and data loading.
#
# Loads the PCA feature matrices, cell ids and targets for one
# (fit_method, fold) combination, then splits the training rows into a
# fitting subset and a validation subset based on the fold token embedded in
# each training cell id.
# ---------------------------------------------------------------------------
n_time = 20    # not referenced by the other cells visible in this notebook
n_latent = 32  # not referenced by the other cells visible in this notebook
fold = 'fold5'          # used in the file names and to select the validation rows
fit_method = 'regress'  # 'classify' selects the classifier; anything else the regressor

# Every input file shares this directory and the '_<fit_method>_<fold>' suffix.
data_dir = '/scratch-shared/phil/LNP/LNP_data_09/'
file_suffix = '_' + fit_method + '_' + fold

# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and later
# removed from pandas; both return the frame as a NumPy array.
# The first CSV column is dropped (presumably the saved row index -- confirm
# against the script that wrote these files).
X_train_pca = pd.read_csv(data_dir + 'tsfresh_efficient_pca_train_features' + file_suffix + '.csv')
X_train_pca = X_train_pca.values
X_train_pca = X_train_pca[:, 1:]

X_test_pca = pd.read_csv(data_dir + 'tsfresh_efficient_pca_test_features' + file_suffix + '.csv')
X_test_pca = X_test_pca.values
X_test_pca = X_test_pca[:, 1:]

train_ids = np.load(data_dir + 'train_cell_ids' + file_suffix + '.npy')

train_y = np.load(data_dir + 'train_cell_gfp' + file_suffix + '.npy')
test_y = np.load(data_dir + 'test_cell_gfp' + file_suffix + '.npy')

# Split the training rows: an id whose token between 'train/' and the next
# '_' equals `fold` goes to the validation subset, every other row is used
# for fitting.
train_index = []
valid_index = []

for i, cell_id in enumerate(train_ids):
    id_fold = cell_id.split('train/')[1].split('_')[0]
    if id_fold == fold:
        valid_index.append(i)
    else:
        train_index.append(i)

X_train_pca_train = X_train_pca[train_index]
train_y_train = train_y[train_index]

X_train_pca_valid = X_train_pca[valid_index]
train_y_valid = train_y[valid_index]

# getting positive class weights for when in classification mode:
# scale_pos = (#negatives / #positives) among the fitting rows.
if fit_method == 'classify':
    class_gfp = train_y_train.astype('int64')
    n_pos = np.sum(class_gfp)
    scale_pos = (len(class_gfp) - n_pos) / n_pos
    print('pos weight = ' + str(scale_pos))


  import sys
  # This is added back by InteractiveShellApp.init_path()


### Grid search

In [4]:
# ---------------------------------------------------------------------------
# Random hyperparameter search: 200 independent draws, each fitted on the
# fitting subset with early stopping monitored on the validation subset.
# One row per draw is collected and assembled into `res_df` at the end.
#
# NOTE: the score column is called 'min_rmse' for both fit methods; when
# fit_method == 'classify' it holds the minimum validation logloss instead.
# ---------------------------------------------------------------------------
col_names = ['colsamp', 'gamma', 'lr', 'max_d', 'n_est', 'subsamp', 'min_rmse']

# Loop invariants, computed once.
n_est = 300 # now using early stopping so this is just an upper limit
eval_metric = "logloss" if fit_method == 'classify' else "rmse"
eval_set = [(X_train_pca_valid, train_y_valid)]

search_rows = []

for repl in range(200):
    # Draw order (colsamp, gamma, lr, max_d, subsamp) is kept as-is so that a
    # seeded NumPy RNG reproduces the same candidates as before.
    colsamp = np.random.uniform(0.3, 0.7)
    gamma = np.random.uniform(0.0, 0.5)
    lr = np.random.uniform(0.03, 0.3)
    max_d = np.random.randint(2, 6)  # upper bound exclusive: depths 2..5
    subsamp = np.random.uniform(0.4, 0.6)

    # Keyword arguments shared by the classifier and the regressor.
    model_params = dict(colsample_bytree=colsamp,
                        gamma=gamma,
                        learning_rate=lr,
                        max_depth=max_d,
                        n_estimators=n_est,
                        subsample=subsamp,
                        n_jobs=3)

    if fit_method == 'classify':
        xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                                      scale_pos_weight=scale_pos,
                                      **model_params)
    else:
        xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                                     **model_params)

    xgb_model.fit(X_train_pca_train, train_y_train, early_stopping_rounds=10, eval_metric=eval_metric,
                  eval_set=eval_set, verbose=0)

    # Score of this draw = best (lowest) value of the monitored metric on the
    # validation subset over all boosting rounds that were run.
    val_losses = xgb_model.evals_result()
    best_score = min(val_losses['validation_0'][eval_metric])

    search_rows.append([colsamp, gamma, lr, max_d, n_est, subsamp, best_score])

# Build the frame once instead of calling DataFrame.append per iteration
# (quadratic, and removed from recent pandas). Casting to float keeps every
# column float-typed, as when rows were filled from a np.zeros buffer.
res_df = pd.DataFrame(np.asarray(search_rows, dtype=float), columns=col_names)

### Print out 5 best models selected via grid search

In [5]:
# Rank the search results from best (lowest score) to worst and show the top 5.
# The search cell stores its score under 'min_rmse' for both fit methods (for
# 'classify' that column holds the minimum validation logloss), so the sort
# key is the same in either mode; sorting by 'min_val_loss' raised a KeyError
# because no such column is ever created.
res_df = res_df.sort_values(ascending=True, by='min_rmse')
# reset_index() keeps the original draw number as an 'index' column and makes
# label 0 refer to the best row.
res_df = res_df.reset_index()
res_df.head(5)

Unnamed: 0,index,colsamp,gamma,lr,max_d,n_est,subsamp,min_rmse
0,149,0.672299,0.481209,0.105425,4.0,300.0,0.44465,0.566725
1,99,0.533801,0.300534,0.05664,5.0,300.0,0.428597,0.567073
2,143,0.599805,0.336235,0.040526,5.0,300.0,0.446401,0.568722
3,102,0.671026,0.403088,0.040559,5.0,300.0,0.505051,0.571376
4,96,0.609975,0.296452,0.165932,4.0,300.0,0.510807,0.571583


### Train best model from above and make predictions on test set

In [6]:
# Refit the top-ranked configuration (row labelled 0 of the sorted results)
# and score it on the held-out test features.
best_row = res_df.loc[0]

colsamp = best_row['colsamp']
gamma = best_row['gamma']
lr = best_row['lr']
subsamp = best_row['subsamp']
# Stored as floats in the results table; xgboost needs integers here.
max_d = int(best_row['max_d'])
n_est = int(best_row['n_est'])

# Keyword arguments common to both estimator types.
shared_params = {
    'colsample_bytree': colsamp,
    'gamma': gamma,
    'learning_rate': lr,
    'max_depth': max_d,
    'n_estimators': n_est,
    'subsample': subsamp,
    'n_jobs': 3,
}

if fit_method != 'classify':
    eval_metric = "rmse"
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', **shared_params)
else:
    eval_metric = "logloss"
    xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                                  scale_pos_weight=scale_pos,
                                  **shared_params)

# Early stopping is monitored on the validation subset, as in the search.
eval_set = [(X_train_pca_valid, train_y_valid)]

xgb_model.fit(
    X_train_pca_train,
    train_y_train,
    early_stopping_rounds=10,
    eval_metric=eval_metric,
    eval_set=eval_set,
    verbose=0,
)

# Predict on the test features and persist the predictions for this
# (fit_method, fold) combination.
y_pred = xgb_model.predict(X_test_pca)
np.save('/scratch-shared/phil/LNP/LNP_data_09/tsf_efficient_test_pred_' + fit_method + '_' + fold + '.npy', y_pred)

# Report root-mean-squared error of the predictions against the test targets.
test_rmse = np.sqrt(mean_squared_error(test_y, y_pred))
print('rmse for test data')
print(np.round(test_rmse, decimals=3))

rmse for test data
0.626
