In [None]:
import numpy as np
import pandas as pd

import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from pandas.tools.plotting import scatter_matrix
from project import utils

In [None]:
# Load the ensemble data (unpacked as X_train, X_test, Y) and append the
# first object returned by utils.load_data('flipped2') as extra columns.
X_train, X_test, Y = utils.load_ensemble_data(
    flip_columns=False,
    extended_columns=False,
    tsne=False,
    kmeans=True,
)
X1, _, _ = utils.load_data('flipped2')  # only the first of the three returned items is used
X_train = pd.concat([X_train, X1], axis=1)  # column-wise join, aligned on the index

In [None]:
# Display the column labels currently present in the training frame.
X_train.columns

In [None]:
# Split the loaded features into two pools:
#   X_train       - the starting feature set, in the order listed in `prev_best`
#   bonus_columns - every other column; these are the candidates that the
#                   feature-selection loop tries to add one at a time.
#
# NOTE: this cell overwrites X_train, so re-run the data-loading cell before
# re-running it; otherwise bonus_columns ends up empty.
prev_best = [
    '04_KNN_POWER_col_2', '04_KNN_POWER_col_3', '04_KNN_POWER_col_5',
    '04_KNN_POWER_col_6', '01_XGB_split', '08_NN_flip', '11_LGB_split',
    '02_XGB_flip', 'doReturnOnLowerLevels', 'last_lvl_completed',
    'clean_start',

    'totalScore2', 'totalNumOfAttempts',
    'numberOfBoostersUsed', 'f_16_inv',
    'averageNumOfTurnsPerCompletedLevel', 'f_old_7',
    # 'f_3',  (disabled)
    'totalBonusScore2', 'totalStarsCount2',
]

# Build both pools with a single selection / a single non-mutating drop
# instead of concatenating one column at a time and dropping in place from a
# frame that was only an alias of the loaded data.
bonus_columns = X_train.drop(prev_best, axis=1)
X_train = X_train[prev_best].copy()

In [None]:
# Hyper-parameters unpacked into the boosted-tree classifiers built from
# PARAMS in this notebook.
PARAMS = dict(
    # tree / boosting settings
    colsample_bytree=0.8,
    gamma=0.05,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=2,
    n_estimators=700,
    reg_alpha=0,
    reg_lambda=6.0,
    subsample=0.8,
    # runtime settings
    nthread=4,
    seed=1502,
    silent=True,
)

# Alternative configuration kept for reference (not active):
#   colsample_bytree=0.8, gamma=0.05, learning_rate=0.01, max_depth=3,
#   min_child_weight=2, n_estimators=569+70, reg_alpha=0.3,
#   reg_lambda=0.42, subsample=0.65

In [None]:
# Baseline: 20-fold cross-validated score of a LightGBM classifier on the
# starting feature set. Skipped when `prev_best` is empty.
if prev_best:
    # Adapt PARAMS for LGBMClassifier: num_leaves is derived from max_depth
    # (2 ** max_depth), and gamma / max_depth are removed before the call.
    # (The original cell also built an XGBClassifier here, but it was
    # overwritten before use, so only the LightGBM model is kept.)
    par = PARAMS.copy()
    par['num_leaves'] = 2 ** par.pop('max_depth')
    del par['gamma']
    model = lgb.LGBMClassifier(**par)

    scores = cross_val_score(
        model,
        X_train,
        Y,
        cv=20,
        scoring='neg_log_loss',
        fit_params={'eval_metric': 'logloss'},
    )
    # 'neg_log_loss' yields negated losses, so values closer to 0 are better.
    print(np.mean(scores), scores)

# Feature importance

In [None]:
# Fit a single XGBoost model on the current feature set, print each feature's
# importance and plot the importances. Uses the `xgb` and `plt` aliases
# imported at the top of the notebook rather than re-importing mid-notebook.
if prev_best:
    model = xgb.XGBClassifier(**PARAMS)
    model.fit(X_train, Y, eval_metric='logloss')

    # Ascending by importance; equal importances are ordered by column label.
    for importance, feature in sorted(zip(model.feature_importances_, X_train.columns)):
        print(importance, feature, sep='\t\t')

    xgb.plot_importance(model)
    plt.show()

# Feature selection

In [None]:
# Model family used by the feature-selection and feature-drop loops:
# 'xgb' builds xgb.XGBClassifier; any other value builds lgb.LGBMClassifier.
CLASSIFIER = 'xgb'

In [None]:
# Greedy forward feature selection.
#
# For up to 15 rounds, every remaining candidate in `bonus_columns` is
# appended to X_train in turn and scored with 5-fold stratified CV log loss.
# The candidate with the lowest mean loss (ties broken by the smaller std
# across folds) is moved from `bonus_columns` into X_train.
#
# NOTE: the best candidate of a round is added even if it does not improve on
# the previous round's score; inspect the printed scores to decide where to stop.
kf = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)

# cross_val_score clones the estimator for every fold, so a single unfitted
# instance can be shared by all evaluations instead of being rebuilt per column.
if CLASSIFIER == 'xgb':
    model = xgb.XGBClassifier(**PARAMS)
else:
    # Adapt PARAMS for LGBMClassifier: derive num_leaves from max_depth and
    # remove gamma / max_depth before the call.
    par = PARAMS.copy()
    par['num_leaves'] = 2 ** par['max_depth']
    del par['gamma']
    del par['max_depth']
    model = lgb.LGBMClassifier(**par)

for i in range(15):
    if bonus_columns.shape[1] == 0:
        # Candidate pool exhausted: nothing left to evaluate.
        print('No candidate columns left, stopping at iter', i)
        break

    # Infinite sentinels so that the first evaluated column always becomes
    # the current best/worst, whatever the magnitude of the log loss.
    best_score = float('inf')
    best_column = None
    best_std = float('inf')

    worst_score = float('-inf')
    worst_column = None
    for column in bonus_columns:
        temp_X_train = pd.concat((X_train, bonus_columns[column]), axis=1)
        scores = cross_val_score(model,
                                 temp_X_train,
                                 Y,
                                 cv=kf,
                                 scoring='neg_log_loss',
                                 fit_params={'eval_metric': 'logloss'})
        # Scores are negated log losses; flip the sign so lower is better.
        score = round(-np.mean(scores), 6)
        std = np.std(scores)
        print(column, ':\t', score, '\t', std, sep='')
        if (score < best_score) or (score == best_score and std < best_std):
            best_score = score
            best_column = column
            best_std = std
        # Tracked independently of the best: a column that sets a new best
        # (e.g. the first one evaluated) can still be the worst of the round.
        if score > worst_score:
            worst_score = score
            worst_column = column

    if best_column is None:
        # Only reachable when no score was comparable (e.g. every score is NaN).
        print('No valid score in iter', i, '- stopping')
        break

    X_train = pd.concat((X_train, bonus_columns[best_column]), axis=1)
    print('Iter', i)
    print('Best columns:', X_train.columns, 'with score:', best_score)
    print('Worst column:', worst_column, 'with score:', worst_score)
    #log_bot('Iter '+ str(i) +' Worst column ' + str(worst_column) + ' with score ' + str(worst_score))
    #log_bot('Iter '+ str(i) +'\r\n' + str(X_train.columns) + '\r\nScore: ' + str(best_score) + ' Std: ' + str(best_std))
    # Rebind instead of dropping in place so no other reference to the frame is mutated.
    bonus_columns = bonus_columns.drop(best_column, axis=1)
    #bonus_columns.drop(worst_column, axis=1, inplace=True)
print('Final features:', X_train.columns)
#log_bot('Final features :' + str(X_train.columns))

# Feature drop

In [None]:
# Greedy backward feature elimination.
#
# For up to 5 rounds, each column of X_train is left out in turn and the
# remaining features are scored with 10-fold stratified CV log loss. The
# column whose removal gives the lowest mean loss (ties broken by the smaller
# std across folds) is dropped from X_train.
#
# NOTE: one column is dropped per round even if every removal scores worse
# than keeping the full set; no baseline with all features is computed here.
kf = StratifiedKFold(n_splits=10, random_state=13, shuffle=True)

# cross_val_score clones the estimator for every fold, so a single unfitted
# instance can be shared by all evaluations instead of being rebuilt per column.
if CLASSIFIER == 'xgb':
    model = xgb.XGBClassifier(**PARAMS)
else:
    # Adapt PARAMS for LGBMClassifier: derive num_leaves from max_depth and
    # remove gamma / max_depth before the call.
    par = PARAMS.copy()
    par['num_leaves'] = 2 ** par['max_depth']
    del par['gamma']
    del par['max_depth']
    model = lgb.LGBMClassifier(**par)

for i in range(5):
    if X_train.shape[1] < 2:
        # Removing the last remaining column would leave nothing to fit on.
        print('Fewer than two features left, stopping at iter', i)
        break

    # Infinite sentinels so that the first evaluated column always becomes
    # the current best, whatever the magnitude of the log loss.
    best_score = float('inf')
    best_column = None
    best_std = float('inf')

    for column in X_train.columns:
        temp_X_train = X_train.drop(column, axis=1)
        scores = cross_val_score(model,
                                 temp_X_train,
                                 Y,
                                 cv=kf,
                                 scoring='neg_log_loss',
                                 fit_params={'eval_metric': 'logloss'})
        # Scores are negated log losses; flip the sign so lower is better.
        score = round(-np.mean(scores), 6)
        std = np.std(scores)
        print(column, ':\t', score, '\t', std, sep='')
        if score < best_score or (score == best_score and std < best_std):
            best_score = score
            best_column = column
            best_std = std

    if best_column is None:
        # Only reachable when no score was comparable (e.g. every score is NaN).
        print('No valid score in iter', i, '- stopping')
        break

    print('===Best column:', best_column, 'with score:', best_score)
    # Rebind instead of dropping in place so the cell does not mutate a frame
    # that other names may still reference.
    X_train = X_train.drop(best_column, axis=1)
print('Final features:', X_train.columns)

In [None]:
# Display the column labels that X_train holds at this point in the notebook.
X_train.columns
