In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff

# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)
#cufflinks.go_offline(connected=True)

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
print(os.listdir("../input"))

['new-train', 'data-science-bowl-2019']


## Modelling 
To start simple, I will start using as base the kernels below: <br>
https://www.kaggle.com/artkulak/ieee-fraud-simple-baseline-0-9383-lb - (@artkulak - Art) <br>
https://www.kaggle.com/artgor/eda-and-models - (@artgor - Andrew Lukyanenko)



In [2]:
train_data = pd.read_csv('../input/new-train/new_train_v24_concat.csv')

In [3]:
def preprocess(train_data):
        ## Null values
    for i in ['time_percentage', 'Clip_time_per', 'Activity_time_per', 'Game_time_per', 'code_4xx_time']:
    
        train_data[i].fillna(0, inplace=True)

    for i in ['Scrub-A-Dub_process_Acc', 'All Star Sorting_process_Acc',
           'Air Show_process_Acc', 'Crystals Rule_process_Acc',
           'Dino Drink_process_Acc', 'Bubble Bath_process_Acc',
           'Dino Dive_process_Acc', 'Chow Time_process_Acc',
           'Pan Balance_process_Acc', 'Happy Camel_process_Acc',
           'Leaf Leader_process_Acc']:
        
        train_data[i].fillna(0, inplace=True)
        
            ### Assessment dictionary
    assess_list = {'Mushroom Sorter (Assessment)': 0, 
                   'Bird Measurer (Assessment)': 1, 
                   'Cauldron Filler (Assessment)': 2, 
                   'Chest Sorter (Assessment)': 3,
                   'Cart Balancer (Assessment)': 4}

    del_clip = ['Welcome to Lost Lagoon!', 'Magma Peak - Level 1', 'Slop Problem',
           'Tree Top City - Level 1', 'Ordering Spheres', 'Costume Box',
           '12 Monkeys', 'Tree Top City - Level 2',"Pirate's Tale",
           'Treasure Map', 'Tree Top City - Level 3', 'Rulers',
           'Magma Peak - Level 2', 'Crystal Caves - Level 1', 'Balancing Act',
           'Crystal Caves - Level 2', 'Crystal Caves - Level 3',
           'Lifting Heavy Things', 'Honey Cake', 'Heavy, Heavier, Heaviest']

    del_duration = ['Scrub-A-Dub_duration_y', 'Dino Drink_duration_y',
       'All Star Sorting_duration_y', 'Air Show_duration_y',
       'Crystals Rule_duration_y', 'Bubble Bath_duration_y',
       'Dino Dive_duration_y', 'Chow Time_duration_y',
       'Pan Balance_duration_y', 'Happy Camel_duration_y',
       'Leaf Leader_duration_y', 'Mushroom Sorter (Assessment)_duration',
       'Bird Measurer (Assessment)_duration',
       'Cauldron Filler (Assessment)_duration',
       'Cart Balancer (Assessment)_duration',
       'Chest Sorter (Assessment)_duration']
        
    ### 1. Assessment encoding
    train_data['title'] = train_data['title'].map(assess_list)
    
    ### ** Group ###
    group_train = train_data['title'].values
    
    ### 2. Make title column to dummy columns
    train_data = pd.concat([train_data, pd.get_dummies(train_data['title'], prefix='Assess_')], axis=1)
    
    
    ### 3. Target column
    target = train_data['accuracy_group'].copy()

    ### 4. Drop columns
    train_data.drop(columns=(['installation_id', 'game_session', 'title',
                              'num_correct', 'num_incorrect', 'acc', 'accuracy_group']), inplace=True)
    
    
    ### 5. Drop previous Game features
    train_data.drop(columns=del_clip, inplace=True, errors='ignore')
    train_data.drop(columns=del_duration, inplace=True, errors='ignore')   
    
    return train_data, target, group_train

In [4]:
train_data, target, group_train = preprocess(train_data)

In [5]:
X_train = train_data
y_train = target

X = train_data.values
y = target.values

# Defining the HyperOpt function with parameters space and model

In [6]:
import pickle

In [7]:
with open('../input/new-train/val_dict_add2.pkl', 'rb') as f:
    val_dict = pickle.load(f)

In [8]:
from sklearn.model_selection import KFold,TimeSeriesSplit
from sklearn.metrics import roc_auc_score, mean_squared_error
from xgboost import plot_importance
from sklearn.metrics import make_scorer

import time
def objective(params):
    time1 = time.time()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'num_leaves': '{:.3f}'.format(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'min_child_samples': '{:.3f}'.format(params['min_child_samples']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 7
    count=1

    #y_preds = np.zeros(sample_submission.shape[0])
    y_oof = np.zeros(X_train.shape[0])
    score_mean = 0
    
    for n_fold in range(5):
        print("="*50)
        print('Training Start, fold : {}'.format(n_fold+1))
        print("="*50)
        val_idx = val_dict[n_fold]
        tr_idx = train_data.index[train_data.index.isin(val_idx) == False]
        
        clf = xgb.XGBRegressor(
            n_estimators=200, random_state=4, verbose=True, 
            tree_method='gpu_hist', 
            **params
        )
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_vl, y_vl = X[val_idx], y[val_idx]
        
        #X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[tr_idx, :]
        #y_tr, y_vl = y_train.iloc[val_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr)
        #y_pred_train = clf.predict_proba(X_vl)[:,1]
        #print(y_pred_train)
        #score = make_scorer(mean_squared_error, needs_proba=True)(clf, X_vl, y_vl)
        score = mean_squared_error(clf.predict(X_vl), y_vl)
        # plt.show()
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)


space = {
    # The maximum depth of a tree, same as GBM.
    # Used to control over-fitting as higher depth will allow model 
    # to learn relations very specific to a particular sample.
    # Should be tuned using CV.
    # Typical values: 3-10
    'max_depth': hp.quniform('max_depth', 7, 23, 1),
    
    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
    # (meaning pulling weights to 0). It can be more useful when the objective
    # is logistic regression since you might need help with feature selection.
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
    # reg_lambda: L2 regularization term. L2 encourages smaller weights, this
    # approach can be more useful in tree-models where zeroing 
    # features might not make much sense.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
    # eta: Analogous to learning rate in GBM
    # Makes the model more robust by shrinking the weights on each step
    # Typical final values to be used: 0.01-0.2
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    
    # colsample_bytree: Similar to max_features in GBM. Denotes the 
    # fraction of columns to be randomly samples for each tree.
    # Typical values: 0.5-1
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    
    # A node is split only when the resulting split gives a positive
    # reduction in the loss function. Gamma specifies the 
    # minimum loss reduction required to make a split.
    # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
    'gamma': hp.uniform('gamma', 0.01, .7),
    
    # more increases accuracy, but may lead to overfitting.
    # num_leaves: the number of leaf nodes to use. Having a large number 
    # of leaves will improve accuracy, but will also lead to overfitting.
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
    
    # specifies the minimum samples per leaf node.
    # the minimum number of samples (data) to group into a leaf. 
    # The parameter can greatly assist with overfitting: larger sample
    # sizes per leaf will reduce overfitting (but may lead to under-fitting).
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
    # subsample: represents a fraction of the rows (observations) to be 
    # considered when building each subtree. Tianqi Chen and Carlos Guestrin
    # in their paper A Scalable Tree Boosting System recommend 
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    
    # randomly select a fraction of the features.
    # feature_fraction: controls the subsampling of features used
    # for training (as opposed to subsampling the actual training data in 
    # the case of bagging). Smaller fractions reduce overfitting.
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    
    # randomly bag or subsample training data.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
    
    # bagging_fraction and bagging_freq: enables bagging (subsampling) 
    # of the training data. Both values need to be set for bagging to be used.
    # The frequency controls how often (iteration) bagging is used. Smaller
    # fractions and frequencies reduce overfitting.
}


# Running the optimizer

In [9]:
# Set algoritm parameters
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=27)

# Print best parameters
best_params = space_eval(space, best)

                                                    
############## New Run ################
params = {'max_depth': 13, 'gamma': '0.360', 'subsample': '0.90', 'reg_alpha': '0.303', 'reg_lambda': '0.122', 'learning_rate': '0.119', 'num_leaves': '170.000', 'colsample_bytree': '0.341', 'min_child_samples': '120.000', 'feature_fraction': '0.561', 'bagging_fraction': '0.432'}
Training Start, fold : 1
1 CV - score: 1.0441
Training Start, fold : 2
2 CV - score: 1.0384
Training Start, fold : 3
3 CV - score: 1.0015
Training Start, fold : 4
4 CV - score: 1.0418
Training Start, fold : 5
5 CV - score: 1.041
Total Time Run: 0.83
Mean ROC_AUC: 0.7381209679085384
                                                                              
############## New Run ################
params = {'max_depth': 14, 'gamma': '0.609', 'subsample': '0.70', 'reg_alpha': '0.019', 'reg_lambda': '0.100', 'learning_rate': '0.092', 'num_leaves': '240.000', 'colsample_bytree': '0.800', 'min_child_samples': '240.000',

# Best parameters

In [10]:
print("BEST PARAMS: ", best_params)

best_params['max_depth'] = int(best_params['max_depth'])

BEST PARAMS:  {'bagging_fraction': 0.4924424099711338, 'colsample_bytree': 0.47917146335117106, 'feature_fraction': 0.5410689507241268, 'gamma': 0.6958871934464437, 'learning_rate': 0.19974558926399488, 'max_depth': 20.0, 'min_child_samples': 100, 'num_leaves': 220, 'reg_alpha': 0.37242188170834345, 'reg_lambda': 0.32701197628586554, 'subsample': 0.2}


# Trainning and Predicting with best Parameters

## Predicting X test

In [11]:
clf = xgb.XGBRegressor(
    n_estimators=300,
    **best_params,
    tree_method='gpu_hist'
)

clf.fit(X_train, y_train)

#y_preds = clf.predict_proba(X_test)[:,1] 




Series.base is deprecated and will be removed in a future version



XGBRegressor(bagging_fraction=0.4924424099711338, base_score=0.5,
             booster='gbtree', colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.47917146335117106,
             feature_fraction=0.5410689507241268, gamma=0.6958871934464437,
             importance_type='gain', learning_rate=0.19974558926399488,
             max_delta_step=0, max_depth=20, min_child_samples=100,
             min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
             nthread=None, num_leaves=220, objective='reg:linear',
             random_state=0, reg_alpha=0.37242188170834345,
             reg_lambda=0.32701197628586554, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.2, tree_method='gpu_hist', verbosity=1)

# Top 20 Feature importance

In [12]:
feature_important = clf.get_booster().get_score(importance_type="weight")
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

# Top 10 features
data.head(20)

Unnamed: 0,score
time_gap,12407
game_time,10782
total_time,7274
Assessment_time_per,6698
Clip_time_per,6383
time_percentage,6026
Activity_time_per,5519
Game_time_per,5356
code_4xx_time,4561
session_count,3407


In [13]:
data.to_csv('features_xgb.csv')

## Seting y_pred to csv

In [14]:
#sample_submission['isFraud'] = y_preds
#sample_submission.to_csv('XGB_hypopt_model.csv')

## I'm working in this kernel yet.
# <font color="red">Please if this kernel were useful for you, please <b>UPVOTE</b> =)</font>