In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/data-science-bowl-2019/train.csv
/kaggle/input/data-science-bowl-2019/specs.csv
/kaggle/input/data-science-bowl-2019/train_labels.csv
/kaggle/input/data-science-bowl-2019/test.csv
/kaggle/input/data-science-bowl-2019/sample_submission.csv
/kaggle/input/data-processing-consolidated/test_processed.pkl
/kaggle/input/data-processing-consolidated/__results__.html
/kaggle/input/data-processing-consolidated/train_processed.pkl
/kaggle/input/data-processing-consolidated/__output__.json
/kaggle/input/data-processing-consolidated/custom.css
/kaggle/input/data-processing-consolidated/train_predictions.csv
/kaggle/input/data-processing-consolidated/__notebook__.ipynb
/kaggle/input/data-processing-consolidated/submission.csv
/kaggle/input/data-processing-consolidated/__results___files/__results___76_0.png
/kaggle/input/data-processing-consolidated/__results___files/__results___75_0.png
/kaggle/input/data-processing-consolidated/__results___files/__results___62_0.png
/kaggle/input/data

In [3]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from collections import Counter

import lightgbm

import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline


# Загрузка данных

In [4]:
#DATA_DIR = 'data'   # Home
#DATA_DIR = '../input/data-science-bowl-2019' # Kaggle

PROCESSED_DATA_DIR = 'processed/557'  # Work
PROCESSED_DATA_DIR = '../input/data-processing-consolidated'  # Kaggle

RANDOM_SEED = 17

In [5]:
import pickle
import os

featureset = None
with open(os.path.join(PROCESSED_DATA_DIR, 'train_processed.pkl'), 'rb') as fi:
    train = pickle.load(fi)

train.shape

(17687, 550)

In [6]:
from sklearn.model_selection import cross_val_score, GroupShuffleSplit
from sklearn.metrics import cohen_kappa_score

# Определение требуемой метрики 'quadratic weighted kappa'
scorer = lambda estimator, x, y : cohen_kappa_score(estimator.predict(x), y, weights='quadratic')

## Регрессия LightGBM с выбором порогов

In [7]:
from numba import jit

@jit
def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e


In [8]:
from functools import partial
import scipy as sp

class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -qwk(y, X_p)

    def fit(self, X, y):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']
    
    
class MultistartOptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -qwk(y, X_p)

    def fit(self, X, y):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coefs = [[0.5, 1.5, 2.5],
                         [1.1, 1.6, 2.2],
                         [1.2, 1.7, 2.1]]
        best_val = 10.0                    # значение берется с обратным знаком
        best_coef = None
        for coef in initial_coefs:
            cur_coef = sp.optimize.minimize(loss_partial, coef, method='nelder-mead')
            cur_val = self._kappa_loss(cur_coef['x'], X, y)
            print('Val:', cur_val)
            if cur_val < best_val:         # значение берется с обратным знаком
                print('New best!')
                best_val = cur_val
                best_coef = cur_coef
        self.coef_ = best_coef

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

In [9]:
from sklearn.base import BaseEstimator, MetaEstimatorMixin, ClassifierMixin, clone

class ClassifierWrapper(BaseEstimator, MetaEstimatorMixin, ClassifierMixin):
    
    def __init__(self, base_estimator, estimator_params):
        super().__init__()
        self.base_estimator = base_estimator
        self.estimator_params = estimator_params        
        self.optimized_rounder = MultistartOptimizedRounder()

    def make_estimator(self):
        estimator = clone(self.base_estimator)
        #estimator.set_params(**dict((p, getattr(self, p))
        #                            for p in self.estimator_params))
        estimator.set_params(**self.estimator_params)
        return estimator        
        
    def fit(self, X, y):
        self.model = self.make_estimator().fit(X, y)
        y_pred = self.model.predict(X)
        self.optimized_rounder.fit(y_pred.reshape(-1,), y)
        # coefficients = optR.coefficients()
        print('Found bounds:', self.optimized_rounder.coefficients())
        
    def predict(self, X):
        y_pred = self.model.predict(X)
        return self.optimized_rounder.predict(y_pred.reshape(-1, ), self.optimized_rounder.coefficients())
        

In [13]:
X = train.drop(columns=['installation_id', 'timestamp', 
                        'accuracy_group', 'accuracy', 'num_correct', 'num_incorrect'])
y = train[['accuracy_group']]


X.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X.columns]


In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import GroupKFold

splitter = GroupKFold(5)

# Определение требуемой метрики 'quadratic weighted kappa'
scorer = lambda estimator, x, y : cohen_kappa_score(estimator.predict(x), y, weights='quadratic')

# Baseline: модель с исходными параметрами

In [14]:

# from Andrew Lukyanenko
parameters = {'n_estimators':2000,
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'subsample': 0.75,
            'subsample_freq': 1,
            'learning_rate': 0.04,
            'feature_fraction': 0.9,
         'max_depth': 15,
            'lambda_l1': 1,  
            'lambda_l2': 1,
            'verbose': 100,
            'random_state': RANDOM_SEED,
            'early_stopping_rounds': 100,
            }

scores = []
for train_idxs, test_idxs in splitter.split(X, y, groups=train.installation_id):
    train_X = X.iloc[train_idxs, :]
    train_y = y.iloc[train_idxs, :]
    test_X = X.iloc[test_idxs, :]
    test_y = y.iloc[test_idxs, :]
    train_data = lightgbm.Dataset(train_X, label=train_y, categorical_feature=['session_title', 'world'])
    test_data = lightgbm.Dataset(test_X, label=test_y)
    model = lightgbm.train(parameters,
                           train_data,
                           valid_sets=test_data)
    rounder = MultistartOptimizedRounder()
    rounder.fit(model.predict(train_X).reshape(-1), train_y)
    y_pred = rounder.predict(model.predict(test_X).reshape(-1, ), rounder.coefficients())
    scores.append(cohen_kappa_score(y_pred, test_y, weights='quadratic'))

scores = np.array(scores)    
print(scores)    
print(scores.mean())    
print(scores.std())    
    


New categorical_feature is ['session_title', 'world']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's rmse: 1.23945
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.2243
[3]	valid_0's rmse: 1.20989
[4]	valid_0's rmse: 1.19664
[5]	valid_0's rmse: 1.18425
[6]	valid_0's rmse: 1.17667
[7]	valid_0's rmse: 1.16563
[8]	valid_0's rmse: 1.15804
[9]	valid_0's rmse: 1.14773
[10]	valid_0's rmse: 1.13816
[11]	valid_0's rmse: 1.12891
[12]	valid_0's rmse: 1.12298
[13]	valid_0's rmse: 1.11518
[14]	valid_0's rmse: 1.10756
[15]	valid_0's rmse: 1.10055
[16]	valid_0's rmse: 1.0948
[17]	valid_0's rmse: 1.08864
[18]	valid_0's rmse: 1.0826
[19]	valid_0's rmse: 1.07771
[20]	valid_0's rmse: 1.07266
[21]	valid_0's rmse: 1.06797
[22]	valid_0's rmse: 1.06361
[23]	valid_0's rmse: 1.06006
[24]	valid_0's rmse: 1.05654
[25]	valid_0's rmse: 1.05263
[26]	valid_0's rmse: 1.04911
[27]	valid_0's rmse: 1.04667
[28]	valid_0's rmse: 1.04378
[29]	valid_0's rmse: 1.04118
[30]	valid_0's rmse: 1.03893
[31]	valid_0's rmse: 1.03629
[32]	valid_0's rmse: 1.03374
[33]	valid_0's rmse: 1

Compilation is falling back to object mode WITH looplifting enabled because Function "qwk" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-7-7e6567382b89> (13)[0m
[1m
File "<ipython-input-7-7e6567382b89>", line 13:[0m
[1mdef qwk(a1, a2):
    <source elided>
    """
[1m    max_rat = 3
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "qwk" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-7-7e6567382b89>", line 21:[0m
[1mdef qwk(a1, a2):
    <source elided>
    o = 0
[1m    for k in range(a1.shape[0]):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-7-7e6567382b89>", line 4:[0m
[1m@jit
[1mdef qwk(a1, a2):
[0m[1m^[0m[0m
[0m
  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path

Val: [-0.69988235]
New best!
Val: [-0.71247633]
New best!
Val: [-0.71257884]
New best!


New categorical_feature is ['session_title', 'world']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's rmse: 1.24659
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.23118
[3]	valid_0's rmse: 1.21688
[4]	valid_0's rmse: 1.20392
[5]	valid_0's rmse: 1.19148
[6]	valid_0's rmse: 1.18478
[7]	valid_0's rmse: 1.17359
[8]	valid_0's rmse: 1.16688
[9]	valid_0's rmse: 1.15678
[10]	valid_0's rmse: 1.14737
[11]	valid_0's rmse: 1.1383
[12]	valid_0's rmse: 1.13317
[13]	valid_0's rmse: 1.12457
[14]	valid_0's rmse: 1.11686
[15]	valid_0's rmse: 1.10995
[16]	valid_0's rmse: 1.10344
[17]	valid_0's rmse: 1.09729
[18]	valid_0's rmse: 1.09161
[19]	valid_0's rmse: 1.08655
[20]	valid_0's rmse: 1.0814
[21]	valid_0's rmse: 1.07667
[22]	valid_0's rmse: 1.0725
[23]	valid_0's rmse: 1.06826
[24]	valid_0's rmse: 1.06469
[25]	valid_0's rmse: 1.06114
[26]	valid_0's rmse: 1.05745
[27]	valid_0's rmse: 1.05451
[28]	valid_0's rmse: 1.05138
[29]	valid_0's rmse: 1.04847
[30]	valid_0's rmse: 1.04612
[31]	valid_0's rmse: 1.0436
[32]	valid_0's rmse: 1.04143
[33]	valid_0's rmse: 1.

[5]	valid_0's rmse: 1.17473
[6]	valid_0's rmse: 1.16712
[7]	valid_0's rmse: 1.15596
[8]	valid_0's rmse: 1.14841
[9]	valid_0's rmse: 1.13856
[10]	valid_0's rmse: 1.12927
[11]	valid_0's rmse: 1.12044
[12]	valid_0's rmse: 1.11532
[13]	valid_0's rmse: 1.10735
[14]	valid_0's rmse: 1.09976
[15]	valid_0's rmse: 1.09284
[16]	valid_0's rmse: 1.08656
[17]	valid_0's rmse: 1.08072
[18]	valid_0's rmse: 1.07482
[19]	valid_0's rmse: 1.06973
[20]	valid_0's rmse: 1.06451
[21]	valid_0's rmse: 1.0599
[22]	valid_0's rmse: 1.05545
[23]	valid_0's rmse: 1.05133
[24]	valid_0's rmse: 1.04759
[25]	valid_0's rmse: 1.04397
[26]	valid_0's rmse: 1.04019
[27]	valid_0's rmse: 1.03764
[28]	valid_0's rmse: 1.03462
[29]	valid_0's rmse: 1.03179
[30]	valid_0's rmse: 1.02915
[31]	valid_0's rmse: 1.02663
[32]	valid_0's rmse: 1.0244
[33]	valid_0's rmse: 1.02235
[34]	valid_0's rmse: 1.02038
[35]	valid_0's rmse: 1.01805
[36]	valid_0's rmse: 1.01635
[37]	valid_0's rmse: 1.01474
[38]	valid_0's rmse: 1.01313
[39]	valid_0's rmse: 

[39]	valid_0's rmse: 1.00311
[40]	valid_0's rmse: 1.00155
[41]	valid_0's rmse: 0.999867
[42]	valid_0's rmse: 0.997864
[43]	valid_0's rmse: 0.996579
[44]	valid_0's rmse: 0.994966
[45]	valid_0's rmse: 0.993604
[46]	valid_0's rmse: 0.991939
[47]	valid_0's rmse: 0.990562
[48]	valid_0's rmse: 0.988834
[49]	valid_0's rmse: 0.987996
[50]	valid_0's rmse: 0.986472
[51]	valid_0's rmse: 0.985412
[52]	valid_0's rmse: 0.984628
[53]	valid_0's rmse: 0.98367
[54]	valid_0's rmse: 0.982588
[55]	valid_0's rmse: 0.981619
[56]	valid_0's rmse: 0.980607
[57]	valid_0's rmse: 0.97974
[58]	valid_0's rmse: 0.978756
[59]	valid_0's rmse: 0.977845
[60]	valid_0's rmse: 0.97705
[61]	valid_0's rmse: 0.976138
[62]	valid_0's rmse: 0.975314
[63]	valid_0's rmse: 0.97462
[64]	valid_0's rmse: 0.974221
[65]	valid_0's rmse: 0.973755
[66]	valid_0's rmse: 0.973162
[67]	valid_0's rmse: 0.972474
[68]	valid_0's rmse: 0.971972
[69]	valid_0's rmse: 0.971667
[70]	valid_0's rmse: 0.971172
[71]	valid_0's rmse: 0.970743
[72]	valid_0's r

[35]	valid_0's rmse: 1.02529
[36]	valid_0's rmse: 1.02344
[37]	valid_0's rmse: 1.02166
[38]	valid_0's rmse: 1.01996
[39]	valid_0's rmse: 1.01854
[40]	valid_0's rmse: 1.01808
[41]	valid_0's rmse: 1.01703
[42]	valid_0's rmse: 1.01567
[43]	valid_0's rmse: 1.01421
[44]	valid_0's rmse: 1.01316
[45]	valid_0's rmse: 1.01216
[46]	valid_0's rmse: 1.01112
[47]	valid_0's rmse: 1.01012
[48]	valid_0's rmse: 1.00919
[49]	valid_0's rmse: 1.00875
[50]	valid_0's rmse: 1.00788
[51]	valid_0's rmse: 1.00719
[52]	valid_0's rmse: 1.00637
[53]	valid_0's rmse: 1.0059
[54]	valid_0's rmse: 1.00542
[55]	valid_0's rmse: 1.00465
[56]	valid_0's rmse: 1.00417
[57]	valid_0's rmse: 1.00331
[58]	valid_0's rmse: 1.00278
[59]	valid_0's rmse: 1.0024
[60]	valid_0's rmse: 1.00206
[61]	valid_0's rmse: 1.00153
[62]	valid_0's rmse: 1.00103
[63]	valid_0's rmse: 1.00056
[64]	valid_0's rmse: 1.00023
[65]	valid_0's rmse: 0.999892
[66]	valid_0's rmse: 0.999401
[67]	valid_0's rmse: 0.99918
[68]	valid_0's rmse: 0.999123
[69]	valid_0'

# Настройка параметров

In [15]:
import optuna

In [16]:
def evaluate_parameters(params):
    scores = []
    for seed_delta in range(2):
        params_copy = params.copy()
        params_copy['random_state'] = RANDOM_SEED + seed_delta
        for train_idxs, test_idxs in splitter.split(X, y, groups=train.installation_id):
            train_X = X.iloc[train_idxs, :]
            train_y = y.iloc[train_idxs, :]
            test_X = X.iloc[test_idxs, :]
            test_y = y.iloc[test_idxs, :]
            train_data = lightgbm.Dataset(train_X, label=train_y, categorical_feature=['session_title', 'world'])
            test_data = lightgbm.Dataset(test_X, label=test_y)
            model = lightgbm.train(params_copy,
                                   train_data,
                                   verbose_eval=False)
            rounder = MultistartOptimizedRounder()
            rounder.fit(model.predict(train_X).reshape(-1), train_y)
            y_pred = rounder.predict(model.predict(test_X).reshape(-1, ), rounder.coefficients())
            scores.append(cohen_kappa_score(y_pred, test_y, weights='quadratic'))

    scores = np.array(scores)
    print(scores)    
    print(scores.mean())
    print(scores.std())
    return scores[1:-1].mean()

def objective(trial):
    param = {
        'n_estimators': 500,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': 0,
        'random_state': RANDOM_SEED,
        #'early_stopping_rounds': 100,
        
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-2, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-2, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 16, 128),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),     # subsample   
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 3),                     # subsample_freq
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 0.32),
    }

    # TODO pruning callbacks 
    return evaluate_parameters(param)
    

In [17]:
study = optuna.create_study(study_name='557_LGBM_Tuning_Paranoidal', direction='maximize', storage='sqlite:///557_lgbm_tuning.db')
study.optimize(objective, n_trials = 200)

[I 2020-01-21 23:13:56,416] A new study created with name: 557_LGBM_Tuning_Paranoidal

Found `n_estimators` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in Dataset is overridden.
New categorical_feature is ['session_title', 'world']



Val: [-0.76459013]
New best!
Val: [-0.76414932]


KeyboardInterrupt: 