In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

import keggler as kg
from helpers import *

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

import gc
gc.enable()

import warnings
warnings.simplefilter(action='ignore', category=Warning)

import os, psutil
import glob

# Set up a logger to dump messages to both log file and notebook
import logging as logging
def ini_log(filename):
    logger = logging.getLogger(__name__)
    ## avoid multiple printouts due to same handlers added several times
    if not logger.handlers:
        logger.setLevel(logging.DEBUG)

        handlers = [#logging.StreamHandler(None), 
            logging.FileHandler(filename, 'a')
        ]

        fmt=logging.Formatter('%(asctime)-15s: %(levelname)s  %(message)s')
        for h in handlers:
            h.setFormatter(fmt)
            logger.addHandler(h)
    return logger
        
log = ini_log('out.log')

#PATH='data_mini/'
#prefix='_mini'

prefix=''

n_files = 6
n_iter  = 5



# Lists of files

In [2]:
data_path = 'data/' # point this to your data folder
trn_path = data_path + 'training_set/'

# trn_input_logs = sorted(glob.glob(trn_path + "outDD_v1*.csv.gz"))
trn_input_logs = sorted(glob.glob(trn_path + "outDD_v2_*.h5"))
trn_extra = sorted(glob.glob(trn_path + "outDD_v4_*.h5"))
ys   = sorted(glob.glob(trn_path + "y_*.h5"))

In [3]:
trn_input_logs[:]

['data/training_set/outDD_v2_00.h5',
 'data/training_set/outDD_v2_01.h5',
 'data/training_set/outDD_v2_02.h5',
 'data/training_set/outDD_v2_03.h5',
 'data/training_set/outDD_v2_04.h5',
 'data/training_set/outDD_v2_05.h5',
 'data/training_set/outDD_v2_06.h5',
 'data/training_set/outDD_v2_07.h5',
 'data/training_set/outDD_v2_08.h5',
 'data/training_set/outDD_v2_09.h5',
 'data/training_set/outDD_v2_10.h5',
 'data/training_set/outDD_v2_11.h5',
 'data/training_set/outDD_v2_12.h5',
 'data/training_set/outDD_v2_13.h5',
 'data/training_set/outDD_v2_14.h5',
 'data/training_set/outDD_v2_15.h5',
 'data/training_set/outDD_v2_16.h5',
 'data/training_set/outDD_v2_17.h5',
 'data/training_set/outDD_v2_18.h5',
 'data/training_set/outDD_v2_19.h5',
 'data/training_set/outDD_v2_20.h5',
 'data/training_set/outDD_v2_21.h5',
 'data/training_set/outDD_v2_22.h5',
 'data/training_set/outDD_v2_23.h5',
 'data/training_set/outDD_v2_24.h5',
 'data/training_set/outDD_v2_25.h5',
 'data/training_set/outDD_v2_26.h5',
 

In [4]:
trn_extra

['data/training_set/outDD_v4_00.h5',
 'data/training_set/outDD_v4_01.h5',
 'data/training_set/outDD_v4_02.h5',
 'data/training_set/outDD_v4_03.h5',
 'data/training_set/outDD_v4_04.h5',
 'data/training_set/outDD_v4_05.h5',
 'data/training_set/outDD_v4_06.h5',
 'data/training_set/outDD_v4_07.h5',
 'data/training_set/outDD_v4_08.h5',
 'data/training_set/outDD_v4_09.h5',
 'data/training_set/outDD_v4_10.h5',
 'data/training_set/outDD_v4_11.h5',
 'data/training_set/outDD_v4_12.h5',
 'data/training_set/outDD_v4_13.h5',
 'data/training_set/outDD_v4_14.h5',
 'data/training_set/outDD_v4_15.h5',
 'data/training_set/outDD_v4_16.h5',
 'data/training_set/outDD_v4_17.h5',
 'data/training_set/outDD_v4_18.h5',
 'data/training_set/outDD_v4_19.h5',
 'data/training_set/outDD_v4_20.h5',
 'data/training_set/outDD_v4_21.h5',
 'data/training_set/outDD_v4_22.h5',
 'data/training_set/outDD_v4_23.h5',
 'data/training_set/outDD_v4_24.h5',
 'data/training_set/outDD_v4_25.h5',
 'data/training_set/outDD_v4_26.h5',
 

# Main function

In [5]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
mdl = lgb.LGBMClassifier(max_depth=-1, min_child_samples=400, 
              random_state=314, silent=True, metric='None', 
              n_jobs=4, n_estimators=500, learning_rate=0.1,
              **{'colsample_bytree': 0.75, 'min_child_weight': 1, 
               'num_leaves': 60, 'subsample': 0.75}
             )

def learning_rate_decay_power_0995(current_iter):
    base_learning_rate = 0.15
    lr = base_learning_rate  * np.power(.998, current_iter)
    return lr if lr > 1e-2 else 1e-2

def print_RAM():
    print('Memory = {:.2f} GB'. format(psutil.Process(os.getpid()).memory_info().rss / 1024**3))

def train_models_for_each_track(i_iter, i_step):
    print('==========================================')
    print('Train {}-th iteration'.format(i_iter))
    start_file = i_iter*i_step
    print(trn_input_logs[start_file:start_file+i_step])
    df_trn = pd.concat([read_log(f) 
                        for f in trn_input_logs[start_file:start_file+i_step]
                       ], axis=0, ignore_index=True)
#     print(df_trn.shape)
    df_trn.rename({'not_skipped': 'skip_4'}, axis=1, inplace=True)
    gc.collect()
    print(trn_extra[start_file:start_file+i_step])
    df_xtr = pd.concat([read_log(f) 
                        for f in trn_extra[start_file:start_file+i_step]
                       ], axis=0, ignore_index=True)
#     print(df_xtr.shape)
    df_trn = pd.concat([df_trn, df_xtr], axis=1)
    print(df_trn.shape)
    
#     display(df_trn.head())

    from sklearn.preprocessing import LabelEncoder
    df_trn['session_id'] = LabelEncoder().fit_transform(df_trn['session_id'])
    df_trn['session_id'] = df_trn['session_id'].astype(np.uint32)
    
    print_RAM()
    
    # parameters for the train/test split
    split_params = dict(test_size=0.10, random_state=314, shuffle=True)

    # competition metric format
    print(ys[start_file:start_file+i_step])
    y_competition_truth = pd.concat([pd.read_hdf(f, key='df') 
                                     for f in ys[start_file:start_file+i_step]
                                    ], axis=0)
    y_lists_trn, y_lists_stp = train_test_split(y_competition_truth, **split_params)


    l_prob = []
    X_prob=[]

    for i_ in list(range(10)):
        print('----------- {} -------------'.format(i_))
        print('Full dataframe length = {}'.format(len(df_trn)))
        X_trn, y_trn, X_trk = get_XY(df_trn, aggs, reset_index=False, 
                                      list_musik_qualities_=list_musik_qualities, 
                                      aggs_music_qualities_=aggs_music_qualities,
                                      i_=i_,
                                     aggs_trkvec_=aggs_trkvec, list_trkvec_=list_trkvec)
        
        id_trn, id_stp = train_test_split(X_trn.index, **split_params)

        i_trk = 0

        # merge track aggregates
        X = pd.concat([X_trn, X_trk[i_trk]], axis=1)
        # add predictions for the last modelled track
        if len(X_prob) > 0:
            X = pd.concat([X, pd.DataFrame({'pred_trk{}'.format(j): X_prob[j] 
                                            for j in range(len(X_prob))
                                           })],
                          axis=1)
        # get training and early-stop data and targets
        X_trn_, y_trn_ = X.loc[id_trn,:], y_trn[i_trk].loc[id_trn]
        X_stp_, y_stp_ = X.loc[id_stp,:], y_trn[i_trk].loc[id_stp]

        # limit yourself to long-enough sessions
        if i_ >= 5:
            orig_len = len(X_trn_)
            is_long_session = X_trn_['session_length'] >= (2*i_+1)
            X_trn_, y_trn_ = (X_trn_[is_long_session],
                              y_trn_[is_long_session])
            is_long_session = X_stp_['session_length'] >= (2*i_+1)
            X_stp_, y_stp_ = (X_stp_[is_long_session],
                              y_stp_[is_long_session])
            y_lists_stp_ = y_lists_stp[is_long_session.values]

            final_len = len(X_trn_)
            print('Kept {:.2f}% of data'.format(100. * final_len / orig_len))
        else:
            y_lists_stp_ = y_lists_stp

    #     display(y_trn_.head(30))

        fit_params = {'eval_names': ['train', 'early_stop'],
                      'eval_set': [(X_trn_, y_trn_), 
                                   (X_stp_, y_stp_)],
                      'eval_metric': 'binary_error',
                      'verbose':100, 'early_stopping_rounds':60,
                      'callbacks':[lgb.reset_parameter(learning_rate=learning_rate_decay_power_0995)]}    

        mdl.fit(X_trn_, y_trn_, 
                **fit_params)

        del X_trn_, y_trn_, X_stp_, y_stp_
        gc.collect()

        # save full (trn+val) prediction to be used in modelling
        X_prob.append(mdl.predict_proba(X)[:,1])


        # store the model
        import joblib
        joblib.dump(mdl, 'models/model_v2_m05i{}_{}f_{}.pkl'.format(i_iter, n_files, i_))
        del X
        gc.collect()
        print_RAM()

# Main training loop

In [6]:
for i in range(n_iter):
    train_models_for_each_track(i_iter=i, i_step=n_files)

Train 0-th iteration
['data/training_set/outDD_v2_00.h5', 'data/training_set/outDD_v2_01.h5', 'data/training_set/outDD_v2_02.h5', 'data/training_set/outDD_v2_03.h5', 'data/training_set/outDD_v2_04.h5', 'data/training_set/outDD_v2_05.h5']
['data/training_set/outDD_v4_00.h5', 'data/training_set/outDD_v4_01.h5', 'data/training_set/outDD_v4_02.h5', 'data/training_set/outDD_v4_03.h5', 'data/training_set/outDD_v4_04.h5', 'data/training_set/outDD_v4_05.h5']
(20107425, 46)
Memory = 2.86 GB
['data/training_set/y_00_.h5', 'data/training_set/y_01_.h5', 'data/training_set/y_02_.h5', 'data/training_set/y_03_.h5', 'data/training_set/y_04_.h5', 'data/training_set/y_05_.h5']
----------- 0 -------------
Full dataframe length = 20107425
Training until validation scores don't improve for 60 rounds.
[100]	train's binary_error: 0.217083	early_stop's binary_error: 0.220502
Early stopping, best iteration is:
[69]	train's binary_error: 0.218226	early_stop's binary_error: 0.220203
Memory = 5.12 GB
----------- 

['data/training_set/outDD_v4_12.h5', 'data/training_set/outDD_v4_13.h5', 'data/training_set/outDD_v4_14.h5', 'data/training_set/outDD_v4_15.h5', 'data/training_set/outDD_v4_16.h5', 'data/training_set/outDD_v4_17.h5']
(19047719, 46)
Memory = 6.50 GB
['data/training_set/y_12_.h5', 'data/training_set/y_13_.h5', 'data/training_set/y_14_.h5', 'data/training_set/y_15_.h5', 'data/training_set/y_16_.h5', 'data/training_set/y_17_.h5']
----------- 0 -------------
Full dataframe length = 19047719
Training until validation scores don't improve for 60 rounds.
[100]	train's binary_error: 0.216258	early_stop's binary_error: 0.219531
Early stopping, best iteration is:
[108]	train's binary_error: 0.215988	early_stop's binary_error: 0.219426
Memory = 7.13 GB
----------- 1 -------------
Full dataframe length = 19047719
Training until validation scores don't improve for 60 rounds.
[100]	train's binary_error: 0.289021	early_stop's binary_error: 0.296752
[200]	train's binary_error: 0.28373	early_stop's bina

['data/training_set/outDD_v4_24.h5', 'data/training_set/outDD_v4_25.h5', 'data/training_set/outDD_v4_26.h5', 'data/training_set/outDD_v4_27.h5', 'data/training_set/outDD_v4_28.h5', 'data/training_set/outDD_v4_29.h5']
(18734510, 46)
Memory = 7.30 GB
['data/training_set/y_24_.h5', 'data/training_set/y_25_.h5', 'data/training_set/y_26_.h5', 'data/training_set/y_27_.h5', 'data/training_set/y_28_.h5', 'data/training_set/y_29_.h5']
----------- 0 -------------
Full dataframe length = 18734510
Training until validation scores don't improve for 60 rounds.
[100]	train's binary_error: 0.215924	early_stop's binary_error: 0.218282
Early stopping, best iteration is:
[95]	train's binary_error: 0.216067	early_stop's binary_error: 0.217998
Memory = 7.85 GB
----------- 1 -------------
Full dataframe length = 18734510
Training until validation scores don't improve for 60 rounds.
[100]	train's binary_error: 0.289138	early_stop's binary_error: 0.294118
Early stopping, best iteration is:
[51]	train's binary