# Why this kernel?
Why should you read through this kernel? The goal is to have a visual guide on which strategy leads to the win:

- the data will be read and **memory footprint will be reduced**;
- **missing data** will be checked;
- **aggregations of the data over teams are performed**;
- a baseline model **on team level** will be trained:
   - Gradient boosting model as implemented in **LightGBM** is used;
   - **Mean absolute error (MAE) is used as the loss function** in the training (consistent with the final evaluation metric). **FAIR loss** was also tried and seems to lead to similar results;
   - Training is performed with **early stopping based on MAE metric**.
 - The training is implemented in a cross validation (CV) loop and **out-of-fold (OOF) predictions are stored** for future use in stacking.
 - **Test predictions** are obtained as an **average over predictions from models trained on k-1 fold subsets**.
- Predictions are **clipped to `[0,1]` range**

# Side note: score of 0.0411 can be achieved with only 100k games from the train set with ranking post-processing

In [None]:
# Exclusive upper bound on `matchId` for the training rows: the whole train file
# is read, then filtered with `matchId < max_matches_trn`. Lower it for a fast
# turn-around.
# NOTE(review): the original note says 50k keeps all entries -- presumably all
# matchId values are integers below 50000; confirm against the data.
max_matches_trn=50000
# The number of rows to read from the test file (passed as `nrows`). Use it to
# have a fast turn-around. Set to None for all entries
max_events_tst=None
# Number of CV folds
n_cv=3

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=Warning)

from sklearn.metrics import mean_squared_error, mean_absolute_error

import os
print(os.listdir("../input"))

Define a function to reduce the memory footprint

In [None]:
def reduce_mem_usage(df):
    """ Downcast every column of a dataframe to the narrowest dtype that holds it.

        The dataframe is modified in place and the same object is returned.

        - signed / unsigned integer columns are cast to the smallest signed /
          unsigned integer type whose range contains the column min and max
          (bounds are inclusive, so e.g. a max of 127 still fits int8);
        - float columns are cast to float16 or float32 when min and max fit.
          Only the *range* is checked: float16 keeps roughly 3 significant
          digits, so values are rounded;
        - object / string columns are converted to ``category``;
        - everything else (bool, datetime, timedelta, category and other
          extension dtypes) is left untouched.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to shrink.

        Returns
        -------
        pd.DataFrame
            The input dataframe with downcast columns.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        values = df[col]
        col_type = values.dtype

        # Text-like columns compress well as categoricals
        if col_type.name in ('object', 'string', 'str'):
            df[col] = values.astype('category')
            continue

        # Extension dtypes (category, tz-aware datetimes, nullable types, ...)
        # have no plain numpy equivalent to downcast to
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in ('i', 'u'):
            if values.empty:
                continue
            c_min, c_max = int(values.min()), int(values.max())
            # Keep the signedness so that an unsigned column never grows
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = values.astype(candidate)
                    break
        elif col_type.kind == 'f':
            # min/max skip NaN; an all-NaN or infinite column fails every
            # comparison below and therefore keeps its current dtype
            c_min, c_max = values.min(), values.max()
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = values.astype(candidate)
                    break
        # bool, datetime64, timedelta64, complex: nothing to gain, keep as is

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

Read in the data

In [None]:
# Load the complete training file and shrink its dtypes right away
df_trn = reduce_mem_usage(pd.read_csv('../input/train.csv', nrows=None))

# Load the test file (only the first `max_events_tst` rows when that is set)
# and shrink its dtypes as well
df_tst = reduce_mem_usage(pd.read_csv('../input/test.csv',  nrows=max_events_tst))

In [None]:
# Keep only the rows of matches whose id is below the configured bound
df_trn = df_trn.loc[df_trn['matchId'] < max_matches_trn]

In [None]:
print('Number of training entries after selecting a subset of matches: {}'.format(df_trn.shape[0]))

In [None]:
# we will NOT use in training
features_not2use = ['Id', 'groupId', 'matchId', 'numGroups']

## What do the data look like?

In [None]:
df_trn.head()

In [None]:
df_trn.info(memory_usage='deep', verbose=False)

In [None]:
df_tst.info(memory_usage='deep', verbose=False)

- The training dataset has 4.3M entries, which is not small and allows advanced models like GBM and NN to dominate.
- The test dataset is only 1.9M entries
- There are 25 features (+ the target in the train dataset)

## Are there missing data?

In [None]:
df_trn.isnull().sum().sum()

Good news: **There are no entries with `np.nan`**, so at the first glance we do not need to do anything fancy about missing data. 

There might be some default values pre-filled into missing entries; this would have to be discovered.

# Feature engineering: group by teams

In [None]:
def fe(df_):
    """ Add count, match-level and pairwise arithmetic features to player rows.

        New columns, in order:
        - `groupSize` / `numPlayers`: number of rows sharing the `groupId` /
          `matchId` of the row;
        - `killPlace_MAX` / `groupId_MIN`: per-match max of `killPlace` and
          min of `groupId`;
        - `fe_<name>_ADD`, `fe_<name>_SUB`, `fe_<name>_DIV`: sum, difference
          and ratio of the column pairs listed below.

        A new dataframe is returned (the merges create copies).
    """
    # Row counts per team and per match, joined back onto every row
    for key_col, count_col in (('groupId', 'groupSize'), ('matchId', 'numPlayers')):
        counts = df_.groupby(key_col).size().to_frame()
        counts.columns = [count_col]
        df_ = df_.merge(counts, left_on=key_col, right_index=True, how='left')

    # Match-level aggregates with flattened "<column>_<FUNC>" names
    match_aggs = df_.groupby('matchId').agg({'killPlace': ['max'],
                                             'groupId': ['min']})
    match_aggs.columns = pd.Index(['{}_{}'.format(col, func.upper())
                                   for col, func in match_aggs.columns])
    df_ = df_.merge(match_aggs, left_on='matchId', right_index=True, how='left')

    # (suffix, operation, [(new name, left column, right column), ...]).
    # Order matters: later entries read columns created by earlier ones.
    derived = (
        ('ADD', lambda left, right: left + right,
         [('rsDistance', 'rideDistance', 'swimDistance'),
          ('totalDistance', 'fe_rsDistance_ADD', 'walkDistance'),
          ('totalPoints', 'killPoints', 'winPoints')]),
        ('SUB', lambda left, right: left - right,
         [('groupId_DIFF', 'groupId', 'groupId_MIN')]),
        ('DIV', lambda left, right: left / right,
         [('headshotFraction', 'headshotKills', 'kills'),
          ('weaponsPerMeter', 'weaponsAcquired', 'fe_totalDistance_ADD'),
          ('kill2win', 'killPoints', 'winPoints'),
          ('damage2kills', 'damageDealt', 'kills'),
          ('killPlace_maxPlace', 'killPlace', 'maxPlace'),
          ('killPlace_maxKillPlace', 'killPlace', 'killPlace_MAX'),
          ('killPlace_numPlayers', 'killPlace', 'numPlayers')]),
    )
    for suffix, combine, specs in derived:
        for new_name, left_col, right_col in specs:
            df_['fe_{}_{}'.format(new_name, suffix)] = combine(df_[left_col], df_[right_col])

    return df_
        
df_trn = fe(df_trn)
df_tst = fe(df_tst)

In [None]:
# Team-level statistics computed for every feature column; the identifier
# columns listed in `features_not2use` and the target are left out
agg_team = {
    col: ['mean', 'min', 'max', 'median', 'sum', 'std']
    for col in df_trn.columns
    if col != 'winPlacePerc' and col not in features_not2use
}
# numGroups is excluded above and gets its own, shorter list of statistics
agg_team['numGroups'] = ['size','median']

print(agg_team.keys())

def preprocess(df):
    """ Aggregate player rows into one row per team (`groupId`).

        The statistics are taken from the module-level `agg_team` mapping and
        the resulting two-level column index is flattened to `<column>_<STAT>`.
    """
    team_level = df.groupby('groupId').agg(agg_team)
    flat_names = ['{}_{}'.format(col, stat.upper()) for col, stat in team_level.columns]
    team_level.columns = pd.Index(flat_names)
    return team_level

# Team-level feature matrices: one row per groupId
df_trn_gb = preprocess(df_trn)
df_tst_gb = preprocess(df_tst)

# Regression target: median winPlacePerc over the rows of each team
y = df_trn.groupby('groupId')['winPlacePerc'].median()
# target for the ranker: rank of the team's median winPlacePerc within its
# match, re-indexed by groupId only (the matchId level is dropped below)
# NOTE(review): tied teams presumably get fractional average ranks, which
# astype(np.int16) would truncate -- confirm whether that is intended
y_rnk = df_trn.groupby(['matchId', 'groupId'])['winPlacePerc'].median().groupby('matchId').rank().astype(np.int16)
y_rnk.index = y_rnk.index.droplevel(0)

# Prepare the data

In [None]:
# Number of player rows in each team (the 'size' aggregate of numGroups);
# the training series is later passed to the CV helper as `index_weight`
w_trn = df_trn_gb['numGroups_SIZE']
w_tst = df_tst_gb['numGroups_SIZE']

In [None]:
# matchId of every team, aligned with the rows of the team-level frames:
# `iloc[:,:0]` keeps only the groupId index (no columns), which is left-joined
# with the unique (matchId, groupId) pairs of the player-level data.
# NOTE(review): this assumes every groupId belongs to exactly one matchId;
# otherwise the join would return more rows than teams -- confirm.
g_trn = df_trn_gb.iloc[:,:0].merge(df_trn[['matchId','groupId']].drop_duplicates().set_index('groupId'), 
                                   how='left', left_index=True, right_index=True)['matchId']
g_tst = df_tst_gb.iloc[:,:0].merge(df_tst[['matchId','groupId']].drop_duplicates().set_index('groupId'), 
                                   how='left', left_index=True, right_index=True)['matchId']

# Train and evaluate a model
Start by defining handy helper functions...

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.base import clone, ClassifierMixin, RegressorMixin
import lightgbm as lgb


def learning_rate_decay_power(current_iter, base_learning_rate=0.20, min_lr=5e-2, decay=.996):
    '''
    Exponentially decaying learning rate with a lower floor, for LGBM.

    Parameters
    ----------
    current_iter : int
        The boosting iteration the rate is requested for
    base_learning_rate : float [default: 0.20]
        The rate at iteration 0
    min_lr : float [default: 5e-2]
        The rate never drops below this value
    decay : float [default: .996]
        Multiplicative decay applied once per iteration

    Returns
    -------
    float
        `base_learning_rate * decay ** current_iter`, but at least `min_lr`
    '''
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > min_lr else min_lr


def train_single_model(clf_, X_, y_, random_state_=314, opt_parameters_={}, fit_params_={}):
    '''
    Fit a fresh copy of an estimator with the given settings.

    The template `clf_` is cloned, so it is never fitted itself. The
    `opt_parameters_` are applied first and `random_state_` afterwards,
    so the explicit random state wins over one given in `opt_parameters_`.
    `fit_params_` are forwarded as keyword arguments to `fit`.

    Returns whatever the `fit` call of the cloned estimator returns.
    '''
    estimator = clone(clf_)
    estimator.set_params(**opt_parameters_)
    estimator.set_params(random_state=random_state_)
    fitted = estimator.fit(X_, y_, **fit_params_)
    return fitted

def train_model_in_CV(model, X, y, metric, metric_args=None,
                            model_name='xmodel',
                            seed=31416, n=5,
                            opt_parameters_=None, fit_params_=None,
                            verbose=True,
                            groups=None, y_eval=None):
    '''
    Train a model in a cross-validation loop and collect out-of-fold predictions.

    Parameters
    ----------
    model : estimator
        The template estimator; a fresh clone is fitted in every fold
        (see `train_single_model`)
    X : pd.DataFrame
        The features
    y : pd.Series
        The training target, positionally aligned with `X`
    metric : callable
        Called as `metric(y_true, y_pred, **metric_args)`
    metric_args : dict or None [default: None]
        Extra keyword arguments for `metric`. Three keys are special:
        `'sample_weight'` is mapped over the target values (`Series.map`)
        to obtain per-row weights; `'index_weight'` is mapped over the index
        to obtain per-row weights and is also passed to `fit` as sample
        weights; the presence of `'sqrt'` requests the square root of all
        scores. `'index_weight'` and `'sqrt'` are not forwarded to `metric`.
    model_name : str [default: 'xmodel']
        The prefix of the names given to the fold models
    seed : int [default: 31416]
        The random state of the shuffled `KFold` (used when `groups` is None)
    n : int [default: 5]
        The number of folds
    opt_parameters_ : dict or None [default: None]
        Parameters set on every cloned estimator
    fit_params_ : dict or None [default: None]
        Keyword arguments for `fit`. When non-empty, the evaluation sets,
        verbosity, weights and ranker groups of the fold are added to it.
    verbose : bool or int [default: True]
        Passed to `fit` as `verbose`
    groups : pd.Series or None [default: None]
        Group labels; when given, `GroupKFold` is used and, for an
        `LGBMRanker`, the group sizes are passed to `fit`
    y_eval : pd.Series or None [default: None]
        The target used for scoring instead of `y`, positionally aligned
        with `X`

    Returns
    -------
    clfs : list of (str, estimator)
        The fitted fold models with their names
    perf_eval : dict
        `'score_i'` (per-fold scores), `'score_i_ave'`, `'score_i_std'`
        and `'score_i_oof'` (score of the full out-of-fold prediction)
    y_oof : pd.Series
        The out-of-fold predictions, indexed like `X`
    '''
    # Work on private copies: the fold loop below adds and removes keys, and
    # the caller's dictionaries must not change (nor keep the fold data alive)
    metric_args = dict(metric_args) if metric_args else {}
    fit_params = dict(fit_params_) if fit_params_ else {}
    opt_parameters = dict(opt_parameters_) if opt_parameters_ else {}

    sample_weight = metric_args.get('sample_weight')
    index_weight = metric_args.pop('index_weight', None)
    doSqrt = 'sqrt' in metric_args
    metric_args.pop('sqrt', None)

    is_ranker = 'LGBMRanker' in type(model).__name__

    if groups is None:
        cv = KFold(n, shuffle=True, random_state=seed) #Stratified
    else:
        cv = GroupKFold(n)

    # The out-of-fold (oof) prediction, filled fold by fold
    y_oof = pd.Series(np.zeros(shape=(X.shape[0],)),
                      index=X.index)
    # The target the predictions are scored against
    y_score = y if y_eval is None else y_eval
    scores = []
    # the list of fitted fold models (e.g. for a voting ensemble)
    clfs = []

    for n_fold, (trn_idx, val_idx) in enumerate(cv.split(X, (y!=0).astype(np.int8), groups=groups)):
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if fit_params:
            # the validation part of the fold is used for early stopping
            fit_params["eval_set"] = [(X_trn,y_trn), (X_val,y_val)]
            fit_params['verbose'] = verbose
            if index_weight is not None:
                fit_params["sample_weight"] = y_trn.index.map(index_weight).values
                fit_params["eval_sample_weight"] = [None, y_val.index.map(index_weight).values]
            if is_ranker and groups is not None:
                # sizes of the consecutive groups, in order of appearance
                G_trn, G_val = groups.iloc[trn_idx], groups.iloc[val_idx]
                trn_sizes = G_trn.groupby(G_trn, sort=False).count()
                fit_params['group'] = trn_sizes
                fit_params['eval_group'] = [trn_sizes,
                                            G_val.groupby(G_val, sort=False).count()]

        clf = train_single_model(model, X_trn, y_trn, 314+n_fold, opt_parameters, fit_params)
        clfs.append(('{}{}'.format(model_name,n_fold), clf))

        # oof predictions: class-1 probability for pure classifiers,
        # plain predictions for everything else
        if isinstance(clf, ClassifierMixin) and not isinstance(clf, RegressorMixin):
            y_oof.iloc[val_idx] = clf.predict_proba(X_val)[:,1]
        else:
            y_oof.iloc[val_idx] = clf.predict(X_val)

        # prepare weights for evaluation
        if sample_weight is not None:
            metric_args['sample_weight'] = y_val.map(sample_weight)
        elif index_weight is not None:
            metric_args['sample_weight'] = y_val.index.map(index_weight).values

        # score against y_eval whenever it is given; without it the training
        # target is used (also for a ranker, instead of failing on None)
        y_true_tmp = y_val if y_eval is None else y_eval.iloc[val_idx]
        scores.append(metric(y_true_tmp, y_oof.iloc[val_idx], **metric_args))

    # Score of the complete oof prediction. The weights are derived from the
    # target (as in the folds), not from the predictions.
    if sample_weight is not None:
        metric_args['sample_weight'] = y.map(sample_weight)
    elif index_weight is not None:
        metric_args['sample_weight'] = y_oof.index.map(index_weight).values

    perf_eval = {'score_i_oof': metric(y_score, y_oof, **metric_args)}

    if doSqrt:
        perf_eval['score_i_oof'] = np.sqrt(perf_eval['score_i_oof'])
        scores = np.sqrt(scores)

    perf_eval['score_i'] = scores
    perf_eval['score_i_ave'] = np.mean(scores)
    perf_eval['score_i_std'] = np.std(scores)

    return clfs, perf_eval, y_oof

def print_perf_clf(name, perf_eval):
    '''
    Print a short report of the CV performance stored in `perf_eval`.

    Reads the keys `'score_i'`, `'score_i_ave'`, `'score_i_std'` and
    `'score_i_oof'`; `name` is the model name shown in the report.
    '''
    fold_scores = perf_eval['score_i']
    report = [
        'Performance of the model:',
        'Mean(Val) score inner {} Classifier: {:.4f}+-{:.4f}'.format(
            name, perf_eval['score_i_ave'], perf_eval['score_i_std']),
        'Min/max scores on folds: {:.4f} / {:.4f}'.format(
            np.min(fold_scores), np.max(fold_scores)),
        'OOF score inner {} Classifier: {:.4f}'.format(name, perf_eval['score_i_oof']),
        'Scores in individual folds: {}'.format(fold_scores),
    ]
    for line in report:
        print(line)

In [None]:
# mdl_inputs = {
#         'lgbm1_rnk': (lgb.LGBMRanker(max_depth=-1, min_child_samples=400, random_state=314, silent=True, metric='None', 
#                                      n_jobs=4, n_estimators=5000, learning_rate=0.05,
#                                      label_gain=np.logspace(0,100, num=101, base=2)-1
#                                     ),
#                  {'colsample_bytree': 0.75, 'min_child_weight': 10.0, 'num_leaves': 30, 'reg_alpha': 1, 'subsample': 0.75}, 
#                  {"early_stopping_rounds":100, 
#                   "eval_metric" : 'ndcg',
#                   'eval_at':1,
#                   'eval_names': ['train', 'early_stop'],
#                   'verbose': False, 
#                   #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_decay_power)],
#                   'categorical_feature': 'auto'},
#                  y_rnk
#                 ),
#        }

In [None]:
# %%time
# mdls = {}
# results = {}
# y_oofs = {}
# for name, (mdl, mdl_pars, fit_pars, y_) in mdl_inputs.items():
#     print('--------------- {} -----------'.format(name))
#     mdl_, perf_eval_, y_oof_ = train_model_in_CV(mdl, df_trn_gb, y_, mean_absolute_error, 
#                                                           metric_args={},#'index_weight': w_trn},
#                                                           model_name=name, 
#                                                           opt_parameters_=mdl_pars,
#                                                           fit_params_=fit_pars, 
#                                                           n=n_cv,
#                                                           verbose=500, 
#                                                           groups=g_trn, y_eval=y)
#     results[name] = perf_eval_
#     mdls[name] = mdl_
#     y_oofs[name] = y_oof_
#     print_perf_clf(name, perf_eval_)

Now let's define the parameter and model in a scalable fashion (we can add later on further models to the list and it will work out-of-the-box). 

The format is a dictionary with keys that are user model names and items being an array (or tuple) of:

- model to be fitted;
- additional model parameters to be set;
- model fit parameters (they are passed to `model.fit()` call);
- target variable.

In [None]:
# Each entry is (estimator, parameters applied with set_params, keyword
# arguments for fit, target, groups); the training loop unpacks five items.
mdl_inputs = {
        # NOTE: the objective set below is 'mse'; MAE ('mae') is only the
        # evaluation metric that drives early stopping
        'lgbm1_reg': (lgb.LGBMRegressor(max_depth=-1, min_child_samples=400, random_state=314, silent=True, metric='None', 
                                        n_jobs=4, n_estimators=1000, learning_rate=0.1),
                 {'objective': 'mse', 'colsample_bytree': 0.75, 'min_child_weight': 10.0, 'num_leaves': 30, 'reg_alpha': 1},#, 'subsample': 0.75}, 
                 {"early_stopping_rounds":100, 
                  "eval_metric" : 'mae',
                  'eval_names': ['train', 'early_stop'],
                  'verbose': False, 
                  #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_decay_power)],
                  'categorical_feature': 'auto'},
                 y,
                 None
                ),
# Disabled ranker variant. NOTE: its tuple has four items (no groups entry),
# so it needs a fifth element before it can be enabled.
#         'lgbm1_rnk': (lgb.LGBMRanker(max_depth=-1, min_child_samples=400, random_state=314, silent=True, metric='None', 
#                                      n_jobs=4, n_estimators=5000, learning_rate=0.1,
#                                      label_gain=np.logspace(0,100, num=101, base=2)-1
#                                     ),
#                  {'colsample_bytree': 0.75, 'min_child_weight': 10.0, 'num_leaves': 30, 'reg_alpha': 1, 'subsample': 0.75}, 
#                  {"early_stopping_rounds":100, 
#                   "eval_metric" : 'ndcg',
#                   'eval_names': ['train', 'early_stop'],
#                   'verbose': False, 
#                   #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_decay_power)],
#                   'categorical_feature': 'auto'},
#                  y_rnk
#                 ),
       }

Do the actual model training

In [None]:
%%time
# Per model name: the fitted fold models, the CV performance summary and the
# out-of-fold predictions
mdls = {}
results = {}
y_oofs = {}
for name, (mdl, mdl_pars, fit_pars, y_, g_) in mdl_inputs.items():
    print('--------------- {} -----------'.format(name))
    # MAE is the CV metric; the per-team row counts (w_trn) are looked up by
    # index and used as sample weights through the 'index_weight' key
    mdl_, perf_eval_, y_oof_ = train_model_in_CV(mdl, df_trn_gb, y_, mean_absolute_error, 
                                                          metric_args={'index_weight': w_trn},
                                                          model_name=name, 
                                                          opt_parameters_=mdl_pars,
                                                          fit_params_=fit_pars, 
                                                          n=n_cv,
                                                          verbose=500, 
                                                          groups=g_)
    results[name] = perf_eval_
    mdls[name] = mdl_
    y_oofs[name] = y_oof_
    print_perf_clf(name, perf_eval_)

Let's plot what the predictions look like

In [None]:
# Histogram of the out-of-fold predictions of the first trained model
k = list(y_oofs)[0]
_ = y_oofs[k].plot(kind='hist', bins=100, figsize=(15,6))
plt.xlabel('Predicted winPlacePerc OOF')

Note that the predictions spill outside of the `[0,1]` range, which is not meaningful for a percentage value. **We will clip test predictions to be within the meaningful range.** This will improve the score slightly.

## Visualise importance of features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def display_importances(feature_importance_df_, n_feat=30, silent=False, dump_strs=(),
                        fout_name=None, title='Features (avg over folds)'):
    '''
    Make a plot of most important features from a tree-based model

    Parameters
    ----------
    feature_importance_df_ : pd.DataFrame
        The input dataframe.
        Must contain columns `'feature'` and `'importance'`.
        The dataframe will be first grouped by `'feature'` and the mean `'importance'` will be calculated.
        This allows to calculate and plot importance averaged over folds,
        when the same features appear in the dataframe as many time as there are folds in CV.
    n_feat : int [default: 30]
        The maximum number of the top features to be plotted
    silent : bool [default: False]
        If `False`, dump additional information: the features with zero (<1e-3)
        mean importance and the mean importances for features defined by `dump_strs`.
        If `True`, nothing is printed.
    dump_strs : sequence of strings [default: ()]
        Features containing either of these strings will be printed to the screen
        (only when `silent` is `False`)
    fout_name : str or None [default: None]
        The name of the file to dump the figure.
        If `None`, no file is created (to be used in notebooks)
    title : str [default: 'Features (avg over folds)']
        The title of the plot
    '''
    # Mean importance per feature, computed once and reused below
    mean_imp = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    cols = mean_imp.sort_values(by="importance", ascending=False)[:n_feat].index

    if not silent:
        df_2_neglect = mean_imp[mean_imp['importance'] < 1e-3]
        print('The list of features with 0 importance: ')
        print(df_2_neglect.index.values.tolist())

        # NOTE: these display options are changed globally and stay changed
        # after the call
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        for feat_prefix in dump_strs:
            feat_names = [x for x in mean_imp.index if feat_prefix in x]
            print(mean_imp.loc[feat_names].sort_values(by='importance', ascending=False))

    # All rows (one per fold) of the selected top features
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8,10))
    sns.barplot(x="importance", y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title(title)
    plt.tight_layout()

    if fout_name is not None:
        plt.savefig(fout_name)


In [None]:
# Gain-based importances of a single fold model: `mdls['lgbm1_reg'][0][1]` is
# the estimator of the first (name, estimator) pair, i.e. the model of fold 0,
# so nothing is averaged over folds here. The figure is also saved to a file.
display_importances(pd.DataFrame({'feature': df_trn_gb.columns,
                                  'importance': mdls['lgbm1_reg'][0][1].booster_.feature_importance('gain')}),
                    n_feat=20,
                    title='GAIN feature importance',
                    fout_name='feature_importance_gain.png'
                   )

## Prepare submission

In [None]:
%%time
# Test predictions per model name: the prediction of every fold model is
# clipped to [0, 1] and the clipped predictions are averaged
y_subs= {}
for c in mdl_inputs:
    mdls_= mdls[c]
    y_sub = np.zeros(df_tst_gb.shape[0])
    for mdl_ in mdls_:
        # each entry is a (name, fitted estimator) pair
        y_sub += np.clip(mdl_[1].predict(df_tst_gb), 0, 1)
    # Average over the models that were actually summed, so the result stays
    # correct even if the number of fold models differs from the global n_cv
    y_sub /= len(mdls_)

    y_subs[c] = y_sub

In [None]:
df_sub = pd.read_csv('../input/sample_submission.csv', nrows=max_events_tst)

In [None]:

# For every model: write the raw and the within-match ranked predictions for
# the test set, and the same two variants of the out-of-fold predictions
for c in mdl_inputs:
    #Submission predictions: the team-level prediction is copied to every
    # player row through a left join on groupId. The joined column is called 0
    # because the prediction Series has no name.
    # NOTE(review): assumes groupId is among the first five columns of df_tst
    y_tmp = pd.Series(y_subs[c], index=df_tst_gb.index)
    df_sub['winPlacePerc'] = df_tst.iloc[:,:5].merge(y_tmp.to_frame(), right_index=True, left_on='groupId', how='left')[0]
    df_sub.to_csv('sub_{}.csv'.format(c), index=False)
    #submission predictions ranked within each game
    # NOTE(review): rank(pct=True) presumably yields values in (0, 1], so the
    # lowest team of a match never gets exactly 0 -- confirm this is intended
    y_sub_ranked = y_tmp.to_frame().merge(df_tst[['groupId', 'matchId']].drop_duplicates(), 
                           left_index=True, 
                           right_on='groupId', 
                           how='left').set_index(['matchId','groupId']).groupby(['matchId']).rank(pct=True)
    df_sub['winPlacePerc'] = df_tst[['groupId', 'matchId']].merge(y_sub_ranked, how='left', on=['matchId','groupId'])[0]
    df_sub.to_csv('sub_{}_ranked.csv'.format(c), index=False)
    
    # OOF predictions, expanded to player rows and clipped to [0, 1]
    oof = pd.DataFrame(index=df_trn.index, columns=['winPlacePerc'])
    oof['winPlacePerc'] = df_trn.iloc[:,:5].merge(y_oofs[c].to_frame(), right_index=True, left_on='groupId', how='left')[0]
    oof.clip(0, 1, inplace=True)
    print('{} MAE OOF score = {:.4f}'.format(c, mean_absolute_error(df_trn['winPlacePerc'], oof['winPlacePerc'])))
    oof.to_csv('oof_{}.csv'.format(c), index=False)
    # OOF predictions ranked within each game. `.values` assigns by position.
    # NOTE(review): presumably needed because the merge result does not carry
    # the (filtered) index of df_trn that `oof` uses -- confirm
    y_oof_ranked = y_oofs[c].to_frame().merge(df_trn[['groupId', 'matchId']].drop_duplicates(), 
                           left_index=True, 
                           right_on='groupId', 
                           how='left').set_index(['matchId','groupId']).groupby(['matchId']).rank(pct=True) 
    oof['winPlacePerc'] = (df_trn[['groupId', 'matchId']].merge(y_oof_ranked, how='left', on=['matchId','groupId'])[0]).values
    # NOTE: same label as the print above; this one is the score of the ranked predictions
    print('{} MAE OOF score = {:.4f}'.format(c, mean_absolute_error(df_trn['winPlacePerc'], oof['winPlacePerc'])))
    oof.to_csv('oof_{}_ranked.csv'.format(c), index=False)

In [None]:
!ls