In [1]:
import pandas as pd
import numpy as np
import random
import datetime
from datetime import datetime

import xgboost as xgb
from sklearn.metrics import mean_squared_error

## Notes

## Dataset preparation

### Load dataset

In [2]:
data = pd.read_parquet('dataset_warzone_kd_bigger.parquet.gzip')
display(data.head())
print(data.shape)

Unnamed: 0,utcStartSeconds,utcEndSeconds,matchID,duration,playlistName,version,gameType,playerCount,teamCount,rankedTeams,...,player.awards.seven,player.awards.eight,playerStats.objectiveBrLootChopperBoxOpen,playerStats.objectiveShieldDamage,player.awards.simultaneous_kill,playerStats.objectiveBrPerseusLockerDoorOpenEe,squad,lobbykd,pct_playerskd,map
0,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
1,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
2,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
3,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
4,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth


(55270, 154)


In [3]:
categorical_features = [
    'map',
    'squad'
]
data[categorical_features] = data[categorical_features].astype('category')

# rename 'lobbykd' as 'target', since we want to predict the avg k/d ratio of our game
data = data.rename(columns={'lobbykd':'target'})
data = data.reindex(columns = [col for col in data.columns if col != 'target'] + ['target'])
data.head(2)

Unnamed: 0,utcStartSeconds,utcEndSeconds,matchID,duration,playlistName,version,gameType,playerCount,teamCount,rankedTeams,...,player.awards.seven,player.awards.eight,playerStats.objectiveBrLootChopperBoxOpen,playerStats.objectiveShieldDamage,player.awards.simultaneous_kill,playerStats.objectiveBrPerseusLockerDoorOpenEe,squad,pct_playerskd,map,target
0,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,82.5,rebirth,1.13
1,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,82.5,rebirth,1.13


In [4]:
# types, missing values
data.info(max_cols=199, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55270 entries, 0 to 64457
Data columns (total 154 columns):
 #    Column                                                                 Non-Null Count  Dtype   
---   ------                                                                 --------------  -----   
 0    utcStartSeconds                                                        55270 non-null  float64 
 1    utcEndSeconds                                                          55270 non-null  float64 
 2    matchID                                                                55270 non-null  object  
 3    duration                                                               55270 non-null  float64 
 4    playlistName                                                           0 non-null      object  
 5    version                                                                55270 non-null  float64 
 6    gameType                                                            

## Exploratory data analysis

#### tmp

In [5]:
data[['matchID', 'playerCount', 'squad', 'map', 'playerStats.damageDone']].sort_values(by='playerStats.damageDone', ascending=False).head(20)

Unnamed: 0,matchID,playerCount,squad,map,playerStats.damageDone
38613,1370069958461487008,52.0,Quads,fortkeep,4294967000.0
55090,855032827875055390,42.0,Quads,rebirth,12718.0
63532,10628305472550279382,40.0,Quads,rebirth,12391.0
10273,7362251540313581052,52.0,Quads,fortkeep,11487.0
39966,3765989571523210640,44.0,Trios,rebirth,11138.0
9986,1501974320764785975,40.0,Quads,rebirth,10565.0
13144,13547399530503207787,52.0,Quads,fortkeep,10535.0
50310,1766746682563678467,40.0,Quads,rebirth,10527.0
49386,2449047677272433734,39.0,Quads,rebirth,10477.0
46143,14089138655969584359,40.0,Quads,rebirth,10262.0


### Quick overall shape/stats at a glance

The main specificity of our dataset is that each 'record' (match) is 'multi-dimensional' (several players with n stats/features attached): <br>
I.e. We want to predict the avg kills/deaths (kd) ratio of each match, but each match does not correspond to a single row (a single set of features), but rather to n rows / players playing in this match.<br>
One Warzone "Resurgence" match --identified with a matchID, can count up to 50 teams of 1 player (solo mode), or up to 57 players when mode is set to duos, trios, quads. Each player (a row) in a match is attached with a number of features (number of kills, deaths, assists, headshots, xp awards...).<br>

In [6]:
# 807 matches, but 37808 records (players)
print(data.shape)
print(data.matchID.nunique())

(55270, 154)
1170


In [7]:
data.describe()

Unnamed: 0,utcStartSeconds,utcEndSeconds,duration,version,playerCount,teamCount,playerStats.kills,playerStats.medalXp,playerStats.matchXp,playerStats.scoreXp,...,player.awards.backstab,playerStats.objectiveMedalScoreKillSsRadarDrone,player.awards.seven,player.awards.eight,playerStats.objectiveBrLootChopperBoxOpen,playerStats.objectiveShieldDamage,player.awards.simultaneous_kill,playerStats.objectiveBrPerseusLockerDoorOpenEe,pct_playerskd,target
count,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,...,0.0,1.0,4.0,2.0,6.0,2.0,2.0,1.0,55270.0,55270.0
mean,1662564000.0,1662565000.0,822161.697123,1.0,47.415017,17.527447,2.838321,110.598969,3104.130342,4270.571522,...,,2.0,0.0,0.0,1.0,2.0,0.0,1.0,79.011476,1.140005
std,485994.8,485997.2,50424.44463,0.0,5.08714,10.220768,2.996422,211.820421,1783.029725,3539.88024,...,,,0.0,0.0,0.0,1.414214,0.0,,9.189453,0.132615
min,1661549000.0,1661550000.0,534000.0,1.0,30.0,9.0,0.0,0.0,0.0,0.0,...,,2.0,0.0,0.0,1.0,1.0,0.0,1.0,60.0,0.8
25%,1662241000.0,1662242000.0,797000.0,1.0,42.0,11.0,1.0,0.0,1692.0,1500.0,...,,2.0,0.0,0.0,1.0,1.5,0.0,1.0,72.5,1.04
50%,1662377000.0,1662378000.0,824000.0,1.0,50.0,14.0,2.0,20.0,3153.0,3575.0,...,,2.0,0.0,0.0,1.0,2.0,0.0,1.0,79.069767,1.14
75%,1662850000.0,1662851000.0,851000.0,1.0,52.0,18.0,4.0,80.0,4324.0,6175.0,...,,2.0,0.0,0.0,1.0,2.5,0.0,1.0,85.454545,1.23
max,1663732000.0,1663733000.0,970000.0,1.0,58.0,50.0,29.0,3740.0,10622.0,72530.0,...,,2.0,0.0,0.0,1.0,3.0,0.0,1.0,100.0,1.67


In [8]:
data.groupby(by='matchID')[['playerStats.kills', 'playerStats.deaths', 'playerStats.kdRatio', 'playerStats.damageDone']].agg(['count', 'sum', 'mean', 'median'])

Unnamed: 0_level_0,playerStats.kills,playerStats.kills,playerStats.kills,playerStats.kills,playerStats.deaths,playerStats.deaths,playerStats.deaths,playerStats.deaths,playerStats.kdRatio,playerStats.kdRatio,playerStats.kdRatio,playerStats.kdRatio,playerStats.damageDone,playerStats.damageDone,playerStats.damageDone,playerStats.damageDone
Unnamed: 0_level_1,count,sum,mean,median,count,sum,mean,median,count,sum,mean,median,count,sum,mean,median
matchID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
10006862765041148091,45,101.0,2.244444,2.0,45,107.0,2.377778,2.0,45,57.750000,1.283333,0.800000,45,58445.0,1298.777778,1062.0
10022431813918295442,46,86.0,1.869565,1.0,46,87.0,1.891304,2.0,46,54.316667,1.180797,1.000000,46,44715.0,972.065217,799.5
10082009794454927010,53,185.0,3.490566,3.0,53,197.0,3.716981,3.0,53,55.233333,1.042138,0.833333,53,104978.0,1980.716981,1658.0
10090996203127539604,50,83.0,1.660000,1.0,50,85.0,1.700000,1.0,50,51.500000,1.030000,1.000000,50,49394.0,987.880000,715.0
10134718954536800884,46,114.0,2.478261,2.0,46,117.0,2.543478,2.5,46,49.083333,1.067029,0.708333,46,62215.0,1352.500000,1065.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952914128324669505,52,163.0,3.134615,3.0,52,170.0,3.269231,3.0,52,69.788095,1.342079,1.000000,52,94405.0,1815.480769,1552.0
9961080040106933532,40,111.0,2.775000,2.0,40,115.0,2.875000,3.0,40,40.719048,1.017976,0.708333,40,67647.0,1691.175000,1528.5
9963452121608988341,52,140.0,2.692308,2.0,52,139.0,2.673077,2.5,52,51.283333,0.986218,0.500000,52,80934.0,1556.423077,1341.5
9971642112346017660,40,115.0,2.875000,2.0,40,117.0,2.925000,2.0,40,52.007143,1.300179,0.773810,40,61750.0,1543.750000,1336.0


In [9]:
pd.DataFrame(data.isna().sum().div(len(data)).mul(100).sort_values(ascending=False))[0:50]

Unnamed: 0,0
playlistName,100.0
rankedTeams,100.0
player.awards.backstab,100.0
playerStats.objectiveHack,99.998191
playerStats.objectiveMedalScoreKillSsRadarDrone,99.998191
playerStats.objectiveDestroyedVehicleMedium,99.998191
playerStats.objectiveTrophyDefense,99.998191
playerStats.objectiveBrPerseusLockerDoorOpenEe,99.998191
playerStats.objectiveBrForgottenLockerDoorOpenEe,99.998191
player.awards.simultaneous_kill,99.996381


In [10]:
pd.DataFrame(data.groupby(by='matchID').agg('mean').isna().sum().sort_values(ascending=False))[70:90]

Unnamed: 0,0
player.brMissionStats.missionStatsByType.masterassassination.xp,528
player.awards.triple,524
player.awards.explosive_stick,477
player.brMissionStats.missionStatsByType.timedrun.count,331
player.brMissionStats.missionStatsByType.timedrun.weaponXp,331
player.brMissionStats.missionStatsByType.timedrun.xp,331
player.awards.comeback,310
player.awards.air_to_air_kill,289
player.brMissionStats.missionStatsByType.scavenger.count,288
player.brMissionStats.missionStatsByType.scavenger.xp,288


In [11]:
# notes:
# EDA : outlier detection on damageDone with max values way above average

## XGB Prediction

### Train Test Split

In [12]:
# As we have multidimensional data for each match (one match = several players/rows, make sure we train test split
# keeping all rows (players) with same match Id in respective train or test dataset
# we're keeping 20 pct of match Ids for test dataset

random.seed(42)
match_uuids = list(set(data.matchID.tolist()))
split_ids = random.sample(match_uuids, int(20*len(match_uuids)/100))

test = data.query('matchID in @split_ids')
train = data.query('matchID not in @split_ids')

display(train.head(2))
print(test.shape)
print(train.shape)

Unnamed: 0,utcStartSeconds,utcEndSeconds,matchID,duration,playlistName,version,gameType,playerCount,teamCount,rankedTeams,...,player.awards.seven,player.awards.eight,playerStats.objectiveBrLootChopperBoxOpen,playerStats.objectiveShieldDamage,player.awards.simultaneous_kill,playerStats.objectiveBrPerseusLockerDoorOpenEe,squad,pct_playerskd,map,target
0,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,82.5,rebirth,1.13
1,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,82.5,rebirth,1.13


(11027, 154)
(44243, 154)


### Features engineering

#### features selection + encode categorical features

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

def shrink_features(df):
    to_keep = [
        'matchID',
        'target',
        'utcEndSeconds',
        'map',
        'squad',
        'duration',
        'playerCount',
        'teamCount',
        'playerStats.kills',
        'playerStats.deaths',
        'playerStats.assists',
        'playerStats.scorePerMinute',
        'playerStats.headshots',
        'playerStats.rank',
        'playerStats.distanceTraveled',
        'playerStats.teamSurvivalTime',
        'playerStats.kdRatio',
        'playerStats.timePlayed',
        'playerStats.percentTimeMoving',
        'playerStats.longestStreak',
        'playerStats.damageDone',
        'playerStats.damageTaken',
        'playerStats.executions',
        'player.awards.revenge',
        'player.awards.pointblank',
        'player.awards.streak_5',
        'player.awards.streak_10',
        'player.awards.gun_butt',
        'player.awards.kill_jumper',
        'player.awards.longshot',
        'player.awards.avenger',
        'player.awards.save_teammate',
        'player.awards.comeback',
        'player.brMissionStats.missionsComplete',
        'player.brMissionStats.missionStatsByType.assassination.count',
        'player.brMissionStats.missionStatsByType.timedrun.count',
        'player.brMissionStats.missionStatsByType.masterassassination.count',
        'player.brMissionStats.missionStatsByType.scavenger.count',
        'player.awards.low_health_kill'
    ]
    return df[to_keep]


def encode_datetime(df):
    """
    Add day of week, hour, from timestamp
    """

    df['utcEndSeconds'] = df['utcEndSeconds'].apply(lambda x: datetime.fromtimestamp(x))
    df['weekday'] = df['utcEndSeconds'].dt.weekday
    df['hour'] = df['utcEndSeconds'].dt.hour
    df.drop('utcEndSeconds', axis=1, inplace=True)
    
    return df

def squad_to_ordinal(df):
    """
    label (ordinal) encoding for 'squad' (Solos, Duos, Trios...) 
    (could also use one hot, but squad is kind of ordinal)
    """
    squad_order = {'Solos':1, 'Duos':2, 'Trios':3, 'Quads':4}
    df['squad_ordinal'] = df['squad'].map(squad_order)
    df['squad_ordinal'] = df['squad_ordinal'].astype('int64')
    df.drop('squad', axis=1, inplace=True)
    
    return df

def one_hot(df, column):
    """
    One Hot Encode one categorical column using sklearn
    """
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df[[column]])
    encoded_features = enc.transform(df[[column]]).toarray()
    
    df_features = pd.DataFrame(encoded_features)
    columns = enc.get_feature_names_out([column]).tolist()
    df_features.columns = columns
    
    for _ in [df, df_features]:
        _.reset_index(drop=True, inplace=True)
    augmented_df = pd.concat([df, df_features], axis=1)
    augmented_df.drop(column, axis=1, inplace=True)
    
    return augmented_df

# map_ohe = agg = last
# kills, deaths, assists, agg = mean, std, max (last ?)
# others numerical = mean and/or sum  ?

#### features aggregation : keep one row (players' stats aggregation) per matchID

In [14]:
detailed_agg_columns_save = [
    'playerStats.kills',
    'playerStats.deaths',
    'playerStats.assists',
    'playerStats.timePlayed',
    'playerStats.teamSurvivalTime',
    'playerStats.scorePerMinute',
    'playerStats.rank',
]

def aggregate_players(df):
    
    # define the columns groups we will aggregate differently
    
    # those features (+target) wont be aggregated (map type is the same for all players etc...)
    no_agg_columns = [
        'target',
        'duration',
        'playerCount',
        'teamCount',
        'weekday',
        'hour',
        'squad_ordinal',
        'map_fortkeep',
        'map_rebirth',
        
    ]
    # for kills, deaths etc, we will calculate more aggr types (std, max...)
    # we based our selection on back and forth EDA + features importance 
    detailed_agg_columns = [
        'playerStats.kills',
        'playerStats.deaths',
        'playerStats.assists',
        'player.awards.streak_5',
        'player.awards.streak_10',
        'playerStats.damageDone',
        'playerStats.damageTaken',
        'playerStats.rank',
        'playerStats.headshots',
        'playerStats.teamSurvivalTime',
        'playerStats.timePlayed',
        'player.brMissionStats.missionStatsByType.scavenger.count',
        'playerStats.scorePerMinute',
        'playerStats.distanceTraveled',
        'player.awards.avenger'   
    ]
    
    # other columns that are neither non aggregable or detailed
    simple_agg_columns = [col for col in df.columns.tolist() if col not in no_agg_columns]
    simple_agg_columns = [col for col in simple_agg_columns if col not in detailed_agg_columns]
    simple_agg_columns.remove('matchID')
    
    # groupby matchID, but keep features/columns that do not need to be agg-ed
    df_core = df.groupby('matchID')[no_agg_columns].agg('last')
    display(df_core.head(1))
    print(f'df_core shape : {df_core.shape}')

    # groupby matchID, deeper aggregation (mean, std, max)
    # on chosen players stats : kills, deaths, assists...
    df_detailed = df.groupby('matchID')[detailed_agg_columns].agg(['mean', 'std', 'max'])
    df_detailed.columns = ['_'.join(x) for x in df_detailed.columns]
    display(df_detailed.head(1))
    print(f'df_detailed shape : {df_detailed.shape}')
    
    # groupby matchID, simple aggregation (mean) for other players stats
    df_simple = df.groupby('matchID')[simple_agg_columns].agg(['mean'])
    df_simple.columns = ['_'.join(x) for x in df_simple.columns]
    display(df_simple.head())
    print(f'df_simple shape : {df_simple.shape}')
    
    df = pd.concat([df_core, df_detailed, df_simple], axis=1)
    display(df.head(1))
    print(f'finale data shape : {df.shape}')
    
    return df

In [15]:
# encode categorical + aggregations, match_end (datetime)
#declare y (target) and x
#separate train test

# use k fold instead of train - test ?? because our dataset is small. Or 1/ score accu. 2/ metric per fold

# base model : RandomForestRegressor + intuition & our EDA for feature selection (+ basic aggregations) 
# XGB basic : intuition & our EDA for feature selection (+ basic aggregations) + fixed xgboost parameters
# XGB basic + grid search or random search
# XGB feature engineering + shap hypetune

#### Some EDA after applying features engineering

In [16]:
from pandas_profiling import ProfileReport

tmp_data = data.copy(deep=True)
tmp_data = shrink_features(tmp_data)
tmp_data = encode_datetime(tmp_data)
tmp_data = squad_to_ordinal(tmp_data)
tmp_data = one_hot(tmp_data, column='map')
tmp_data = aggregate_players(tmp_data)
tmp_data = tmp_data.reset_index(drop=True)

Unnamed: 0_level_0,target,duration,playerCount,teamCount,weekday,hour,squad_ordinal,map_fortkeep,map_rebirth
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10006862765041148091,1.18,728000.0,45.0,16.0,6,20,3,0.0,1.0


df_core shape : (1170, 9)


Unnamed: 0_level_0,playerStats.kills_mean,playerStats.kills_std,playerStats.kills_max,playerStats.deaths_mean,playerStats.deaths_std,playerStats.deaths_max,playerStats.assists_mean,playerStats.assists_std,playerStats.assists_max,player.awards.streak_5_mean,...,player.brMissionStats.missionStatsByType.scavenger.count_max,playerStats.scorePerMinute_mean,playerStats.scorePerMinute_std,playerStats.scorePerMinute_max,playerStats.distanceTraveled_mean,playerStats.distanceTraveled_std,playerStats.distanceTraveled_max,player.awards.avenger_mean,player.awards.avenger_std,player.awards.avenger_max
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10006862765041148091,2.244444,2.487748,13.0,2.377778,1.466116,6.0,0.911111,1.164283,4.0,446584.0,...,,349.927508,284.799598,1215.517241,110463.055578,31001.475958,162307.88,16740.0,33480.0,66960.0


df_detailed shape : (1170, 45)


Unnamed: 0_level_0,playerStats.kdRatio_mean,playerStats.percentTimeMoving_mean,playerStats.longestStreak_mean,playerStats.executions_mean,player.awards.revenge_mean,player.awards.pointblank_mean,player.awards.gun_butt_mean,player.awards.kill_jumper_mean,player.awards.longshot_mean,player.awards.save_teammate_mean,player.awards.comeback_mean,player.brMissionStats.missionsComplete_mean,player.brMissionStats.missionStatsByType.assassination.count_mean,player.brMissionStats.missionStatsByType.timedrun.count_mean,player.brMissionStats.missionStatsByType.masterassassination.count_mean,player.awards.low_health_kill_mean
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10006862765041148091,1.283333,83.80799,2.288889,0.0,319248.0,146098.285714,69000.0,159296.0,164214.857143,204192.0,,0.177778,1.5,1.0,,122260.363636
10022431813918295442,1.180797,86.934635,1.847826,0.0,196560.0,173095.384615,145152.0,0.0,242681.142857,213051.428571,,0.065217,1.0,,,77600.0
10082009794454927010,1.042138,80.05426,3.415094,0.0,244743.272727,255216.0,230100.0,185688.0,262656.0,221578.666667,421814.4,0.150943,1.0,,,387921.0
10090996203127539604,1.03,86.177693,1.66,0.0,154680.0,226244.571429,148800.0,213504.0,121704.0,255140.571429,,0.18,1.0,,,214616.727273
10134718954536800884,1.067029,81.815328,2.456522,0.0,266832.0,143145.6,,356860.8,208669.090909,217104.0,,0.086957,1.5,,,282457.6


df_simple shape : (1170, 16)


Unnamed: 0_level_0,target,duration,playerCount,teamCount,weekday,hour,squad_ordinal,map_fortkeep,map_rebirth,playerStats.kills_mean,...,player.awards.gun_butt_mean,player.awards.kill_jumper_mean,player.awards.longshot_mean,player.awards.save_teammate_mean,player.awards.comeback_mean,player.brMissionStats.missionsComplete_mean,player.brMissionStats.missionStatsByType.assassination.count_mean,player.brMissionStats.missionStatsByType.timedrun.count_mean,player.brMissionStats.missionStatsByType.masterassassination.count_mean,player.awards.low_health_kill_mean
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10006862765041148091,1.18,728000.0,45.0,16.0,6,20,3,0.0,1.0,2.244444,...,69000.0,159296.0,164214.857143,204192.0,,0.177778,1.5,1.0,,122260.363636


finale data shape : (1170, 70)


In [17]:
tmp_data = tmp_data.sample(n=500)
profile = ProfileReport(tmp_data, title="EDA on engineered features", minimal=True)
profile.to_file("features_pandas_profiling.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
# notes : some outliers in damageDone, but we keep them as we will encounter them in real life + we're using xgboost

### XGBOOST Training and Hyperparameters tuning

#### Apply features engineering to train test data

In [19]:
train = shrink_features(train)
train = encode_datetime(train)
train = squad_to_ordinal(train)
train = one_hot(train, column='map')
train = aggregate_players(train)
train = train.reset_index(drop=True)

Unnamed: 0_level_0,target,duration,playerCount,teamCount,weekday,hour,squad_ordinal,map_fortkeep,map_rebirth
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10082009794454927010,1.34,833000.0,53.0,15.0,4,0,4,1.0,0.0


df_core shape : (936, 9)


Unnamed: 0_level_0,playerStats.kills_mean,playerStats.kills_std,playerStats.kills_max,playerStats.deaths_mean,playerStats.deaths_std,playerStats.deaths_max,playerStats.assists_mean,playerStats.assists_std,playerStats.assists_max,player.awards.streak_5_mean,...,player.brMissionStats.missionStatsByType.scavenger.count_max,playerStats.scorePerMinute_mean,playerStats.scorePerMinute_std,playerStats.scorePerMinute_max,playerStats.distanceTraveled_mean,playerStats.distanceTraveled_std,playerStats.distanceTraveled_max,player.awards.avenger_mean,player.awards.avenger_std,player.awards.avenger_max
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10082009794454927010,3.490566,3.667018,18.0,3.716981,2.230707,9.0,1.528302,1.671063,7.0,436720.941176,...,1.0,476.720511,337.135903,1648.351648,140855.366717,37584.231903,212318.5,72204.0,144408.0,288816.0


df_detailed shape : (936, 45)


Unnamed: 0_level_0,playerStats.kdRatio_mean,playerStats.percentTimeMoving_mean,playerStats.longestStreak_mean,playerStats.executions_mean,player.awards.revenge_mean,player.awards.pointblank_mean,player.awards.gun_butt_mean,player.awards.kill_jumper_mean,player.awards.longshot_mean,player.awards.save_teammate_mean,player.awards.comeback_mean,player.brMissionStats.missionsComplete_mean,player.brMissionStats.missionStatsByType.assassination.count_mean,player.brMissionStats.missionStatsByType.timedrun.count_mean,player.brMissionStats.missionStatsByType.masterassassination.count_mean,player.awards.low_health_kill_mean
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10082009794454927010,1.042138,80.05426,3.415094,0.0,244743.272727,255216.0,230100.0,185688.0,262656.0,221578.666667,421814.4,0.150943,1.0,,,387921.0
10134718954536800884,1.067029,81.815328,2.456522,0.0,266832.0,143145.6,,356860.8,208669.090909,217104.0,,0.086957,1.5,,,282457.6
10141051138442727030,1.158373,81.797826,3.675,0.0,384041.142857,277548.0,176275.2,350208.0,276020.571429,301889.454545,367500.0,0.25,1.0,1.0,1.0,329188.0
10150800201941496413,1.429418,81.008081,3.384615,0.019231,303202.909091,264676.0,244016.0,134112.0,248490.0,213700.8,498348.0,0.173077,1.0,1.0,,253433.6
10155179111233906919,1.085833,85.657347,2.68,0.04,362840.727273,226770.461538,51408.0,281592.0,247639.384615,215008.0,596688.0,0.18,1.0,1.0,1.0,271955.368421


df_simple shape : (936, 16)


Unnamed: 0_level_0,target,duration,playerCount,teamCount,weekday,hour,squad_ordinal,map_fortkeep,map_rebirth,playerStats.kills_mean,...,player.awards.gun_butt_mean,player.awards.kill_jumper_mean,player.awards.longshot_mean,player.awards.save_teammate_mean,player.awards.comeback_mean,player.brMissionStats.missionsComplete_mean,player.brMissionStats.missionStatsByType.assassination.count_mean,player.brMissionStats.missionStatsByType.timedrun.count_mean,player.brMissionStats.missionStatsByType.masterassassination.count_mean,player.awards.low_health_kill_mean
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10082009794454927010,1.34,833000.0,53.0,15.0,4,0,4,1.0,0.0,3.490566,...,230100.0,185688.0,262656.0,221578.666667,421814.4,0.150943,1.0,,,387921.0


finale data shape : (936, 70)


In [20]:
test = shrink_features(test)
test = encode_datetime(test)
test = squad_to_ordinal(test)
test = one_hot(test, column='map')
test = aggregate_players(test)
test = test.reset_index(drop=True)

Unnamed: 0_level_0,target,duration,playerCount,teamCount,weekday,hour,squad_ordinal,map_fortkeep,map_rebirth
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10006862765041148091,1.18,728000.0,45.0,16.0,6,20,3,0.0,1.0


df_core shape : (234, 9)


Unnamed: 0_level_0,playerStats.kills_mean,playerStats.kills_std,playerStats.kills_max,playerStats.deaths_mean,playerStats.deaths_std,playerStats.deaths_max,playerStats.assists_mean,playerStats.assists_std,playerStats.assists_max,player.awards.streak_5_mean,...,player.brMissionStats.missionStatsByType.scavenger.count_max,playerStats.scorePerMinute_mean,playerStats.scorePerMinute_std,playerStats.scorePerMinute_max,playerStats.distanceTraveled_mean,playerStats.distanceTraveled_std,playerStats.distanceTraveled_max,player.awards.avenger_mean,player.awards.avenger_std,player.awards.avenger_max
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10006862765041148091,2.244444,2.487748,13.0,2.377778,1.466116,6.0,0.911111,1.164283,4.0,446584.0,...,,349.927508,284.799598,1215.517241,110463.055578,31001.475958,162307.88,16740.0,33480.0,66960.0


df_detailed shape : (234, 45)


Unnamed: 0_level_0,playerStats.kdRatio_mean,playerStats.percentTimeMoving_mean,playerStats.longestStreak_mean,playerStats.executions_mean,player.awards.revenge_mean,player.awards.pointblank_mean,player.awards.gun_butt_mean,player.awards.kill_jumper_mean,player.awards.longshot_mean,player.awards.save_teammate_mean,player.awards.comeback_mean,player.brMissionStats.missionsComplete_mean,player.brMissionStats.missionStatsByType.assassination.count_mean,player.brMissionStats.missionStatsByType.timedrun.count_mean,player.brMissionStats.missionStatsByType.masterassassination.count_mean,player.awards.low_health_kill_mean
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10006862765041148091,1.283333,83.80799,2.288889,0.0,319248.0,146098.285714,69000.0,159296.0,164214.857143,204192.0,,0.177778,1.5,1.0,,122260.363636
10022431813918295442,1.180797,86.934635,1.847826,0.0,196560.0,173095.384615,145152.0,0.0,242681.142857,213051.428571,,0.065217,1.0,,,77600.0
10090996203127539604,1.03,86.177693,1.66,0.0,154680.0,226244.571429,148800.0,213504.0,121704.0,255140.571429,,0.18,1.0,,,214616.727273
10153328630378107053,1.115278,84.269078,1.395833,0.020833,460080.0,91392.0,100240.0,,217929.6,103776.0,479280.0,0.145833,1.0,1.0,1.0,266934.857143
10154164266128423971,1.117628,81.230395,1.75,0.0,334056.0,264394.285714,68816.0,0.0,202704.0,425196.0,,,,,,194117.333333


df_simple shape : (234, 16)


Unnamed: 0_level_0,target,duration,playerCount,teamCount,weekday,hour,squad_ordinal,map_fortkeep,map_rebirth,playerStats.kills_mean,...,player.awards.gun_butt_mean,player.awards.kill_jumper_mean,player.awards.longshot_mean,player.awards.save_teammate_mean,player.awards.comeback_mean,player.brMissionStats.missionsComplete_mean,player.brMissionStats.missionStatsByType.assassination.count_mean,player.brMissionStats.missionStatsByType.timedrun.count_mean,player.brMissionStats.missionStatsByType.masterassassination.count_mean,player.awards.low_health_kill_mean
matchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10006862765041148091,1.18,728000.0,45.0,16.0,6,20,3,0.0,1.0,2.244444,...,69000.0,159296.0,164214.857143,204192.0,,0.177778,1.5,1.0,,122260.363636


finale data shape : (234, 70)


#### Fit Xgboost, with cv / early stopping

In [22]:
X_train = train.drop('target', axis=1).copy()
y_train = train['target'].copy()
X_test = test.drop('target', axis=1).copy()
y_test = test['target'].copy()

In [26]:
# Sklearn API, default run with a chosen eta of 0.05
# early stopping set to 50 to prevent overfitting
reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=50, learning_rate=0.05)
reg.fit(
    X_train,
    y_train,
    verbose=10,
    eval_set=[(X_train, y_train), (X_test, y_test)]
)

[0]	validation_0-rmse:0.62055	validation_1-rmse:0.62276
[10]	validation_0-rmse:0.38338	validation_1-rmse:0.39213
[20]	validation_0-rmse:0.24292	validation_1-rmse:0.25958
[30]	validation_0-rmse:0.16004	validation_1-rmse:0.18881
[40]	validation_0-rmse:0.11238	validation_1-rmse:0.15279
[50]	validation_0-rmse:0.08558	validation_1-rmse:0.13670
[60]	validation_0-rmse:0.07024	validation_1-rmse:0.12952
[70]	validation_0-rmse:0.06137	validation_1-rmse:0.12663
[80]	validation_0-rmse:0.05534	validation_1-rmse:0.12550
[90]	validation_0-rmse:0.05048	validation_1-rmse:0.12493
[100]	validation_0-rmse:0.04557	validation_1-rmse:0.12473
[110]	validation_0-rmse:0.04138	validation_1-rmse:0.12458
[120]	validation_0-rmse:0.03763	validation_1-rmse:0.12417
[130]	validation_0-rmse:0.03388	validation_1-rmse:0.12423
[140]	validation_0-rmse:0.03060	validation_1-rmse:0.12397
[150]	validation_0-rmse:0.02826	validation_1-rmse:0.12389
[160]	validation_0-rmse:0.02605	validation_1-rmse:0.12373
[170]	validation_0-rmse:0

In [27]:
pd.DataFrame(data=reg.feature_importances_,
             index=reg.feature_names_in_).sort_values(by=0, ascending=False).head(50)

Unnamed: 0,0
playerStats.damageDone_std,0.050676
player.awards.streak_5_mean,0.047482
playerStats.timePlayed_mean,0.044714
playerStats.rank_mean,0.03584
playerStats.damageDone_mean,0.032639
playerStats.kills_std,0.027382
playerStats.teamSurvivalTime_mean,0.024951
playerStats.longestStreak_mean,0.023215
playerStats.deaths_max,0.021821
playerStats.scorePerMinute_mean,0.021162


In [28]:
# using Xgboost API and cross validation
X = pd.concat([X_train, X_test], axis=0)
print(X.shape)
y = pd.concat([y_train, y_test], axis=0)
print(y.shape)

dmatrix = xgb.DMatrix(data=X, label=y)

params = {
    'objective':'reg:squarederror'
}

cv_results = xgb.cv(
    dtrain=dmatrix, 
    params=params, 
    nfold=10,
    as_pandas=True,
    seed=20,
    num_boost_round=1000
)
print('RMSE: %.4f' % cv_results['test-rmse-mean'].min())

(1170, 69)
(1170,)
RMSE: 0.1230


In [29]:
# using XgBoost API & cv, with a few tuned hyperparameters
dmatrix = xgb.DMatrix(data=X, label=y)

params = {
    'objective':'reg:squarederror',
    'max_depth': 5,
    'colsample_bytree':0.5,
    'learning_rate':0.03,
    'random_state':20
}

cv_results = xgb.cv(
    dtrain=dmatrix, 
    params=params, 
    nfold=10,
    as_pandas=True,
    seed=20,
    metrics={'rmse'},
    num_boost_round=1000
)
print('RMSE: %.4f' % cv_results['test-rmse-mean'].min())

RMSE: 0.1164


In [30]:
# Random grid search using RandomSearchCV() from Sklearn

from sklearn.model_selection import RandomizedSearchCV

# we already went into back and forth runs to run with below ranges :
params = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07],
    'max_depth': [3, 5, 6, 7],
    'subsample': np.arange(0.4, 1.0, 0.1),
    'colsample_bytree': np.arange(0.4, 1.0, 0.1),
    'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
    'n_estimators': [300, 400, 500, 600, 700],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [0, 0.5, 5]
}

xgbr = xgb.XGBRegressor(seed = 20)
clf = RandomizedSearchCV(estimator=xgbr,
                         n_jobs= -1,
                         refit=True, #default would be True anyways
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=50,
                         verbose=1)
clf.fit(X, y)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'subsample': 0.7, 'reg_lambda': 5, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.7999999999999999, 'colsample_bylevel': 0.6}
Lowest RMSE:  0.11694815374784426


In [31]:
best_pars = clf.best_params_
print(best_pars)
best_model = clf.best_estimator_

{'subsample': 0.7, 'reg_lambda': 5, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.7999999999999999, 'colsample_bylevel': 0.6}


In [32]:
pd.DataFrame(data=best_model.feature_importances_,
             index=best_model.feature_names_in_).sort_values(by=0, ascending=False).head(50)

Unnamed: 0,0
player.awards.streak_5_mean,0.030659
playerStats.kills_std,0.030161
playerStats.damageDone_std,0.029609
playerStats.timePlayed_mean,0.024837
playerStats.teamSurvivalTime_mean,0.022074
playerStats.damageDone_mean,0.021969
playerStats.headshots_mean,0.021336
playerStats.rank_std,0.020857
playerStats.timePlayed_max,0.020784
playerStats.rank_mean,0.019955


In [35]:
# save best model
best_model.save_model('xgb_model_lobby_kd.json')

### Optional Test Hyperopt

In [33]:
from hyperopt import hp
from hyperopt import Trials

from scipy import stats

In [34]:
param_grid = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06],
    'num_leaves': [10, 25, 35],
    'max_depth': [3, 5, 6]
}

param_dist = {
    'learning_rate': stats.uniform(0.02, 0.25),
    'num_leaves': stats.randint(20,40),
    'max_depth': [10, 12]
}

param_dist_hyperopt = {
    'max_depth': 15 + hp.randint('num_leaves', 5), 
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0)
}


regr_xgb = xgb.XGBRegressor(n_estimators=400, random_state=0, verbosity=0, n_jobs=-1)