In [1]:
import datetime
import pickle
import random
from datetime import datetime

import numpy as np
import pandas as pd

## More power to you :

- A cleaner implementation of the model code and transform pipeline is available in my Streamlit app [repository](https://github.com/matthieuvion/wzkd), in `\src\predicty.py`

## Dataset preparation

### Load dataset

In [2]:
# Load the raw dataset: 55k rows, 150+ features.
# Show the first rows with rich display, then the (rows, columns) shape.
data = pd.read_parquet('dataset_warzone_kd_bigger.parquet.gzip')
display(data.head())
print(data.shape)

Unnamed: 0,utcStartSeconds,utcEndSeconds,matchID,duration,playlistName,version,gameType,playerCount,teamCount,rankedTeams,...,player.awards.seven,player.awards.eight,playerStats.objectiveBrLootChopperBoxOpen,playerStats.objectiveShieldDamage,player.awards.simultaneous_kill,playerStats.objectiveBrPerseusLockerDoorOpenEe,squad,lobbykd,pct_playerskd,map
0,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
1,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
2,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
3,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth
4,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,1.13,82.5,rebirth


(55270, 154)


In [3]:
# 'map' and 'squad' are the only two categorical features: cast them to the
# pandas 'category' dtype for future operations
categorical_features = ['map', 'squad']
data[categorical_features] = data[categorical_features].apply(
    lambda column: column.astype('category')
)

# 'lobbykd' (avg k/d ratio of the game) is what we want to predict:
# call it 'target' and move it to the last column
data = data.rename(columns={'lobbykd': 'target'})
data = data[[col for col in data.columns if col != 'target'] + ['target']]
data.head(2)

Unnamed: 0,utcStartSeconds,utcEndSeconds,matchID,duration,playlistName,version,gameType,playerCount,teamCount,rankedTeams,...,player.awards.seven,player.awards.eight,playerStats.objectiveBrLootChopperBoxOpen,playerStats.objectiveShieldDamage,player.awards.simultaneous_kill,playerStats.objectiveBrPerseusLockerDoorOpenEe,squad,pct_playerskd,map,target
0,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,82.5,rebirth,1.13
1,1662835000.0,1662836000.0,3409667761123289245,821000.0,,1.0,wz,40.0,12.0,,...,,,,,,,Quads,82.5,rebirth,1.13


In [24]:
# Column dtypes and non-null counts (i.e. missing values) for every column;
# max_cols is raised so that none of the columns is hidden from the summary
data.info(max_cols=199, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55270 entries, 0 to 64457
Data columns (total 154 columns):
 #    Column                                                                 Non-Null Count  Dtype   
---   ------                                                                 --------------  -----   
 0    utcStartSeconds                                                        55270 non-null  float64 
 1    utcEndSeconds                                                          55270 non-null  float64 
 2    matchID                                                                55270 non-null  object  
 3    duration                                                               55270 non-null  float64 
 4    playlistName                                                           0 non-null      object  
 5    version                                                                55270 non-null  float64 
 6    gameType                                                            

## Exploratory data analysis

### Multileveled / dimensional data

The main specificity of our dataset is that each 'record' (a match) is 'multi-dimensional' (a match is composed of several players with n stats/features attached): <br>
That is, we want to predict the average kills/deaths (k/d) ratio of each match, but a match does not correspond to a single row (a single set of features): it corresponds to n rows, one per player in that match.<br>
One Warzone "Resurgence" match --identified with a matchID, can count up to 50 teams of 1 player (solo mode), or up to 57 players when mode is set to duos, trios, quads. Each player (a row) in a match is attached with a number of features (number of kills, deaths, assists, headshots, xp awards...).<br>

### At a glance : shape, missing values, outliers

We went through a more detailed exploratory analysis before and after our feature engineering, using pandas-profiling and classic EDA tools. <br>
But here are the basics, to get a sense of what we will be dealing with:

In [30]:
# One match, identified by one matchID and carrying a single target ("lobby kd"),
# is made of n player rows, each with its own features (that player's kpis for this match)
data.query('matchID=="3409667761123289245"')[['matchID', 'player.username', 'playerCount', 'playerStats.kills', 'playerStats.damageDone','target']].head(3)

Unnamed: 0,matchID,player.username,playerCount,playerStats.kills,playerStats.damageDone,target
0,3409667761123289245,De captain,40.0,0.0,0.0,1.13
1,3409667761123289245,Kozzel,40.0,16.0,8214.0,1.13
2,3409667761123289245,JxKeR-_91,40.0,1.0,1535.0,1.13


In [32]:
# Another match: same columns, first three player rows of a different matchID
data.query('matchID=="11730523208241334051"')[['matchID', 'player.username', 'playerCount', 'playerStats.kills', 'playerStats.damageDone','target']].head(3)

Unnamed: 0,matchID,player.username,playerCount,playerStats.kills,playerStats.damageDone,target
40,11730523208241334051,4orty,52.0,0.0,190.0,1.04
41,11730523208241334051,Jonathan David,52.0,0.0,31.0,1.04
42,11730523208241334051,Djbeby,52.0,1.0,662.0,1.04


In [4]:
# 55k records (one per player) but only 1170 unique matches:
# first the (rows, columns) shape, then the number of distinct matchIDs
print(data.shape)
print(data.matchID.nunique())

(55270, 154)
1170


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) at player (row) level
data.describe()

Unnamed: 0,utcStartSeconds,utcEndSeconds,duration,version,playerCount,teamCount,playerStats.kills,playerStats.medalXp,playerStats.matchXp,playerStats.scoreXp,...,player.awards.backstab,playerStats.objectiveMedalScoreKillSsRadarDrone,player.awards.seven,player.awards.eight,playerStats.objectiveBrLootChopperBoxOpen,playerStats.objectiveShieldDamage,player.awards.simultaneous_kill,playerStats.objectiveBrPerseusLockerDoorOpenEe,pct_playerskd,target
count,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,55270.0,...,0.0,1.0,4.0,2.0,6.0,2.0,2.0,1.0,55270.0,55270.0
mean,1662564000.0,1662565000.0,822161.697123,1.0,47.415017,17.527447,2.838321,110.598969,3104.130342,4270.571522,...,,2.0,0.0,0.0,1.0,2.0,0.0,1.0,79.011476,1.140005
std,485994.8,485997.2,50424.44463,0.0,5.08714,10.220768,2.996422,211.820421,1783.029725,3539.88024,...,,,0.0,0.0,0.0,1.414214,0.0,,9.189453,0.132615
min,1661549000.0,1661550000.0,534000.0,1.0,30.0,9.0,0.0,0.0,0.0,0.0,...,,2.0,0.0,0.0,1.0,1.0,0.0,1.0,60.0,0.8
25%,1662241000.0,1662242000.0,797000.0,1.0,42.0,11.0,1.0,0.0,1692.0,1500.0,...,,2.0,0.0,0.0,1.0,1.5,0.0,1.0,72.5,1.04
50%,1662377000.0,1662378000.0,824000.0,1.0,50.0,14.0,2.0,20.0,3153.0,3575.0,...,,2.0,0.0,0.0,1.0,2.0,0.0,1.0,79.069767,1.14
75%,1662850000.0,1662851000.0,851000.0,1.0,52.0,18.0,4.0,80.0,4324.0,6175.0,...,,2.0,0.0,0.0,1.0,2.5,0.0,1.0,85.454545,1.23
max,1663732000.0,1663733000.0,970000.0,1.0,58.0,50.0,29.0,3740.0,10622.0,72530.0,...,,2.0,0.0,0.0,1.0,3.0,0.0,1.0,100.0,1.67


In [6]:
# Per-match view of the target and of a few player kpis: last value, number of
# rows (players), sum, mean and median within each matchID
data.groupby(by='matchID')[['target', 'playerStats.kills', 'playerStats.deaths', 'playerStats.kdRatio', 'playerStats.damageDone', 'playerStats.timePlayed']].agg(['last', 'count', 'sum', 'mean', 'median'])

Unnamed: 0_level_0,target,target,target,target,target,playerStats.kills,playerStats.kills,playerStats.kills,playerStats.kills,playerStats.kills,...,playerStats.damageDone,playerStats.damageDone,playerStats.damageDone,playerStats.damageDone,playerStats.damageDone,playerStats.timePlayed,playerStats.timePlayed,playerStats.timePlayed,playerStats.timePlayed,playerStats.timePlayed
Unnamed: 0_level_1,last,count,sum,mean,median,last,count,sum,mean,median,...,last,count,sum,mean,median,last,count,sum,mean,median
matchID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10006862765041148091,1.18,45,53.10,1.18,1.18,0.0,45,101.0,2.244444,2.0,...,443.0,45,58445.0,1298.777778,1062.0,258.0,45,20270.0,450.444444,449.0
10022431813918295442,1.16,46,53.36,1.16,1.16,2.0,46,86.0,1.869565,1.0,...,923.0,46,44715.0,972.065217,799.5,316.0,46,17601.0,382.630435,327.0
10082009794454927010,1.34,53,71.02,1.34,1.34,4.0,53,185.0,3.490566,3.0,...,1356.0,53,104978.0,1980.716981,1658.0,422.0,53,31395.0,592.358491,674.0
10090996203127539604,0.99,50,49.50,0.99,0.99,0.0,50,83.0,1.660000,1.0,...,3.0,50,49394.0,987.880000,715.0,142.0,50,22176.0,443.520000,311.0
10134718954536800884,1.11,46,51.06,1.11,1.11,6.0,46,114.0,2.478261,2.0,...,1894.0,46,62215.0,1352.500000,1065.5,839.0,46,20768.0,451.478261,484.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952914128324669505,0.85,52,44.20,0.85,0.85,1.0,52,163.0,3.134615,3.0,...,1059.0,52,94405.0,1815.480769,1552.0,380.0,52,36619.0,704.211538,774.0
9961080040106933532,1.17,40,46.80,1.17,1.17,2.0,40,111.0,2.775000,2.0,...,1064.0,40,67647.0,1691.175000,1528.5,819.0,40,23387.0,584.675000,731.5
9963452121608988341,1.24,52,64.48,1.24,1.24,1.0,52,140.0,2.692308,2.0,...,890.0,52,80934.0,1556.423077,1341.5,203.0,52,28570.0,549.423077,603.0
9971642112346017660,1.08,40,43.20,1.08,1.08,2.0,40,115.0,2.875000,2.0,...,1341.0,40,61750.0,1543.750000,1336.0,867.0,40,21195.0,529.875000,549.0


In [7]:
# Percentage of missing values per feature over the whole dataset (= all player rows),
# sorted from most to least missing; only the 50 worst features are shown
pd.DataFrame(data.isna().sum().div(len(data)).mul(100).sort_values(ascending=False))[0:50]

Unnamed: 0,0
playlistName,100.0
rankedTeams,100.0
player.awards.backstab,100.0
playerStats.objectiveHack,99.998191
playerStats.objectiveMedalScoreKillSsRadarDrone,99.998191
playerStats.objectiveDestroyedVehicleMedium,99.998191
playerStats.objectiveTrophyDefense,99.998191
playerStats.objectiveBrPerseusLockerDoorOpenEe,99.998191
playerStats.objectiveBrForgottenLockerDoorOpenEe,99.998191
player.awards.simultaneous_kill,99.996381


In [8]:
# Some kpis can be missing at player level, but make more sense at match level:
# count, per feature, the matches where the feature is missing for every player
# (the per-match mean is then NaN).
# numeric_only=True: the frame also holds text / categorical columns, for which a
# mean is undefined; they are excluded explicitly instead of relying on pandas to
# drop them silently (recent pandas versions raise on them instead).
pd.DataFrame(data.groupby(by='matchID').mean(numeric_only=True).isna().sum().sort_values(ascending=False))[70:90]

Unnamed: 0,0
player.brMissionStats.missionStatsByType.masterassassination.xp,528
player.awards.triple,524
player.awards.explosive_stick,477
player.brMissionStats.missionStatsByType.timedrun.count,331
player.brMissionStats.missionStatsByType.timedrun.weaponXp,331
player.brMissionStats.missionStatsByType.timedrun.xp,331
player.awards.comeback,310
player.awards.air_to_air_kill,289
player.brMissionStats.missionStatsByType.scavenger.count,288
player.brMissionStats.missionStatsByType.scavenger.xp,288


##### small focus : outliers on DamageDone

In [11]:
# notes:
# EDA: one extreme outlier (+ some suspects) on damageDone, with max values way above average,
# but we decided to keep them, as we will encounter them "in real life" and we're using xgboost, which handles outliers well

In [10]:
# The 20 player rows with the highest damageDone, with their match context
data[['matchID', 'playerCount', 'squad', 'map', 'playerStats.damageDone']].sort_values(by='playerStats.damageDone', ascending=False).head(20)

Unnamed: 0,matchID,playerCount,squad,map,playerStats.damageDone
38613,1370069958461487008,52.0,Quads,fortkeep,4294967000.0
55090,855032827875055390,42.0,Quads,rebirth,12718.0
63532,10628305472550279382,40.0,Quads,rebirth,12391.0
10273,7362251540313581052,52.0,Quads,fortkeep,11487.0
39966,3765989571523210640,44.0,Trios,rebirth,11138.0
9986,1501974320764785975,40.0,Quads,rebirth,10565.0
13144,13547399530503207787,52.0,Quads,fortkeep,10535.0
50310,1766746682563678467,40.0,Quads,rebirth,10527.0
49386,2449047677272433734,39.0,Quads,rebirth,10477.0
46143,14089138655969584359,40.0,Quads,rebirth,10262.0


## XGB Prediction

### Train Test Split

Since each match spans several rows (one match = several players/rows), we split by match: <br>
all rows (players) sharing the same matchID go to either the train or the test dataset, never both. <br>
We keep 20% of the matchIDs for the test dataset. <br>

In [None]:
# Split by match, not by row: every player row of a given matchID must end up on
# the same side of the split.
random.seed(42)

# Sort the unique ids before sampling: the iteration order of a set of strings
# changes from one Python process to the next (hash randomization), so sampling
# from list(set(...)) would pick different matches after each kernel restart
# despite the fixed seed.
match_uuids = sorted(data["matchID"].unique())
split_ids = random.sample(match_uuids, int(20 * len(match_uuids) / 100))

# .copy(): train / test are transformed later on, they must not be views on `data`
is_test_row = data["matchID"].isin(split_ids)
test = data[is_test_row].copy()
train = data[~is_test_row].copy()

# no match may be shared between train and test
assert set(train["matchID"]).isdisjoint(test["matchID"])

display(train.head(2))
print(test.shape)
print(train.shape)

### Features engineering

Features selection & creation </br>
- A lot of back and forth there using `sklearn` default feature_importances, and `SHAP` </br>

Feature creation: besides the retained operations below, we also tried the following, with no improvement to our RMSE: </br>
- kills, deaths, damage / players count
- time played / game duration
- remove teams with lowest match placement (<= 30) to remove 'noise'/ rows with less information </br>
- aggregation by players groups : 3 tiers for top 10, 25-11 and 40-26 </br>
- aggregation max, mean, std for tier 1 (top 10), mean, std for the rest </br>
- aggregation mean / std on selected set of features, mean for the rest </br>
- aggregation mean / std on an extended set of features, mean for the rest <= best result so far </br>
- aggregation mean / std on ALL set of features </br>
- percentile_75 = lambda x: np.percentile(x, q=75) </br>
- percentile_25 = lambda x: np.percentile(x, q=25) </br>
- what we could add (but too much corr with existing ?) skewness, kurtosis </br>

#### Features selection

In [None]:
def select_features(df):
    """
    Retain the columns we will build features upon.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows. Must contain every column listed in `to_keep`
        (a missing column raises KeyError).

    Returns
    -------
    pandas.DataFrame
        An independent copy of `df` restricted to the retained columns, in
        the order of `to_keep`. Returning a copy (rather than a bare column
        selection) lets the next steps add or overwrite columns without
        triggering pandas' SettingWithCopyWarning.
    """
    to_keep = [
        "matchID",
        "utcEndSeconds",
        "map",
        "squad",
        "duration",
        "playerCount",
        "teamCount",
        "playerStats.kills",
        "playerStats.deaths",
        "playerStats.assists",
        "playerStats.scorePerMinute",
        "playerStats.headshots",
        "playerStats.rank",
        "playerStats.distanceTraveled",
        "playerStats.teamSurvivalTime",
        "playerStats.kdRatio",
        "playerStats.timePlayed",
        "playerStats.percentTimeMoving",
        "playerStats.damageDone",
        "playerStats.damageTaken",
        "player.awards.streak_5",
        "player.awards.double",
        "player.brMissionStats.missionsComplete",
    ]

    return df[to_keep].copy()

#### Encode time / categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder

def encode_features(df, encoder_path="src/model/ohe_encoder.pickle"):
    """
    Encode datetime and categorical (squad size, map type) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows holding at least "utcEndSeconds", "squad" and "map".
    encoder_path : str, optional
        Location of the pickled, already fitted one hot encoder applied to
        "map". Defaults to the path that was hard-coded so far.

    Returns
    -------
    pandas.DataFrame
        A new frame, index reset to 0..n-1, where "utcEndSeconds" is converted
        to datetime, "weekday", "hour" and "squad_ordinal" are added, and
        "squad" / "map" are replaced by their encoded columns.
        The frame passed by the caller is left untouched.
    """

    def encode_datetime(df):
        """
        Add day of week, hour, from timestamp
        """
        # NOTE: datetime.fromtimestamp() converts to *local* time, so "weekday"
        # and "hour" follow the timezone of the machine running this code.
        df["utcEndSeconds"] = df["utcEndSeconds"].apply(
            lambda x: datetime.fromtimestamp(x)
        )
        df["weekday"] = df["utcEndSeconds"].dt.weekday
        df["hour"] = df["utcEndSeconds"].dt.hour
        # "utcEndSeconds" is kept on purpose: perform_aggregations() still lists it

        return df

    def squad_to_ordinal(df):
        """
        label (ordinal) encoding for 'squad' (Solos, Duos, Trios...)
        (could also use one hot, but squad is kind of ordinal)
        """
        squad_order = {"Solos": 1, "Duos": 2, "Trios": 3, "Quads": 4}
        # a squad label missing from `squad_order` maps to NaN and makes the
        # integer cast below fail
        df["squad_ordinal"] = df["squad"].map(squad_order).astype("int64")

        return df.drop(columns="squad")

    def one_hot(df, column):
        """
        One Hot Encode one categorical column using sklearn
        ohe encoder previously fit when we built our model
        """
        # SECURITY: unpickling executes arbitrary code; only load an encoder
        # file that you produced yourself.
        with open(encoder_path, "rb") as f:
            enc = pickle.load(f)
        encoded_features = enc.transform(df[[column]]).toarray()

        df_features = pd.DataFrame(
            encoded_features,
            columns=enc.get_feature_names_out([column]).tolist(),
        )

        # both frames need the same 0..n-1 index to be concatenated side by side
        df = df.reset_index(drop=True)
        augmented_df = pd.concat([df, df_features], axis=1)

        return augmented_df.drop(columns=column)

    # work on a copy: the steps below add / overwrite columns and must neither
    # alter the caller's frame nor trigger SettingWithCopyWarning on a slice
    encoded_df = encode_datetime(df.copy())
    encoded_df = squad_to_ordinal(encoded_df)
    encoded_df = one_hot(encoded_df, column="map")

    return encoded_df

#### Retained created features

In [None]:
def create_new_features(df):
    """
    Create new features from existing features
    We calculated and tried a lot, but found those ones to work better

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows holding "hour", "playerStats.kills",
        "playerStats.deaths", "playerStats.damageDone",
        "playerStats.damageTaken", "playerStats.timePlayed" and
        "playerStats.headshots".

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the added columns "time_slot",
        "<stat>_by_timePlayed" (kills, deaths, damageDone, damageTaken),
        "playerStats.damageDone_by_kill" and "playerStats.headshots_by_kill".
        Ratios with a zero denominator are NaN (missing), never +/-inf.
    """

    def safe_ratio(numerator, denominator):
        """
        Element-wise division where x / 0 gives NaN instead of +/-inf:
        one single inf would turn the per-match mean into inf and the
        per-match std into NaN once players are aggregated.
        """
        return numerator.div(denominator).replace([np.inf, -np.inf], np.nan)

    def add_time_slot(df):
        """
        Hour (0-24) had not much effect, let's custom-bin it
        morning (1), noon (2), afternoon (3), evening (4), late evening (5)
        """
        slot_by_hour = {
            **dict.fromkeys(range(6, 11), 1),
            **dict.fromkeys(range(11, 14), 2),
            **dict.fromkeys(range(14, 18), 3),
            **dict.fromkeys(range(18, 22), 4),
            **dict.fromkeys((22, 23, 0, 1, 2, 3, 4, 5), 5),
        }
        df["time_slot"] = df["hour"].map(slot_by_hour)

        return df

    def normalize_by_time_played(df):
        """
        kills, damage... / time played
        """
        columns = [
            "playerStats.kills",
            "playerStats.deaths",
            "playerStats.damageDone",
            "playerStats.damageTaken",
        ]
        for col in columns:
            df[col + "_by_timePlayed"] = safe_ratio(
                df[col], df["playerStats.timePlayed"]
            )

        return df

    def damage_by_kill(df):
        """
        damageDone to get a kill (undefined, hence NaN, for players with 0 kill)
        """
        columns = [
            "playerStats.damageDone",
        ]
        for col in columns:
            df[col + "_by_kill"] = safe_ratio(df[col], df["playerStats.kills"])

        return df

    def headshot_by_kill(df):
        """
        headshot / kill
        """
        columns = [
            "playerStats.headshots",
        ]
        for col in columns:
            # + .1 to prevent inf / nan values
            df[col + "_by_kill"] = df[col].add(0.1).div(df["playerStats.kills"] + 0.1)

        return df

    # apply features creation, on a copy so that the caller's frame is not altered

    augmented_df = add_time_slot(df.copy())
    augmented_df = normalize_by_time_played(augmented_df)
    augmented_df = damage_by_kill(augmented_df)
    augmented_df = headshot_by_kill(augmented_df)

    return augmented_df

#### Features aggregation : keep one row (players' stats aggregation) per matchID

In [None]:
def perform_aggregations(df):
    """
    A match consist of +- 40 players (rows); we have only one single given target : "lobby kd"
    We aggregate players rows, sometimes adding new features, to keep one single array of features per match
    We tried others methods to aggregate (players placement, percentiles etc), but they did not add as much.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows holding "matchID" plus every column listed in
        `no_agg_columns` and `detailed_agg_columns` below.

    Returns
    -------
    pandas.DataFrame
        One row per matchID, with "matchID" as a regular column, followed by:
        the match-level columns, "<feature>_mean" / "_std" / "_median" for each
        detailed feature, and six "pct_players_*" columns, each one being a
        percentage (0-100) of the players of the match.
    """

    # those features won't be aggregated, because they're the same for all players
    no_agg_columns = [
        "utcEndSeconds",
        "duration",
        "playerCount",
        "teamCount",
        "weekday",
        "hour",
        "squad_ordinal",
        "map_fortkeep",
        "map_rebirth",
        "time_slot",
    ]

    # Features such as kills, deaths etc.. + newly created features are aggregated using, mean, std, median
    detailed_agg_columns = [
        "playerStats.rank",
        "playerStats.kdRatio",
        "playerStats.kills",
        "playerStats.deaths",
        "playerStats.assists",
        "playerStats.damageDone",
        "playerStats.damageTaken",
        "playerStats.kills_by_timePlayed",
        "playerStats.deaths_by_timePlayed",
        "playerStats.damageDone_by_timePlayed",
        "playerStats.damageTaken_by_timePlayed",
        "playerStats.damageDone_by_kill",
        "playerStats.scorePerMinute",
        "playerStats.teamSurvivalTime",
        "playerStats.timePlayed",
        "playerStats.percentTimeMoving",
        "player.awards.streak_5",
        "player.awards.double",
        "playerStats.headshots",
        "player.brMissionStats.missionsComplete",
        "playerStats.headshots_by_kill",
    ]

    by_match = df.groupby("matchID")

    # keep core features (do not vary per players) :
    df_core = by_match[no_agg_columns].agg("last")

    # perform mean, std, median groupby-agg on other features:
    df_detailed = by_match[detailed_agg_columns].agg(["mean", "std", "median"])
    df_detailed.columns = ["_".join(x) for x in df_detailed.columns]

    def pct_of_players(mask, name):
        """
        Percentage (0-100) of the players of each match for which `mask`
        (one boolean per player row) is True.
        """
        return mask.groupby(df["matchID"]).mean().mul(100).rename(name)

    kills = df["playerStats.kills"]

    # special aggregations: share of the players of a match meeting a condition.
    # All of them are percentages, as their names state: raw counts would not be
    # comparable between matches, whose number of players varies.
    special_aggregations = [
        pct_of_players(kills == 0, "pct_players_0_kills"),
        pct_of_players(kills >= 5, "pct_players_5_kills"),
        pct_of_players(kills >= 10, "pct_players_10_kills"),
        # awards are only filled for the players who earned them
        pct_of_players(
            df["player.awards.streak_5"].notnull(), "pct_players_with_streak_5"
        ),
        pct_of_players(
            df["player.awards.double"].notnull(), "pct_players_with_double"
        ),
        # "> 0" rather than "not null": a player with 0 headshot (or a missing
        # value) must not count as a player with headshots
        pct_of_players(
            df["playerStats.headshots"] > 0, "pct_players_with_headshots"
        ),
    ]

    # concatenate all columns (features), along matchID index
    df = pd.concat([df_core, df_detailed] + special_aggregations, axis=1).reset_index()

    return df

#### Complete Features engineering Pipeline 

In [None]:
def pipeline_transform(last_session):
    """
    Apply all above functions to get our data ready for prediction

    Parameters
    ----------
    last_session : pandas.DataFrame or None
        Player-level rows to transform. When None, the notebook dataset
        ('dataset_warzone_kd_bigger.parquet.gzip') is loaded instead.

    Returns
    -------
    pandas.DataFrame
        One row of model features per match, without the "matchID" and
        "utcEndSeconds" columns (identifier / datetime, not features).
    """
    # the argument used to be ignored: the whole dataset was re-read from disk
    # whatever was passed in
    if last_session is None:
        df = pd.read_parquet('dataset_warzone_kd_bigger.parquet.gzip')
    else:
        df = last_session

    df = select_features(df)
    df = encode_features(df)
    df = create_new_features(df)
    df = perform_aggregations(df)

    df = df.drop(["matchID", "utcEndSeconds"], axis=1)
    return df

### XGBOOST

#### Apply features engineering to train test data

In [None]:
# Use the functions defined in the sections above. The names previously called
# here (shrink_features, aggregate_players, and the helpers nested inside
# encode_features / create_new_features) do not exist at notebook level, so this
# cell failed on a fresh kernel.

# select_features() does not retain 'target': keep one value per match aside
# (it is identical for all the players of a match)
train_target = train.groupby('matchID')['target'].last()

train = select_features(train)
train = encode_features(train)  # reads the pickled one hot encoder from disk
train = create_new_features(train)
train = perform_aggregations(train)  # one row per match, matchID as a column

# utcEndSeconds is a datetime after encoding: weekday / hour / time_slot carry
# its information, it is not a model feature itself
train = train.drop(columns='utcEndSeconds')
train['target'] = train['matchID'].map(train_target)

train.head(1)

In [None]:
# Check dtypes and non-null counts of the engineered train features
train.info(max_cols=199, show_counts=True)

In [None]:
# Same transformation as for the train set, using the functions defined in the
# sections above (the names previously called here do not exist at notebook
# level, so this cell failed on a fresh kernel).

# select_features() does not retain 'target': keep one value per match aside
# (it is identical for all the players of a match)
test_target = test.groupby('matchID')['target'].last()

test = select_features(test)
test = encode_features(test)  # reads the pickled one hot encoder from disk
test = create_new_features(test)
test = perform_aggregations(test)  # one row per match, matchID as a column

# utcEndSeconds is a datetime after encoding: weekday / hour / time_slot carry
# its information, it is not a model feature itself
test = test.drop(columns='utcEndSeconds')
test['target'] = test['matchID'].map(test_target)

test.head(1)

#### Features / target split

In [None]:
# Separate the model inputs (X) from the value to predict (y) for each set.
# matchID is an identifier, not a feature, hence removed along with the target.
X_train = train.drop(columns=['matchID', 'target']).copy()
y_train = train['target'].copy()
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_test = test.drop(columns=['matchID', 'target']).copy()
y_test = test['target'].copy()
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


# Full dataset: train rows first, then test rows
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
print(f"full X: {X.shape}")
print(f"full y: {y.shape}")

#### Model with XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Sklearn API, default run, no hyper params tuning (except learning rate, found to be 0.03 - 0.05)
# early stopping set to 50 rounds to prevent overfitting
# NOTE(review): the test set is part of eval_set, so early stopping can be driven
# by test data - confirm this is intended, or hold out a separate validation set

reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=50, learning_rate=0.03)
reg.fit(
    X_train,
    y_train,
    verbose=10,
    eval_set=[(X_train, y_train), (X_test, y_test)]
)

#### Features importance

In [None]:
# using sklearn, default measure:
# importance of each feature of the fitted baseline model, 50 highest first
pd.DataFrame(data=reg.feature_importances_,
             index=reg.feature_names_in_).sort_values(by=0, ascending=False).head(50)

In [None]:
# using SHAP
import shap

# SHAP values of the baseline model, computed on the full dataset X (train + test rows)
explainer = shap.Explainer(reg)
shap_values = explainer(X)
# waterfall plot for one single row: the first one of X
shap.plots.waterfall(shap_values[0], max_display=25)

In [None]:
# beeswarm plot of the SHAP values computed above, over all rows of X
shap.plots.beeswarm(shap_values)

In [None]:
# bar plot of the SHAP values computed above, limited to 25 entries
shap.plots.bar(shap_values, max_display=25)

#### Hyperparameters tuning

In [None]:
# Random search over a parameter grid, using RandomizedSearchCV() from Sklearn

from sklearn.model_selection import RandomizedSearchCV

# the ranges below were narrowed down through several earlier back and forth runs:
params = {
    'learning_rate': [0.03, 0.031, 0.032, 0.033, 0.034],
    'max_depth': [3],
    'subsample': [0.6],
    'colsample_bytree': [0.7, 0.75, 0.77],
    'colsample_bylevel': [0.7, 0.75, 0.77],
    'n_estimators': [230, 240, 250, 280],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [0, 0.5]
}

# 200 parameter combinations are sampled from `params`; the search only sees the
# train set (X_train, y_train)
xgbr = xgb.XGBRegressor(seed = 20)
clf = RandomizedSearchCV(estimator=xgbr,
                         n_jobs= -1,
                         refit=True, #default would be True anyways
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=200,
                         verbose=1,
                         random_state=42)
clf.fit(X_train, y_train)
# scoring is the negative MSE: flip the sign and take the square root to report an RMSE
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))
print("Best parameters:\n", clf.best_params_)

In [None]:
# Keep the best parameters found by the search, and the matching estimator
# (the search was run with refit=True)
best_pars = clf.best_params_
best_model = clf.best_estimator_

#### Final evaluation on Testing set

In [None]:
# when trained on the train dataset only:
# RMSE on the test set, i.e. on matches the model has not been fitted on

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [None]:
# when trained on the union of the train and test sets (X, y):
# NOTE: X contains the rows of X_test, so the RMSE below is computed on data the
# model was fitted on; it is an in-sample figure, not a measure of generalization.
# `best_model` stays fitted on all the data after this cell.

best_model.fit(X, y)
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [None]:
# save best model with sklearn
# best_model.save_model('xgb_model_lobby_kd.json')

### Save / load / predict

In [None]:
# save: write `best_model`, as last fitted above, to a JSON file
best_model.save_model('xgb_model_lobby_kd_2.json')

In [None]:
# load: read the JSON file written above into a fresh regressor
model = xgb.XGBRegressor()
model.load_model("xgb_model_lobby_kd_2.json")

In [None]:
# predict with the reloaded model: one value per row (match) of X_test
predictions = model.predict(X_test)
predictions

In [None]:
# Test features with, side by side, the actual value ('target' of the test set,
# stored as 'wzranked_kd') and the model prediction ('predicted_kd')
full_prediction = X_test.copy(deep=True)
full_prediction['wzranked_kd'] = test['target']
full_prediction['predicted_kd'] = predictions
full_prediction

In [None]:
# Names of the features fed to the model, in column order
X_test.columns.tolist()

### misc Pipeline / export

Export the one-hot encoder, to be used later in production

In [None]:
# Reload the raw dataset. This overwrites `data`: the 'category' conversion and
# the 'lobbykd' -> 'target' renaming done at the top of the notebook are lost.
data = pd.read_parquet('dataset_warzone_kd_bigger.parquet.gzip')

In [None]:
# Fit a one hot encoder on the 'map' column, then pickle it.
# handle_unknown='ignore' is set so that a map not seen here does not raise at transform time.
# NOTE(review): the file is written to 'ohe_encoder.pickle' in the working directory,
# while encode_features() reads "src/model/ohe_encoder.pickle" - confirm the file is
# moved there, and that it exists before encode_features() is first called.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(data[['map']])
with open('ohe_encoder.pickle', 'wb') as f:
    pickle.dump(enc, f)