## Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Load Dataset

In [2]:
data = pd.read_csv('../data/processed/matches-2018-11-18T19:33:05.278801.csv', delimiter=',')

In [3]:
# Based on previous investigation
useless_columns = ['id', 'killPoints', 'killPointsDelta', 'rankPoints', 'winPoints', 'winPointsDelta',
                   'stats', 'tags', 'createdAt', 'name', 'playerId', 'lastWinPoints', 'lastKillPoints',
                   'titleId', 'shardId', 'seasonState', 'isCustomMatch', 'mostDamage', 'groupRank']
y_column = 'winPlace'

## Split Sets

In [4]:
data = data.drop(columns=useless_columns)

# Grab 80% of data for training at random
train = data.sample(frac=0.8)
test = data.drop(train.index)

train_Y = train[y_column]
train_X = train.drop(columns=[y_column])

test_Y = test[y_column]
test_X = test.drop(columns=[y_column])

## Pre-processing

In [5]:
# Create subsets of columns

categorical_columns = ['deathType', 'mapName', 'gameMode']
ordinal_columns = ['groupId', 'matchId']
# Ordinal columns are different from other_columns because they need to be encoded
other_columns = list(set(train_X.columns.values) - set(categorical_columns) - set(ordinal_columns))

In [6]:
# Vectorize categorical columns

for c in categorical_columns:
    le = LabelEncoder()
    train_X[c] = le.fit_transform(train_X[c])
    ohe = OneHotEncoder(sparse = False)
    train_X = train_X.join(pd.DataFrame(list(ohe.fit_transform(train_X.pop(c).values.reshape(-1, 1))), columns = ['{}-{}'.format(c, i) for i in ohe.active_features_], index = train_X.index))
    test_X[c] = le.transform(test_X[c])
    test_X = test_X.join(pd.DataFrame(list(ohe.transform(test_X.pop(c).values.reshape(-1, 1))), columns = ['{}-{}'.format(c, i) for i in ohe.active_features_], index = test_X.index))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
# Encode ids

for c in ordinal_columns:
    le = LabelEncoder()
    train_X[c] = le.fit_transform(train_X[c])
    test_X[c] = le.transform(test_X[c])

In [8]:
# Scale

for c in other_columns:
    s = MinMaxScaler()
    train_X[c] = s.fit_transform(train_X[c].values.reshape(-1, 1))
    test_X[c] = s.transform(test_X[c].values.reshape(-1, 1))



## Random Forest

In [12]:
rfr = RandomForestRegressor()
rfr.fit(X=train_X, y=train_Y)

pred = rfr.predict(test_X)

pred_test = rfr.predict(train_X)

print('Error: {}'.format(mean_squared_error(test_Y, pred)))
print('Train Error: {}'.format(mean_squared_error(train_Y, pred_test)))



Error: 6.637188378134774
Train Error: 1.1948727533004615


## Feature Importances

In [10]:
# Source: https://towardsdatascience.com/running-random-forests-inspect-the-feature-importances-with-this-code-2b00dd72b92e

feature_importances = pd.DataFrame(rfr.feature_importances_,
                                   index = train_X.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Unnamed: 0,importance
groupCount,0.405034
walkDistance,0.381292
timeSurvived,0.090663
killPlace,0.083551
duration,0.016929
mapName-3,0.004031
playerCount,0.002473
deathType-0,0.002248
matchId,0.002043
groupId,0.001422
