## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


## Load Dataset

In [2]:
# Read the processed matches CSV (comma-separated) into a DataFrame.
matches_csv = '../data/processed/matches-2018-11-18T19:33:05.278801.csv'
data = pd.read_csv(matches_csv, sep=',')

In [3]:
# Columns removed from the data before modelling
# (selection based on a previous investigation).
useless_columns = [
    'id',
    'killPoints',
    'killPointsDelta',
    'rankPoints',
    'winPoints',
    'winPointsDelta',
    'stats',
    'tags',
    'createdAt',
    'name',
    'playerId',
    'lastWinPoints',
    'lastKillPoints',
    'titleId',
    'shardId',
    'seasonState',
    'isCustomMatch',
]

# Name of the column used as the prediction target.
y_column = 'winPlace'

## Split Sets

In [4]:
# Remove the columns that carry no useful signal before splitting.
data = data.drop(columns=useless_columns)

# Grab 80% of the rows for training at random; the remaining 20% form the test set.
# The sample is seeded so that re-running the notebook reproduces the same split
# (and therefore the same reported error).
train = data.sample(frac=0.8, random_state=42)
test = data.drop(train.index)

# drop(train.index) removes rows by index label, so this only yields the exact
# complement of `train` when the index labels are unique. Fail loudly otherwise.
assert len(train) + len(test) == len(data), 'train/test split does not partition the data'

# Separate the target from the features in each split.
train_Y = train[y_column]
train_X = train.drop(columns=[y_column])

test_Y = test[y_column]
test_X = test.drop(columns=[y_column])

## Pre-processing

In [5]:
# Group the feature columns by the preprocessing each group receives.

# Columns that get expanded into indicator (one-hot) columns.
categorical_columns = ['deathType', 'mapName', 'gameMode']
# Identifier columns that get replaced by integer codes.
ordinal_columns = ['group_id', 'match_id']
# Everything else: no encoding needed, only scaling.
# (Built from a set, so the order of this list is arbitrary.)
other_columns = list(
    set(train_X.columns.values).difference(categorical_columns, ordinal_columns)
)

In [6]:
# Vectorize categorical columns
#
# Each categorical column is replaced by one 0/1 indicator column per category
# seen in the training rows, named '<column>-<code>' where <code> is the integer
# assigned by LabelEncoder. The indicator rows are looked up in an identity
# matrix instead of going through OneHotEncoder, so the cell does not depend on
# OneHotEncoder attributes/arguments that differ between scikit-learn releases
# (`active_features_`, `sparse`).
#
# NOTE: le.transform raises ValueError if the test rows contain a category that
# never occurs in the training rows.
for c in categorical_columns:
    le = LabelEncoder()

    # Integer code (0 .. n_categories-1) for every training row; the original
    # column is removed from the frame at the same time.
    train_codes = le.fit_transform(train_X.pop(c))

    n_categories = len(le.classes_)
    indicator_names = ['{}-{}'.format(c, i) for i in range(n_categories)]
    # Row i of the identity matrix is the one-hot vector for code i.
    one_hot_rows = np.eye(n_categories)

    train_X = train_X.join(
        pd.DataFrame(one_hot_rows[train_codes], columns=indicator_names, index=train_X.index)
    )

    # Apply the mapping learned on the training rows to the test rows.
    test_codes = le.transform(test_X.pop(c))
    test_X = test_X.join(
        pd.DataFrame(one_hot_rows[test_codes], columns=indicator_names, index=test_X.index)
    )

In [7]:
# Encode ids
#
# Replace every identifier with an integer code. The encoder is fitted on the
# identifiers of BOTH splits: with a row-level random split an id can end up
# only in the test rows, and LabelEncoder.transform raises ValueError for labels
# it has not seen during fit. The codes carry no target information, so fitting
# on the union does not leak anything about the test targets.
for c in ordinal_columns:
    le = LabelEncoder()
    le.fit(pd.concat([train_X[c], test_X[c]]))
    train_X[c] = le.transform(train_X[c])
    test_X[c] = le.transform(test_X[c])

In [8]:
# Scale
#
# Min-max scale every remaining feature, one column at a time. The scaling
# range is learned from the training rows only and then re-used for the test
# rows, so test values may fall outside the training range.
for feature in other_columns:
    scaler = MinMaxScaler()

    # MinMaxScaler expects 2-D input: turn each column into an (n_rows, 1) array.
    train_column = train_X[feature].values.reshape(-1, 1)
    test_column = test_X[feature].values.reshape(-1, 1)

    scaler.fit(train_column)
    train_X[feature] = scaler.transform(train_column)
    test_X[feature] = scaler.transform(test_column)



## Gradient Boosting

In [9]:
# Candidate values for every hyper-parameter to tune.
# The full Cartesian product is 20 * 6 * 4 * 3 * 3 * 11 * 3 = 142,560 combinations.
params = dict(
    # 10, 20, ..., 200
    n_estimators=list(range(10, 201, 10)),
    learning_rate=[0.01, 0.05, 0.1, 0.5, 1.0, 2.0],
    loss=['ls', 'lad', 'huber', 'quantile'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    # 10, 20, ..., 110
    max_depth=list(range(10, 111, 10)),
    max_features=['auto', 'sqrt', 'log2'],
)

In [10]:
# Seed the estimator: the grid includes max_features='sqrt'/'log2', which makes
# each tree consider a random subset of features, so unseeded runs are not
# reproducible.
gbr = GradientBoostingRegressor(random_state=42)

# Exhaustive search over every combination in `params`, using all CPU cores.
# cv is pinned to 3 folds instead of relying on the library default, which
# changed from 3 to 5 in scikit-learn 0.22.
gridsearch = GridSearchCV(gbr, param_grid=params, cv=3, n_jobs=-1, verbose=2)

gridsearch.fit(X=train_X, y=train_Y)

# With the default refit=True the best candidate is refitted on the whole
# training set, and predict() delegates to that refitted model.
pred = gridsearch.predict(test_X)

print('Best params: {}'.format(gridsearch.best_params_))

# Mean squared error on the held-out test rows.
print('Error: {}'.format(mean_squared_error(test_Y, pred)))

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
[CV] learning_rate=0.01, loss=linear, n_estimators=10 ................
[CV] learning_rate=0.01, loss=linear, n_estimators=10 ................
[CV] learning_rate=0.01, loss=linear, n_estimators=10 ................
[CV] learning_rate=0.01, loss=linear, n_estimators=20 ................
[CV] . learning_rate=0.01, loss=linear, n_estimators=10, total=   4.7s
[CV] . learning_rate=0.01, loss=linear, n_estimators=10, total=   4.7s
[CV] . learning_rate=0.01, loss=linear, n_estimators=10, total=   4.6s
[CV] learning_rate=0.01, loss=linear, n_estimators=20 ................
[CV] learning_rate=0.01, loss=linear, n_estimators=20 ................
[CV] learning_rate=0.01, loss=linear, n_estimators=30 ................
[CV] . learning_rate=0.01, loss=linear, n_estimators=20, total=   8.9s
[CV] learning_rate=0.01, loss=linear, n_estimators=30 ................
[CV] . learning_rate=0.01, loss=linear, n_estimators=20, total=   9.0s
[CV] . learni

KeyboardInterrupt: 