In [1]:
import os
import numpy as np
import pandas as pd
from joblib import dump

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Dataset

In [2]:
# Read the full training table, then keep one independent copy of the rows
# belonging to each distinct `playlist_id` so later cells can modify a
# playlist's frame without touching `df` or the other playlists.
df = pd.read_csv('../data/train.csv')

types = df['playlist_id'].unique()

data = {
    key: df.loc[df['playlist_id'].eq(key)].copy()
    for key in types
}

# Report how many rows/columns each playlist contributed.
for key in data:
    print(f'{key} shape: {data[key].shape}')

ranked-doubles shape: (28494, 96)
ranked-duels shape: (3394, 96)
ranked-standard shape: (17542, 96)


## Feature Selection

In [3]:
# Prune unreliable `stats*` columns: within each playlist frame, a stats column
# is removed when its share of missing values is strictly above `max_nulls`.
max_nulls = 0.02  # i.e. drop columns with more than 2% null values


def _stats_columns(frame):
    """Return the names of the columns of `frame` that start with 'stats'."""
    return [c for c in frame.columns if c.startswith('stats')]


for key, frame in data.items():
    cols_ = _stats_columns(frame)
    print(f'{key} has {len(cols_)} stats columns')

    # Fraction of missing values per stats column, relative to the row count.
    nulls = frame[cols_].isna().sum().div(len(frame))
    nulls_ = nulls.loc[nulls.gt(max_nulls)].index
    for col in nulls_:
        print(f'\tDropping {col} from {key} due to {nulls[col]:.2%} null values')

    # `frame` is the very object stored in `data[key]`, so the in-place drop
    # updates the dictionary entry directly.
    frame.drop(columns=nulls_, inplace=True)

    cols_ = _stats_columns(frame)
    print(f'{key} has {len(cols_)} stats columns after dropping nulls')


ranked-doubles has 84 stats columns
	Dropping stats.positioning.goals_against_while_last_defender from ranked-doubles due to 30.27% null values
ranked-doubles has 83 stats columns after dropping nulls
ranked-duels has 84 stats columns
	Dropping stats.positioning.avg_distance_to_mates from ranked-duels due to 100.00% null values
	Dropping stats.positioning.goals_against_while_last_defender from ranked-duels due to 9.19% null values
ranked-duels has 82 stats columns after dropping nulls
ranked-standard has 84 stats columns
	Dropping stats.positioning.goals_against_while_last_defender from ranked-standard due to 47.41% null values
ranked-standard has 83 stats columns after dropping nulls


In [4]:
# Replace every remaining missing value with 0. Each frame is modified in
# place, so the entries of `data` keep pointing at the same objects.
for key, frame in data.items():
    frame.fillna(0, inplace=True)

# Train

In [5]:
# Train one random-forest regressor per playlist to predict `tier`, print
# hold-out metrics, and collect each fitted model together with the ordered
# list of feature columns it was trained on.

# Columns excluded from the feature set alongside the target.
headers = [
    'replay_id',
    'player_id',
    'date',
    'playlist_id',
    'team',
    'id',
]

# Columns that must not be used as features; `tier` is the prediction target.
denylist = [
    'tier',
]

# Fixed seed shared by the train/test split and the forest so that a fresh
# "Restart & Run All" reproduces the same split, model and metrics.
SEED = 42

# Built once: the set of column names that are never features.
excluded = set(headers + denylist)

models = {}
for key in data:
    print(f'Training {key}...')
    cols = [c for c in data[key].columns if c not in excluded]

    y = data[key]['tier']
    X = data[key][cols]

    # NOTE(review): this is a plain row-level split; if rows sharing a
    # `replay_id`/`player_id` are correlated, a grouped split may give a more
    # honest estimate. Confirm before relying on these scores.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=SEED
    )
    model = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=SEED)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Compute MSE once and derive RMSE from it.
    mse = metrics.mean_squared_error(y_test, y_pred)

    print(f'\t{key} MAE: {metrics.mean_absolute_error(y_test, y_pred)}')
    print(f'\t{key} MSE: {mse}')
    print(f'\t{key} RMSE: {np.sqrt(mse)}')
    print(f'\t{key} R2 Score: {metrics.r2_score(y_test, y_pred)}')
    print(f'\t{key} Explained Variance Score: {metrics.explained_variance_score(y_test, y_pred)}')
    print(f'\t{key} Max Error: {metrics.max_error(y_test, y_pred)}')
    # "Accuracy" here means the regression output, rounded to the nearest
    # whole number, matches the true tier exactly.
    print(f'\t{key} Accuracy: {metrics.accuracy_score(y_test, y_pred.round())}')

    # Keep the feature list next to the model so callers can pass columns in
    # the same order they were seen during fitting.
    models[key] = {
        'model': model,
        'features': cols,
    }

Training ranked-doubles...
	ranked-doubles MAE: 2.2744104435710275
	ranked-doubles MSE: 9.787269792251545
	ranked-doubles RMSE: 3.1284612499200857
	ranked-doubles R2 Score: 0.7509988440114177
	ranked-doubles Explained Variance Score: 0.751222466187124
	ranked-doubles Max Error: 17.7
	ranked-doubles Accuracy: 0.1624087591240876
Training ranked-duels...
	ranked-duels MAE: 2.1500588928150766
	ranked-duels MSE: 8.65432273262662
	ranked-duels RMSE: 2.94182302877427
	ranked-duels R2 Score: 0.8169305010568828
	ranked-duels Explained Variance Score: 0.8172881901293786
	ranked-duels Max Error: 18.7
	ranked-duels Accuracy: 0.1625441696113074
Training ranked-standard...
	ranked-standard MAE: 2.139420884632923
	ranked-standard MSE: 8.015629274965802
	ranked-standard RMSE: 2.8311886682038345
	ranked-standard R2 Score: 0.7394585544145678
	ranked-standard Explained Variance Score: 0.7404538651979369
	ranked-standard Max Error: 17.5
	ranked-standard Accuracy: 0.16233470132238942


In [6]:
# Persist each playlist's bundle (fitted model + feature list) as
# `../models/<playlist>.joblib`.
# `dump` writes to the given path as-is and fails when the parent directory is
# missing, so make sure the output directory exists first.
os.makedirs('../models', exist_ok=True)

for key in models:
    dump(models[key], f'../models/{key}.joblib')