## Import libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error

## Load Dataset

In [2]:
# Processed match export; the relative path is resolved against the kernel's working directory.
matches_csv = '../data/processed/matches-2018-11-18T19:33:05.278801.csv'
data = pd.read_csv(matches_csv, sep=',')

In [3]:
# Columns removed before modelling. The selection comes from an earlier
# investigation that is not part of this notebook.
useless_columns = [
    'id', 'killPoints', 'killPointsDelta', 'rankPoints',
    'winPoints', 'winPointsDelta', 'stats', 'tags',
    'createdAt', 'name', 'playerId', 'lastWinPoints',
    'lastKillPoints', 'titleId', 'shardId', 'seasonState',
    'isCustomMatch',
]

# Name of the regression target column.
y_column = 'winPlace'

## Split Sets

In [4]:
# Remove the columns listed in `useless_columns` before splitting.
# NOTE: this rebinds `data`, so re-running the cell without reloading the CSV
# raises a KeyError because the columns are already gone.
data = data.drop(columns=useless_columns)

# Fixed seed so the 80/20 split, and every score computed from it, is the same on each run.
SPLIT_SEED = 42

# Grab 80% of data for training at random; the rows that were not sampled form the test set.
# Dropping by `train.index` assumes the index labels are unique (true for the
# default index produced by `read_csv`) - TODO confirm if the loader changes.
train = data.sample(frac=0.8, random_state=SPLIT_SEED)
test = data.drop(train.index)

# Separate the target column from the features in both splits.
train_Y = train[y_column]
train_X = train.drop(columns=[y_column])

test_Y = test[y_column]
test_X = test.drop(columns=[y_column])

## Pre-processing

In [5]:
# Create subsets of columns

# Columns that are one-hot encoded further down.
categorical_columns = ['deathType', 'mapName', 'gameMode']
# Identifier columns that are integer-encoded further down.
ordinal_columns = ['group_id', 'match_id']
# Ordinal columns are different from other_columns because they need to be encoded.
# Every remaining feature column is min-max scaled. The list is built in
# DataFrame column order (rather than from a set) so that its order is the same
# on every run instead of depending on string hashing.
_encoded_columns = set(categorical_columns) | set(ordinal_columns)
other_columns = [name for name in train_X.columns if name not in _encoded_columns]

In [6]:
# Vectorize categorical columns

def _one_hot(values, levels, names):
    """One-hot encode `values` against the fixed `levels`, one float column per level.

    Parameters
    ----------
    values : pandas.Series
        Raw categorical values; its index is kept on the result.
    levels : array-like
        The categories to encode, in column order.
    names : list of str
        Column names for the result, one per level.

    Returns
    -------
    pandas.DataFrame
        Float indicator columns. A value that is not in `levels` (including a
        missing value) yields an all-zero row instead of raising.
    """
    dummies = pd.get_dummies(pd.Categorical(values, categories=levels), dtype=float)
    dummies.columns = names
    dummies.index = values.index
    return dummies


# The encoding is done with pandas rather than LabelEncoder + OneHotEncoder:
# `OneHotEncoder(sparse=...)` and `active_features_` no longer exist in current
# scikit-learn, and `LabelEncoder.transform` raises on categories that only
# occur in the test split.
for c in categorical_columns:
    train_values = train_X.pop(c)
    test_values = test_X.pop(c)

    # Levels are learned from the training split only; for sortable values they
    # are the sorted unique values, so position i matches the integer code the
    # previous LabelEncoder step assigned and the column names are unchanged.
    levels = pd.Categorical(train_values).categories
    names = ['{}-{}'.format(c, i) for i in range(len(levels))]

    train_X = train_X.join(_one_hot(train_values, levels, names))
    test_X = test_X.join(_one_hot(test_values, levels, names))

In [7]:
# Encode ids

# Replace each identifier with an integer code learned from the training split.
for c in ordinal_columns:
    le = LabelEncoder()
    train_X[c] = le.fit_transform(train_X[c])
    # `le.transform` raises ValueError for an id that only occurs in the test
    # split, which a random row split can produce. Going through a Categorical
    # built on the fitted classes gives the same code for every known id and
    # -1 for an unseen one.
    test_codes = pd.Categorical(test_X[c], categories=le.classes_).codes
    test_X[c] = test_codes.astype('int64')

In [8]:
# Scale

# Min-max scale every column in `other_columns`. Each column gets its own
# scaler, fitted on the training split and reused unchanged on the test split.
for column in other_columns:
    scaler = MinMaxScaler()
    train_column = train_X[column].values.reshape(-1, 1)
    test_column = test_X[column].values.reshape(-1, 1)
    train_X[column] = scaler.fit_transform(train_column)
    test_X[column] = scaler.transform(test_column)



## Linear Regression

In [9]:
# Fit each linear model on the training split and print its mean squared error
# on the test split. The cross-validated variants also print the alpha they
# selected. After the loop `lr` and `pred` refer to the Lasso model, as before.
for message, lr in (
    ('Regular Linear Regression Error: {}', LinearRegression()),
    ('Linear Regression with Ridge Error: {}', RidgeCV()),
    ('Linear Regression with Lasso Error: {}', LassoCV()),
):
    lr.fit(X=train_X, y=train_Y)

    pred = lr.predict(test_X)

    print(message.format(mean_squared_error(test_Y, pred)))
    if isinstance(lr, (RidgeCV, LassoCV)):
        print('Best alpha: {}'.format(lr.alpha_))

Regular Linear Regression Error: 66.30482696328612
Linear Regression with Ridge Error: 66.30982640554949
Best alpha: 0.1
Linear Regression with Lasso Error: 108.31291443128379
Best alpha: 0.3896464250485547
