## Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Load Dataset

In [2]:
# Read the processed matches CSV (comma-separated) into a DataFrame.
data = pd.read_csv(
    '../data/processed/matches-2018-11-18T19:33:05.278801.csv',
    sep=',',
)

## Visualize Data

In [3]:
# Deprecated columns, or not needed
useless_columns = [
    'id', 'killPoints', 'killPointsDelta', 'rankPoints', 'winPoints',
    'winPointsDelta', 'stats', 'tags', 'createdAt', 'name', 'playerId',
    'match_id', 'group_id',
]
# Value that we are predicting
y_column = 'winPlace'

# Candidate feature columns: everything that is neither dropped nor the target.
# Built by filtering `data.columns` (instead of subtracting sets) so the order
# is the dataset's own column order and is identical on every kernel restart;
# set iteration order for strings is not stable between Python processes.
excluded_columns = set(useless_columns) | {y_column}
columns = [name for name in data.columns if name not in excluded_columns]

# Optional exploration: scatter plot of every feature against the target.
# for col in columns:
#     fig = plt.figure()
#     plt.title("{} vs. {}".format(col, y_column))
#     plt.scatter(data[col], data[y_column])
#     #plt.savefig("figures/{}vs{}".format(col, y_column))
#     plt.show(block=False)
#     plt.close(fig)
    

## Split Sets

In [4]:
# Fixed seed for the random split so the train/test partition (and therefore
# every downstream metric) is the same on each Restart & Run All.
SPLIT_SEED = 42

# NOTE: this rebinds `data`; re-running this cell without reloading the CSV
# raises KeyError because the columns have already been dropped.
data = data.drop(columns=useless_columns)

# Grab 80% of data for training at random (seeded), the remaining 20% is the test set
train = data.sample(frac=0.8, random_state=SPLIT_SEED)
test = data.drop(train.index)

# Separate the target column from the feature columns for both splits
train_Y = train[y_column]
train_X = train.drop(columns=[y_column])

test_Y = test[y_column]
test_X = test.drop(columns=[y_column])

## Pre-processing

In [5]:
# Keep an independent snapshot of the un-encoded training features.
# A plain assignment would only alias the same DataFrame object.
backup = train_X.copy()

# Vectorization
categorical_columns = ['deathType', 'mapName', 'seasonState', 'gameMode', 'titleId', 'shardId', 'isCustomMatch']#, 'group_id', 'match_id'
# Every non-categorical feature, in the frame's own (deterministic) column order.
# Computed before encoding so the generated one-hot columns are not included.
other_columns = [name for name in train_X.columns if name not in categorical_columns]


def one_hot_encode(frame, column, categories):
    """Return a new frame with `column` replaced by one-hot indicator columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing `column`; it is not modified.
    column : str
        Name of the categorical column to expand.
    categories : list
        Ordered category values. One float indicator column named
        '<column>-<position>' is produced per category, in this order.

    Returns
    -------
    pd.DataFrame
        `frame` without `column`, followed by the indicator columns. A value
        that is not in `categories` yields a row of all zeros instead of
        raising, so a split containing unseen categories can still be encoded.
    """
    as_categorical = pd.Series(
        pd.Categorical(frame[column], categories=categories),
        index=frame.index,
    )
    # get_dummies emits one column per declared category (even unused ones),
    # which keeps the train and test frames column-aligned.
    indicators = pd.get_dummies(as_categorical).astype(float)
    indicators.columns = ['{}-{}'.format(column, i) for i in range(len(categories))]
    return frame.drop(columns=[column]).join(indicators)


for c in categorical_columns:
    # Categories are learned from the training split only and then applied
    # unchanged to the test split; sorting gives each category a stable position.
    categories = sorted(train_X[c].dropna().unique())
    train_X = one_hot_encode(train_X, c, categories)
    test_X = one_hot_encode(test_X, c, categories)

    

In [6]:
# Scale
# Each non-categorical column gets its own MinMaxScaler, fitted on the
# training values only and then applied to both splits.
for column in other_columns:
    scaler = MinMaxScaler()
    train_values = train_X[column].values.reshape(-1, 1)
    test_values = test_X[column].values.reshape(-1, 1)
    scaler.fit(train_values)
    train_X[column] = scaler.transform(train_values)
    test_X[column] = scaler.transform(test_values)



## Logistic Regression

In [7]:
# Display the (rows, columns) shape of the training feature frame
train_X.shape

(75444, 41)

In [None]:
# Fit a logistic regression with default settings on the training split
# (fit returns the estimator itself, so it can be chained).
lr = LogisticRegression().fit(X=train_X, y=train_Y)

# Predict the target for the held-out split and report exact-match accuracy
pred = lr.predict(test_X)
accuracy = accuracy_score(test_Y, pred)

print('Accuracy: ' + str(accuracy))

# 