In [1]:

import pandas as pd
import numpy as np
import pickle
import dataset
from importlib import reload
reload(dataset)
from sklearn import linear_model
from sklearn import metrics

In [63]:
# Load combined data
df = pd.read_csv('../data/combined.csv')

# Load hero feature data
df_features = pd.read_csv('../data/features.csv')
df_features = df_features.set_index('hero_id')

# Load standard filter
df_filters = pd.read_csv('../models/filters.csv')

In [10]:
def get_filt_idx(filt):
    '''Returns indices of desired matches given a boolean array filter e.g. True, False, True returns [0,2]'''
    # DotaV1 data handling (two graphs for every match: 0-49999 radiant, 0-49999 dire, 50000-99999 radiant, etc.)
    step = 50000
    filt_vals = []
    for i in range(0,int(np.ceil(len(filt)/step))):
        start = i*step
        end = start+step
        # Add filters for match range twice, as matches repeated every 50000
        filt_vals = np.append(filt_vals, filt[start:end])
        filt_vals = np.append(filt_vals, filt[start:end])

    # Get indices of True values in filters
    filt_idx = [i for i, x in enumerate(filt_vals) if x]
    return filt_idx

### Data generation

In [None]:
# Graph data already has the scaled features and match results
# Load graph dataset 50000 matches at a time
dir = '../data/graphs_v1_scaled/'
count = 0
total = len(df)
step = 50000

for i in range(0,int(np.ceil(total/step))):
    start = i*step
    end = start+step-1 if (start+step)<total else total-1
    path = dir+f'graphs_v1_scaled_{start}-{end}.pkl'
    print(path)
    file = open(path,'rb')
    if i==0:
        graphs = pickle.load(file)
    else:
        graphs = graphs + pickle.load(file)

In [None]:
# Create X matrix (predictors) and y array (response) from graph data
# Training
X = np.empty([len(graphs), 20])
y = np.empty(len(graphs))

print('Converting graphs to X and y')
for i, graph in enumerate(graphs):
    if i%1000000==0:
        print(i)
    X[i,:] = np.mean(graphs[i].x,0)
    y[i] = graphs[i].y

print('Saving X and y numpy')
with open('../data/standard_v1/X.npy', 'wb') as f:
    np.save(f, X)
with open('../data/standard_v1/y.npy', 'wb') as f:
    np.save(f, y)

### Modelling

#### All features

In [42]:
# Load X and y data
with open('../data/standard_v1/X.npy', 'rb') as f:
    X = np.load(f)
with open('../data/standard_v1/y.npy', 'rb') as f:
    y = np.load(f)

# Filter data
# Standard filter
filt = get_filt_idx(df_filters['filt_std'])
X_filt = X[filt]
y_filt = y[filt]

# Removed attack backswing feature
X_filt = np.delete(X_filt,14,1)

In [44]:
# Training/validation/test data split
np.random.seed(10)
idxs = np.random.permutation(len(y_filt))
split_va, split_te = int(0.64 * len(y_filt)), int(0.8 * len(y_filt)) #64% training, 16% validation, 20% test
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
X_tr, X_va, X_te = X_filt[idx_tr], X_filt[idx_va], X_filt[idx_te]
y_tr, y_va, y_te = y_filt[idx_tr], y_filt[idx_va], y_filt[idx_te]

print(f'Training data: {np.round(len(y_tr)/len(y_filt),2)*100}%')
print(f'Validation data: {np.round(len(y_va)/len(y_filt),2)*100}%')
print(f'Test data: {np.round(len(y_te)/len(y_filt),2)*100}%')

Training data: 64.0%
Validation data: 16.0%
Test data: 20.0%


In [45]:
# Fit model
regr = linear_model.LogisticRegression(penalty='l1',
    solver='saga',  # or 'liblinear'
    C=10000)
regr = linear_model.LogisticRegression()
regr.fit(X_tr, y_tr)

In [46]:
y_pred_tr = regr.predict(X_tr)
y_pred_va = regr.predict(X_va)
print(f'Train Accuracy: {np.round(metrics.accuracy_score(y_tr, y_pred_tr)*100,2)}%')
print(f'Validation Accuracy: {np.round(metrics.accuracy_score(y_va, y_pred_va)*100,2)}%')

Train Accuracy: 52.29%
Validation Accuracy: 52.28%


#### Reduced features

In [53]:
# Load X and y data
with open('../data/standard_v1/X.npy', 'rb') as f:
    X = np.load(f)
with open('../data/standard_v1/y.npy', 'rb') as f:
    y = np.load(f)

# Filter data
# Standard filter
filt = get_filt_idx(df_filters['filt_std'])
X_filt = X[filt]
y_filt = y[filt]

# Removed attack backswing feature
X_filt = np.delete(X_filt,14,1)

# Filter to reduced features
X_filt = X_filt[:,[0,1,2,3,4,5,6,7,8,9,10,11,18]]

In [54]:
# Training/validation/test data split
np.random.seed(10)
idxs = np.random.permutation(len(y_filt))
split_va, split_te = int(0.64 * len(y_filt)), int(0.8 * len(y_filt)) #64% training, 16% validation, 20% test
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
X_tr, X_va, X_te = X_filt[idx_tr], X_filt[idx_va], X_filt[idx_te]
y_tr, y_va, y_te = y_filt[idx_tr], y_filt[idx_va], y_filt[idx_te]

print(f'Training data: {np.round(len(y_tr)/len(y_filt),2)*100}%')
print(f'Validation data: {np.round(len(y_va)/len(y_filt),2)*100}%')
print(f'Test data: {np.round(len(y_te)/len(y_filt),2)*100}%')

Training data: 64.0%
Validation data: 16.0%
Test data: 20.0%


In [55]:
# Fit model
regr = linear_model.LogisticRegression(penalty='l1',
    solver='saga',  # or 'liblinear'
    C=10000)
regr = linear_model.LogisticRegression()
regr.fit(X_tr, y_tr)

In [56]:
y_pred_tr = regr.predict(X_tr)
y_pred_va = regr.predict(X_va)
print(f'Train Accuracy: {np.round(metrics.accuracy_score(y_tr, y_pred_tr)*100,2)}%')
print(f'Validation Accuracy: {np.round(metrics.accuracy_score(y_va, y_pred_va)*100,2)}%')

Train Accuracy: 51.72%
Validation Accuracy: 51.75%


#### MMR Ranges

In [59]:
# Load X and y data
with open('../data/standard_v1/X.npy', 'rb') as f:
    X = np.load(f)
with open('../data/standard_v1/y.npy', 'rb') as f:
    y = np.load(f)

# Removed attack backswing feature
X = np.delete(X,14,1)

In [62]:
for group in range(1,7):
    # Filter data: standard filter + MMR
    filt = get_filt_idx(df_filters['filt_std'].values & df_filters[f'filt_mmr_{group}'].values)
    X_filt = X[filt]
    y_filt = y[filt]
    # Training/validation/test data split
    np.random.seed(10)
    idxs = np.random.permutation(len(y_filt))
    split_va, split_te = int(0.7 * len(y_filt)), int(1.0 * len(y_filt)) #70% training, 30% validation
    idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
    X_tr, X_va, X_te = X_filt[idx_tr], X_filt[idx_va], X_filt[idx_te]
    y_tr, y_va, y_te = y_filt[idx_tr], y_filt[idx_va], y_filt[idx_te]

    # Fit model
    regr = linear_model.LogisticRegression(penalty='l1',
        solver='saga',  # or 'liblinear'
        C=10000)
    regr = linear_model.LogisticRegression()
    regr.fit(X_tr, y_tr)

    y_pred_tr = regr.predict(X_tr)
    y_pred_va = regr.predict(X_va)
    print(f'MMR group {group}')
    print(f'Train Accuracy: {np.round(metrics.accuracy_score(y_tr, y_pred_tr)*100,2)}%')
    print(f'Validation Accuracy: {np.round(metrics.accuracy_score(y_va, y_pred_va)*100,2)}%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MMR group 1
Train Accuracy: 52.57%
Validation Accuracy: 52.43%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MMR group 2
Train Accuracy: 52.42%
Validation Accuracy: 52.5%
MMR group 3
Train Accuracy: 52.4%
Validation Accuracy: 52.3%
MMR group 4
Train Accuracy: 52.26%
Validation Accuracy: 52.19%
MMR group 5
Train Accuracy: 51.9%
Validation Accuracy: 51.9%
MMR group 6
Train Accuracy: 51.33%
Validation Accuracy: 51.4%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Duration Ranges

In [65]:
# Load X and y data
with open('../data/standard_v1/X.npy', 'rb') as f:
    X = np.load(f)
with open('../data/standard_v1/y.npy', 'rb') as f:
    y = np.load(f)

# Removed attack backswing feature
X = np.delete(X,14,1)

In [68]:
for group in range(1,7):
    # Filter data: standard filter + duration
    filt = get_filt_idx(df_filters['filt_std'].values & df_filters[f'filt_duration_{group}'].values)
    X_filt = X[filt]
    y_filt = y[filt]
    # Training/validation/test data split
    np.random.seed(10)
    idxs = np.random.permutation(len(y_filt))
    split_va, split_te = int(0.7 * len(y_filt)), int(1.0 * len(y_filt)) #70% training, 30% validation
    idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
    X_tr, X_va, X_te = X_filt[idx_tr], X_filt[idx_va], X_filt[idx_te]
    y_tr, y_va, y_te = y_filt[idx_tr], y_filt[idx_va], y_filt[idx_te]

    # Fit model
    regr = linear_model.LogisticRegression(penalty='l1',
        solver='saga',  # or 'liblinear'
        C=10000)
    regr = linear_model.LogisticRegression()
    regr.fit(X_tr, y_tr)

    y_pred_tr = regr.predict(X_tr)
    y_pred_va = regr.predict(X_va)
    print(f'Duration group {group} ... tr: {len(y_tr)} /// va: {len(y_va)}')
    print(f'Train Accuracy: {np.round(metrics.accuracy_score(y_tr, y_pred_tr)*100,2)}%')
    print(f'Validation Accuracy: {np.round(metrics.accuracy_score(y_va, y_pred_va)*100,2)}%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Duration group 1 ... tr: 916706 /// va: 392874
Train Accuracy: 56.45%
Validation Accuracy: 56.45%
Duration group 2 ... tr: 1756997 /// va: 752999
Train Accuracy: 53.39%
Validation Accuracy: 53.37%
Duration group 3 ... tr: 1723446 /// va: 738620
Train Accuracy: 52.37%
Validation Accuracy: 52.26%
Duration group 4 ... tr: 1450533 /// va: 621657
Train Accuracy: 52.59%
Validation Accuracy: 52.5%
Duration group 5 ... tr: 802573 /// va: 343961
Train Accuracy: 52.67%
Validation Accuracy: 52.6%
Duration group 6 ... tr: 458005 /// va: 196289
Train Accuracy: 52.65%
Validation Accuracy: 52.54%
