In [3]:

import pickle
from importlib import reload

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import metrics

import dataset
reload(dataset)

In [4]:
# Combined data table (its row count is used later as the number of matches)
df = pd.read_csv('../data/combined.csv')

# Hero feature table, indexed by hero_id
df_features = pd.read_csv('../data/features.csv').set_index('hero_id')

# Standard filter table
df_filters = pd.read_csv('../models/filters.csv')

### Data generation

In [5]:
# Graph data already has the scaled features and match results.
# The graphs are stored in pickle files covering `step` matches each; every
# file is named after the inclusive match range it holds, and the last file
# stops at the final match.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced by this project.
graphs_dir = '../data/graphs_v1_scaled/'
total = len(df)
step = 50000

graphs = None
for start in range(0, total, step):
    # Inclusive end index, clamped so the final (shorter) file is named correctly
    end = min(start + step, total) - 1
    path = graphs_dir + f'graphs_v1_scaled_{start}-{end}.pkl'
    print(path)
    # The context manager closes each file as soon as its chunk is loaded
    with open(path, 'rb') as fh:
        chunk = pickle.load(fh)
    # First chunk initialises the dataset, later chunks are appended with `+`
    graphs = chunk if graphs is None else graphs + chunk

../data/graphs_v1_scaled/graphs_v1_scaled_0-49999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_50000-99999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_100000-149999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_150000-199999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_200000-249999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_250000-299999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_300000-349999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_350000-399999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_400000-449999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_450000-499999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_500000-549999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_550000-599999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_600000-649999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_650000-699999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_700000-749999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_750000-799999.pkl
../data/graphs_v1_scaled/graphs_v1_scaled_800000

In [6]:
def get_filt_idx(filt, step=50000):
    '''Return the graph indices selected by a per-match boolean filter.

    DotaV1 data handling: the graph dataset holds two graphs for every match,
    laid out in chunks of `step` matches (0-49999 radiant, 0-49999 dire,
    50000-99999 radiant, ...). Each chunk of the filter is therefore repeated
    twice before the positions of its truthy entries are collected.

    Example: with the default step, [True, False, True] is expanded to
    [True, False, True, True, False, True] and the result is [0, 2, 3, 5].

    Parameters
    ----------
    filt : array-like
        One truthy/falsy entry per match.
    step : int, optional
        Number of matches per chunk of the graph dataset (default 50000).

    Returns
    -------
    list of int
        Indices of the truthy entries in the expanded filter.

    Raises
    ------
    ValueError
        If `step` is not a positive integer.
    '''
    if step <= 0:
        raise ValueError('step must be a positive integer')

    filt = np.asarray(filt)
    if filt.size == 0:
        # Nothing to select; np.concatenate would reject an empty chunk list
        return []

    chunks = []
    for start in range(0, len(filt), step):
        chunk = filt[start:start + step]
        # Add the chunk twice, as matches are repeated every `step` graphs
        chunks.extend((chunk, chunk))

    # Vectorised lookup of the truthy positions, returned as plain Python ints
    return np.flatnonzero(np.concatenate(chunks)).tolist()

In [7]:
# Keep only the graphs selected by the standard filter column
filt = df_filters.loc[:, 'filt_std'].values
filt_idx = get_filt_idx(filt)
graphs_filt = graphs[filt_idx]
print('Standard filtering complete')

Standard filtering complete


In [8]:
# Remove attack_backswing feature: keep node-feature columns 0-19 except 14.
# assumes every graph's x starts with 20 columns - TODO confirm
n_features_raw = 20
backswing_col = 14
keep_cols = [col for col in range(n_features_raw) if col != backswing_col]

for i in range(0, len(graphs_filt)):
    graph = graphs_filt[i]
    # Graphs are modified in place, so skip any that already have the reduced
    # width; without this guard a second run of the cell raises IndexError.
    if graph.x.shape[1] == len(keep_cols):
        continue
    graph.x = graph.x[:, keep_cols]
print('Attack backswing feature removed')

Attack backswing feature removed


In [9]:
# Shuffle the filtered graphs with a fixed seed, then cut the permutation into
# 64% training, 16% validation and 20% test
np.random.seed(10)
n_graphs = len(graphs_filt)
idxs = np.random.permutation(n_graphs)

split_va = int(0.64 * n_graphs)  # first validation position
split_te = int(0.8 * n_graphs)   # first test position
idx_tr = idxs[:split_va]
idx_va = idxs[split_va:split_te]
idx_te = idxs[split_te:]

data_tr = graphs_filt[idx_tr]
data_va = graphs_filt[idx_va]
data_te = graphs_filt[idx_te]

# Report the realised share of each split
print(f'Training data: {np.round(len(data_tr)/len(graphs_filt),2)*100}%')
print(f'Validation data: {np.round(len(data_va)/len(graphs_filt),2)*100}%')
print(f'Test data: {np.round(len(data_te)/len(graphs_filt),2)*100}%')

Training data: 64.0%
Validation data: 16.0%
Test data: 20.0%


In [10]:
# Create training data from the training graph split:
# each row of X_tr is the column-wise mean of one graph's x,
# and y_tr stores that graph's y.
n_tr = len(data_tr)
X_tr = np.empty((n_tr, 19))
y_tr = np.empty(n_tr)

for i, graph in enumerate(data_tr):
    if not i % 100000:
        print(i)  # progress marker every 100000 graphs
    X_tr[i] = np.mean(graph.x, axis=0)
    y_tr[i] = graph.y

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000


In [11]:
# Persist the training arrays in the working directory
np.save('X_tr.npy', X_tr)
np.save('y_tr.npy', y_tr)

In [12]:
# Validation: each row of X_va is the column-wise mean of one graph's x,
# and y_va stores that graph's y.
n_va = len(data_va)
X_va = np.empty((n_va, 19))
y_va = np.empty(n_va)

for i, graph in enumerate(data_va):
    if not i % 100000:
        print(i)  # progress marker every 100000 graphs
    X_va[i] = np.mean(graph.x, axis=0)
    y_va[i] = graph.y

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000


In [13]:
# Persist the validation arrays in the working directory
np.save('X_va.npy', X_va)
np.save('y_va.npy', y_va)

In [14]:
# Test: each row of X_te is the column-wise mean of one graph's x,
# and y_te stores that graph's y.
n_te = len(data_te)
X_te = np.empty((n_te, 19))
y_te = np.empty(n_te)

for i, graph in enumerate(data_te):
    if not i % 100000:
        print(i)  # progress marker every 100000 graphs
    X_te[i] = np.mean(graph.x, axis=0)
    y_te[i] = graph.y

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000


In [15]:
# Persist the test arrays in the working directory
np.save('X_te.npy', X_te)
np.save('y_te.npy', y_te)

### Modelling

In [18]:
# Fit a logistic regression with scikit-learn's default settings on the
# training arrays.
# NOTE(review): an L1-penalised estimator (penalty='l1', solver='saga' or
# 'liblinear', C=10000) used to be constructed here but was overwritten by the
# default estimator before fitting, so it never influenced the model. It has
# been dropped as dead code; reinstate it deliberately if L1 was intended.
regr = linear_model.LogisticRegression()
regr.fit(X_tr, y_tr)

5078155.0

In [None]:
# Score the fitted model on the training and test splits.
# Uses the arrays the model was actually fitted on (X_tr / y_tr) and their
# test counterparts (X_te / y_te); the previously referenced X_train_s,
# X_test_s, y_train and y_test are never defined in this notebook.
y_pred_test = regr.predict(X_te)
y_pred_train = regr.predict(X_tr)
print(f'Train Accuracy: {metrics.accuracy_score(y_tr, y_pred_train)}')
print(f'Test Accuracy: {metrics.accuracy_score(y_te, y_pred_test)}')