In [1]:
%matplotlib inline

import itertools as it

import numpy as np
from sklearn.linear_model import LogisticRegression
#from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import matplotlib.pyplot as plt

import sys
sys.path.append('..')

from helpers.helpers import load_matches, HERO_ID_MAP, HERO_COUNT

In [2]:
matches = load_matches()

matches_train, matches_test = train_test_split(matches, test_size=.1)

len(matches)

30227

In [3]:
synergy_all_tbl = np.zeros((HERO_COUNT, HERO_COUNT), np.int32)
synergy_win_tbl = np.zeros((HERO_COUNT, HERO_COUNT), np.int32)
counter_all_tbl = np.zeros((HERO_COUNT, HERO_COUNT), np.int32)
counter_win_tbl = np.zeros((HERO_COUNT, HERO_COUNT), np.int32)

for idx, match in enumerate(matches_train):
    heros = [HERO_ID_MAP[pl.hero_id] for pl in match.players]

    for i, j in it.combinations(range(0, 5), 2):
        synergy_all_tbl[heros[i], heros[j]] += 1
        if match.radiant_win:
            synergy_win_tbl[heros[i], heros[j]] += 1

    for i, j in it.combinations(range(5, 10), 2):
        synergy_all_tbl[heros[i], heros[j]] += 1
        if not match.radiant_win:
            synergy_win_tbl[heros[i], heros[j]] += 1

    for i, j in it.combinations_with_replacement(range(0, 5), 2):
        counter_all_tbl[heros[i], heros[j+5]] += 1
        if match.radiant_win:
            counter_win_tbl[heros[i], heros[j+5]] += 1

with np.errstate(divide='ignore', invalid='ignore'):
    synergy_tbl = synergy_win_tbl / synergy_all_tbl
    synergy_tbl[~np.isfinite(synergy_tbl)] = .5
    counter_tbl = counter_win_tbl / counter_all_tbl
    counter_tbl[~np.isfinite(counter_tbl)] = .5

In [4]:
def match_to_sample(match, sample):
    heros = [HERO_ID_MAP[pl.hero_id] for pl in match.players]

    for hero in heros[:5]:
        sample[hero] = 1

    for hero in heros[5:]:
        sample[HERO_COUNT + hero] = 1
    
    radiant_synergy = 0
    for i, j in it.combinations(range(0, 5), 2):
        radiant_synergy += synergy_tbl[heros[i], heros[j]]

    dire_synergy = 0
    for i, j in it.combinations(range(5, 10), 2):
        dire_synergy += synergy_tbl[heros[i], heros[j]]

    sample[HERO_COUNT * 2] = radiant_synergy - dire_synergy

    radiant_counter = 0
    for i, j in it.combinations_with_replacement(range(0, 5), 2):
        radiant_counter += counter_tbl[heros[i], heros[j+5]]

    sample[HERO_COUNT * 2 + 1] = radiant_counter

def transform_matches(matches):
    X = np.zeros((len(matches), HERO_COUNT * 2 + 2))
    y = np.empty(len(matches), np.bool)

    for idx, match, sample in zip(it.count(), matches, X):
        match_to_sample(match, sample)
        y[idx] = match.radiant_win
    
    return preprocessing.scale(X), y

In [5]:
X_train, y_train = transform_matches(matches_train)
X_test, y_test = transform_matches(matches_test)

In [6]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
accuracy_score(y_test, logistic.predict(X_test))

0.59080383724776708

In [8]:
def plot_learning_curve(estimator, X, y):
    plt.figure()
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y)
    
    train_scores_mean = np.mean(train_scores, 1)
    train_scores_std = np.std(train_scores, 1)
    test_scores_mean = np.mean(test_scores, 1)
    test_scores_std = np.std(test_scores, 1)
    
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    
    return plt

#plot_learning_curve(LogisticRegression(), X_train, y_train)

#plt.show()