In [4]:

import pandas as pd
import numpy as np
import csv as csv
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split
from datetime import datetime
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import tree, neighbors, linear_model
import matplotlib.pyplot as plt

# To suppress some deprecation warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [5]:
# Load the match history. Each row is one match: its id, whether radiant won,
# the duration, and ten hero-id slots (hero_1_id .. hero_10_id).
match_columns = ['match_id', 'radiant_win', 'duration'] + [
    'hero_%d_id' % slot for slot in range(1, 11)
]

data_df = pd.read_csv('all-data.csv', header=0)
# Overwrite whatever header names the file carries with the canonical ones.
data_df.columns = match_columns

print(data_df.shape)
data_df.head()

(7991, 13)


Unnamed: 0,match_id,radiant_win,duration,hero_1_id,hero_2_id,hero_3_id,hero_4_id,hero_5_id,hero_6_id,hero_7_id,hero_8_id,hero_9_id,hero_10_id
0,1729093401,False,2549,73,27,86,8,7,41,36,60,30,25
1,1729093402,False,2960,41,100,52,105,14,15,53,11,104,25
2,1729093430,True,2580,104,51,46,21,55,68,44,70,57,74
3,1729093432,False,2409,35,70,41,25,105,6,73,17,68,21
4,1729093433,False,3272,112,9,25,21,2,98,101,22,71,8


In [6]:
# Load the hero data: one row per hero with its id, name and a score for each
# role column listed below.
hero_df = pd.read_csv('heroes_data.csv', header=0)
hero_df.columns = ['id', 'name', 'carry', 'support', 'nuker', 'disabler', 'jungler', 'durable', 'escape', 'pusher', 'initiator']

# Index the hero rows by hero id so that a role score can be looked up as
# hero_map[hero_id][role]. iterrows() yields (index, row) pairs; only the row
# is needed, so the index is discarded.
hero_map = {hero_row['id']: hero_row for _, hero_row in hero_df.iterrows()}

# CREATING/EXTRACTING A NEW FEATURE

In [7]:
def helper_function(feature, h1, h2, h3, h4, h5):
    """Return the `feature` value of each of the five heroes, in argument order.

    Every hero id is looked up in the module-level ``hero_map`` and the entry's
    ``feature`` field is collected into a list of length five. A missing hero id
    or feature name propagates as the lookup's own ``KeyError``.
    """
    team_hero_ids = (h1, h2, h3, h4, h5)
    return [hero_map[hero_id][feature] for hero_id in team_hero_ids]

def calculate_feature_total(feature, h1, h2, h3, h4, h5):
    """Return the sum of `feature` over the five given heroes."""
    per_hero_values = helper_function(feature, h1, h2, h3, h4, h5)
    return sum(per_hero_values)

def calculate_feature_max(feature, h1, h2, h3, h4, h5):
    """Return the largest `feature` value among the five given heroes."""
    per_hero_values = helper_function(feature, h1, h2, h3, h4, h5)
    return max(per_hero_values)

def calculate_feature_min(feature, h1, h2, h3, h4, h5):
    """Return the smallest `feature` value among the five given heroes."""
    per_hero_values = helper_function(feature, h1, h2, h3, h4, h5)
    return min(per_hero_values)

def calculate_feature_std(feature, h1, h2, h3, h4, h5):
    """Return the standard deviation of `feature` over the five given heroes.

    Uses ``np.std`` with its default arguments.
    """
    per_hero_values = helper_function(feature, h1, h2, h3, h4, h5)
    return np.std(per_hero_values)

def calculate_feature_mean(feature, h1, h2, h3, h4, h5):
    """Return the average of `feature` over the five given heroes.

    The divisor is the fixed team size of five.
    """
    team_total = calculate_feature_total(feature, h1, h2, h3, h4, h5)
    return team_total / 5

features = ['carry', 'support', 'nuker', 'disabler', 'jungler', 'durable', 'escape', 'pusher', 'initiator']

# Which hero-id columns make up each team. Radiant is listed first so the
# generated columns come out radiant-then-dire for every role.
team_hero_columns = [
    ('radiant', ['hero_1_id', 'hero_2_id', 'hero_3_id', 'hero_4_id', 'hero_5_id']),
    ('dire', ['hero_6_id', 'hero_7_id', 'hero_8_id', 'hero_9_id', 'hero_10_id']),
]

# Aggregations computed per (team, role); list order fixes the column order.
feature_aggregators = [
    ('total', calculate_feature_total),
    ('max', calculate_feature_max),
    ('min', calculate_feature_min),
    ('mean', calculate_feature_mean),
    ('std', calculate_feature_std),
]

# Names of every engineered column, in creation order; used as the model inputs.
col = []
for feature in features:
    for team, hero_columns in team_hero_columns:
        for stat_name, aggregate in feature_aggregators:
            column_name = team + '_' + feature + '_' + stat_name
            # The lambda is invoked immediately by apply(), so it always sees
            # the current feature / hero_columns / aggregate of this iteration.
            data_df[column_name] = data_df.apply(
                lambda row: aggregate(feature, *[row[hero_col] for hero_col in hero_columns]),
                axis=1)
            col.append(column_name)

# Split the dataset: 2/3 train, 1/3 test (fixed seed for a reproducible split).
train_df, test_df = train_test_split(
    data_df, test_size=1.0 / 3, random_state=0)

# (starting with simple sanity check)

In [8]:
# Sanity check: preview the first rows of data_df, which now also carries the
# engineered radiant_* / dire_* aggregate columns.
data_df.head()


Unnamed: 0,match_id,radiant_win,duration,hero_1_id,hero_2_id,hero_3_id,hero_4_id,hero_5_id,hero_6_id,hero_7_id,...,radiant_initiator_total,radiant_initiator_max,radiant_initiator_min,radiant_initiator_mean,radiant_initiator_std,dire_initiator_total,dire_initiator_max,dire_initiator_min,dire_initiator_mean,dire_initiator_std
0,1729093401,False,2549,73,27,86,8,7,41,36,...,5,3,0,1.0,1.095445,5,3,0,1.0,1.264911
1,1729093402,False,2960,41,100,52,105,14,15,53,...,7,3,0,1.4,1.2,1,1,0,0.2,0.4
2,1729093430,True,2580,104,51,46,21,55,68,44,...,5,3,0,1.0,1.095445,0,0,0,0.0,0.0
3,1729093432,False,2409,35,70,41,25,105,6,73,...,3,3,0,0.6,1.2,2,1,0,0.4,0.489898
4,1729093433,False,3272,112,9,25,21,2,98,101,...,3,3,0,0.6,1.2,2,2,0,0.4,0.8


In [9]:
from sklearn import datasets
from sklearn import tree, neighbors, linear_model
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Sanity check: fit a 4-nearest-neighbours classifier on the training split and
# report how well it reproduces the labels of that same split. Because the
# predictions are made on the rows the model was fitted on, these numbers are
# training scores, not held-out performance.
knn_settings = dict(
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    metric_params=None,
    n_neighbors=4,
    p=2,
    weights='uniform',
)
classifier = neighbors.KNeighborsClassifier(**knn_settings)

train_features = train_df[col]
classifier.fit(train_features, train_df['radiant_win'])

y_true = train_df['radiant_win']
y_pred = classifier.predict(train_features)

print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

      False       0.64      0.86      0.73      2556
       True       0.81      0.55      0.65      2771

avg / total       0.72      0.70      0.69      5327



## cross validation for model checking

In [20]:
from sklearn.grid_search import GridSearchCV

# Metric the grid search optimises.
score = 'precision'

print("# Tuning hyper-parameters for %s" % score)

# Candidate grids. Only the logistic-regression grid is active; the others are
# kept (commented out) as the alternatives that were tried.

#parameters for K-nearest neighbors
#tuned_parametersKN = [{'n_neighbors': []}]

#parameters for Decision Tree
#tuned_parametersDT = [{'min_samples_split': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 1000000] }]

#use 1 through 24 for k values
#for n in range(1, 25):
 #   tuned_parametersKN[0]['n_neighbors'].append(n)

#parameters for LogisticRegression(): values of C to try, with an L1 penalty
c_candidates = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 1000000]
tuned_parametersLR = [{'C': c_candidates, 'penalty': ['L1']}]

# 5-fold cross-validated search over the active grid.
# clf = GridSearchCV(SVC(C=1), tuned_parametersSVM, cv=5, scoring=score)
#clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parametersKN, cv=5, scoring=score)
#clf = GridSearchCV(tree.DecisionTreeClassifier(), tuned_parametersDT, cv=5, scoring=score)
clf = GridSearchCV(
    estimator=linear_model.LogisticRegression(),
    param_grid=tuned_parametersLR,
    cv=5,
    scoring=score,
)

clf.fit(train_df[col], train_df['radiant_win'])

print("Best parameters set found on development set:")
print()
print(clf.best_estimator_)
print()
print("Grid scores on development set:")
print()
# One line per grid point: mean CV score, half the std of the fold scores, params.
for grid_entry in clf.grid_scores_:
    candidate_params, cv_mean, fold_scores = grid_entry
    half_spread = fold_scores.std() / 2
    print("%0.3f (+/-%0.03f) for %r"
          % (cv_mean, half_spread, candidate_params))

# Tuning hyper-parameters for precision
Best parameters set found on development set:

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='L1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

Grid scores on development set:

0.000 (+/-0.000) for {'penalty': 'L1', 'C': 1e-05}
0.000 (+/-0.000) for {'penalty': 'L1', 'C': 0.0001}
0.000 (+/-0.000) for {'penalty': 'L1', 'C': 0.001}
0.544 (+/-0.004) for {'penalty': 'L1', 'C': 0.01}
0.555 (+/-0.003) for {'penalty': 'L1', 'C': 0.1}
0.555 (+/-0.006) for {'penalty': 'L1', 'C': 1}
0.555 (+/-0.006) for {'penalty': 'L1', 'C': 10}
0.554 (+/-0.006) for {'penalty': 'L1', 'C': 100}
0.554 (+/-0.006) for {'penalty': 'L1', 'C': 1000}
0.554 (+/-0.006) for {'penalty': 'L1', 'C': 10000}
0.554 (+/-0.006) for {'penalty': 'L1', 'C': 1000000}


In [21]:
# Final evaluation: refit the chosen model on the training split and score it
# on the held-out test split. The commented-out estimators are the alternatives
# that were tried.

#classifier = neighbors.KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
          #  metric_params=None, n_neighbors=3, p=2, weights='uniform')

#classifier = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
       #     max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
        #    min_samples_split=1000, min_weight_fraction_leaf=0.0,
         #   random_state=None, splitter='best')

# Logistic regression with the settings reported as best by the grid search.
# NOTE(review): newer scikit-learn releases spell the penalty 'l1' -- confirm
# against the installed version before changing it.
classifier = linear_model.LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='L1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

# Use `col`, the engineered-feature list the grid search was tuned on. The
# previous `col2` is not defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel.
classifier.fit(train_df[col], train_df['radiant_win'])

y_true, y_pred = test_df['radiant_win'], classifier.predict(test_df[col])

print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

      False       0.56      0.45      0.50      1332
       True       0.54      0.65      0.59      1332

avg / total       0.55      0.55      0.55      2664

