# Scrubby Pubs - Win Predictor for Dota2

In [4]:
import pandas as pd
import numpy as np
import csv as csv
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split
from datetime import datetime
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import tree, neighbors, linear_model
import matplotlib.pyplot as plt
import warnings

# To suppress some deprecation warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Loading the Match Data
Columns:
* match_id : each match has a unique match_id 
* radiant_win [what we are trying to predict] : True if Radiant (team 1) won, False if Radiant lost
* duration : match length (not used)
* hero_x_id : id of the xth hero in the match (10 heroes per match; 1-5 are Radiant (team 1), 6-10 are Dire (team 2))
* hero_x_feature : concrete values earned by xth hero in regards to a specific feature (gpm, xpm, kills, etc.)

In [6]:
# Column layout applied to data4.csv: three match-level fields, then ten
# per-hero columns (hero_1 .. hero_10) for each statistic in turn.
match_level_columns = ['match_id', 'radiant_win', 'duration']
per_hero_stats = ['id', 'gpm', 'xpm', 'kills', 'deaths', 'assists']
per_hero_columns = ['hero_%d_%s' % (slot, stat)
                    for stat in per_hero_stats
                    for slot in range(1, 11)]

data_df = pd.read_csv('data4.csv', header=0)
# Replace whatever header the file carries with the names used below.
data_df.columns = match_level_columns + per_hero_columns

print(data_df.shape)
data_df.head()

(10957, 63)


Unnamed: 0,match_id,team_1_win,duration,hero_1_id,hero_2_id,hero_3_id,hero_4_id,hero_5_id,hero_6_id,hero_7_id,...,hero_1_assists,hero_2_assists,hero_3_assists,hero_4_assists,hero_5_assists,hero_6_assists,hero_7_assists,hero_8_assists,hero_9_assists,hero_10_assists
0,1885069276,True,3038,4,70,43,100,68,67,41,...,12,13,18,17,21,22,11,18,17,19
1,1885069277,False,1576,13,95,47,73,52,14,53,...,7,1,5,2,2,11,4,13,6,10
2,1885069278,True,3022,28,21,52,74,112,5,56,...,17,15,26,17,16,30,11,13,11,15
3,1885069280,True,3270,102,86,21,11,44,22,41,...,12,13,12,10,12,15,10,5,11,10
4,1885069281,False,3094,5,46,4,68,74,41,102,...,22,10,7,18,10,11,12,20,20,19


# Loading the Hero Data
Columns:
* id : Hero's unique id 
* name : Hero's unique name
* attributes [carry...initiator] : Scores in the range 0-3 that describe how "good" a hero is at a specific role, 3 meaning best

In [11]:
# Identity columns followed by the nine role scores, in file order.
hero_identity_columns = ['id', 'name']
hero_role_columns = ['carry', 'support', 'nuker', 'disabler', 'jungler',
                     'durable', 'escape', 'pusher', 'initiator']

hero_df = pd.read_csv('heroes_data.csv', header=0)
# Overwrite the file's own header with the names used by later cells.
hero_df.columns = hero_identity_columns + hero_role_columns

print(hero_df.shape)
hero_df.head()

(112, 11)


Unnamed: 0,id,name,carry,support,nuker,disabler,jungler,durable,escape,pusher,initiator
0,0,,0,0,0,0,0,0,0,0,0
1,1,Anti-Mage,3,0,1,0,0,0,3,0,0
2,2,Axe,0,0,0,2,2,3,0,0,3
3,3,Bane,0,2,1,3,0,1,0,0,0
4,4,Bloodseeker,1,0,1,1,1,0,0,0,1


# Adding Some Hero Features
We loop through all of our data to calculate values for each Hero.

* Winrate : A hero's win probability (won games / total games)
* Experience [earned] per Minute (xpm) : A Hero's average xpm over all games
* Gold [earned] per Minute (gpm) : A Hero's  average gpm over all games
* Kills : Average number of kills a Hero gets (larger numbers are better)
* Deaths : Average number of times a Hero dies (smaller numbers are better)
* Assists : Average number of assists a Hero gets

In [None]:
def calculate_winrate(hero_id, matches=None):
    """Return the fraction of matches containing ``hero_id`` that its team won.

    Parameters
    ----------
    hero_id : value looked up in the ``hero_1_id`` .. ``hero_10_id`` columns.
    matches : optional DataFrame holding those ten columns plus
        ``radiant_win``. Defaults to the notebook-level ``data_df``.

    Returns
    -------
    ``wins / games`` as a float, or ``0`` when the hero appears in no match.
    A match is a win when the hero sits in slots 1-5 and ``radiant_win`` is
    truthy, or sits in slots 6-10 and ``radiant_win`` is falsy.
    """
    if matches is None:
        matches = data_df

    radiant_slots = ['hero_%d_id' % slot for slot in range(1, 6)]
    dire_slots = ['hero_%d_id' % slot for slot in range(6, 11)]

    # Whole-column comparisons instead of a Python-level loop over every row;
    # this function is called once per hero, so the row loop dominated runtime.
    on_radiant = (matches[radiant_slots] == hero_id).any(axis=1)
    on_dire = (matches[dire_slots] == hero_id).any(axis=1)
    radiant_won = matches['radiant_win'].astype(bool)

    games = int((on_radiant | on_dire).sum())
    if games == 0:
        return 0

    wins = int(((on_radiant & radiant_won) | (on_dire & ~radiant_won)).sum())
    # float() keeps this a true division even under Python 2.
    return float(wins) / games

In [None]:
def calculate_averageHeroAttribute(attribute, hero_id, matches=None):
    """Return the mean of a per-hero match statistic over a hero's games.

    Parameters
    ----------
    attribute : statistic suffix, used to build ``hero_<slot>_<attribute>``
        column names (e.g. ``'gpm'``).
    hero_id : value looked up in the ``hero_1_id`` .. ``hero_10_id`` columns.
    matches : optional DataFrame holding the id and statistic columns.
        Defaults to the notebook-level ``data_df``.

    Returns
    -------
    Sum of the statistic over the hero's games divided by the number of
    games, as a float; ``0`` when the hero appears in no match. If a row
    lists the hero in several slots, only the first slot is used.
    """
    if matches is None:
        matches = data_df

    attribute_total = 0
    # Rows already credited, so each match contributes at most once.
    counted = pd.Series(False, index=matches.index, dtype=bool)

    # One vectorized pass per slot instead of a Python loop over every row.
    for slot in range(1, 11):
        first_hit = (matches['hero_%d_id' % slot] == hero_id) & ~counted
        attribute_total += matches.loc[first_hit, 'hero_%d_%s' % (slot, attribute)].sum()
        counted |= first_hit

    games = int(counted.sum())
    if games == 0:
        return 0
    # float() keeps this a true division even under Python 2.
    return float(attribute_total) / games

In [None]:
# Per-hero aggregates, each computed by scanning the match table once per hero.
hero_df['winrate'] = hero_df.apply(lambda row: calculate_winrate(row['id']), axis=1)

# The averaging helper defined in this notebook is calculate_averageHeroAttribute;
# calling any other name here raises NameError.
hero_df['xpm']     = hero_df.apply(lambda row: calculate_averageHeroAttribute('xpm', row['id']), axis=1)
hero_df['gpm']     = hero_df.apply(lambda row: calculate_averageHeroAttribute('gpm', row['id']), axis=1)
hero_df['kills']   = hero_df.apply(lambda row: calculate_averageHeroAttribute('kills', row['id']), axis=1)
hero_df['deaths']  = hero_df.apply(lambda row: calculate_averageHeroAttribute('deaths', row['id']), axis=1)
hero_df['assists'] = hero_df.apply(lambda row: calculate_averageHeroAttribute('assists', row['id']), axis=1)

#hero_df.to_csv('heroes_data_extended.csv')

# Index the hero rows by id so later cells can look a hero up directly:
# hero_map[hero_id][column] -> value.
hero_map = {hero['id']: hero for _, hero in hero_df.iterrows()}

# Creating and Extracting New Features
Calculating Team's score for each attribute (carry, support, gpm, kills, etc):
* total : Team's total value (sum) for an attribute
* max : Team's max value (hero with highest value) for an attribute
* min : Team's min value (hero with lowest value) for an attribute
* std : Team's standard deviation for an attribute
* mean : Team's average for an attribute

In [None]:
def helper_function(feature, h1, h2, h3, h4, h5):
    """Collect one feature for five heroes.

    Each of h1..h5 is used as a key into the notebook-level ``hero_map``;
    the result is the five ``hero_map[h][feature]`` values in argument order.
    """
    return [hero_map[hero_id][feature] for hero_id in (h1, h2, h3, h4, h5)]

def calculate_feature_total(feature, h1, h2, h3, h4, h5):
    """Sum of ``feature`` across the five given heroes."""
    team_values = helper_function(feature, h1, h2, h3, h4, h5)
    return sum(team_values)

def calculate_feature_max(feature, h1, h2, h3, h4, h5):
    """Largest value of ``feature`` among the five given heroes."""
    team_values = helper_function(feature, h1, h2, h3, h4, h5)
    return max(team_values)

def calculate_feature_min(feature, h1, h2, h3, h4, h5):
    """Smallest value of ``feature`` among the five given heroes."""
    team_values = helper_function(feature, h1, h2, h3, h4, h5)
    return min(team_values)

def calculate_feature_std(feature, h1, h2, h3, h4, h5):
    """Standard deviation of ``feature`` over the five heroes (``np.std`` defaults)."""
    team_values = helper_function(feature, h1, h2, h3, h4, h5)
    return np.std(team_values)

def calculate_feature_mean(feature, h1, h2, h3, h4, h5):
    """Team total of ``feature`` divided by the fixed team size of five."""
    team_total = calculate_feature_total(feature, h1, h2, h3, h4, h5)
    return team_total/5

features = ['carry', 'support', 'nuker', 'disabler', 'jungler', 'durable', 'escape', 'pusher', 'initiator', 'xpm', 'gpm', 'kills', 'deaths', 'assists']

# (column prefix, hero-id columns) for each side of the match.
team_slots = [
    ('radiant_', ['hero_1_id', 'hero_2_id', 'hero_3_id', 'hero_4_id', 'hero_5_id']),
    ('dire_',    ['hero_6_id', 'hero_7_id', 'hero_8_id', 'hero_9_id', 'hero_10_id']),
]

# (column suffix, aggregation function) applied to every feature.
aggregators = [
    ('_total', calculate_feature_total),
    ('_max',   calculate_feature_max),
    ('_min',   calculate_feature_min),
    ('_mean',  calculate_feature_mean),
    ('_std',   calculate_feature_std),
]

# `col` collects the engineered column names; it is the feature list the
# models are trained on. Building the name once and using it for both the
# DataFrame column and `col` keeps the two from drifting apart.
col = []
for feature in features:
    for prefix, id_columns in team_slots:
        for suffix, aggregate in aggregators:
            column_name = prefix + feature + suffix
            # The lambda is consumed by apply() immediately, so capturing the
            # loop variables by reference is safe here.
            data_df[column_name] = data_df.apply(
                lambda row: aggregate(feature, *[row[c] for c in id_columns]),
                axis=1)
            col.append(column_name)

data_df.head()

# Split Dataset into 2/3s train 1/3 test
Now that we have all the features we want, we can split the data (splitting only now means we don't have to re-add the features to each subset later).

In [None]:
# Hold out one third of the matches for testing and train on the rest.
# random_state is fixed so the partition is the same on every run.
held_out_fraction = 1.0 / 3
train_df, test_df = train_test_split(data_df, test_size=held_out_fraction, random_state=0)

## cross validation for model checking

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import tree, neighbors, linear_model

score = 'precision'

print("# Tuning hyper-parameters for %s" % score)

# Set the parameters by cross-validation

#parameters for K-nearest neighbors
#tuned_parametersKN = [{'n_neighbors': []}]

#parameters for Decision Tree
#tuned_parametersDT = [{'min_samples_split': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000, 1000000] }]

#use 1 through 100 for k values
#for n in range(1, 25):
 #   tuned_parametersKN[0]['n_neighbors'].append(n)

#parameters for LogisticRegression()
# scikit-learn documents the penalty names in lowercase ('l1' / 'l2').
tuned_parametersLR = [{'C': [1, 10, 100, 1000, 10000, 1000000], 'penalty': ['l1']}]

# clf = GridSearchCV(SVC(C=1), tuned_parametersSVM, cv=5, scoring=score)
#clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parametersKN, cv=5, scoring=score)
# The solver is named explicitly (liblinear supports the l1 penalty) so the
# search uses the same solver as the final classifier cell.
clf = GridSearchCV(linear_model.LogisticRegression(solver='liblinear'), tuned_parametersLR, cv=5, scoring=score)
#clf = GridSearchCV(tree.DecisionTreeClassifier(), tuned_parametersDT, cv=5, scoring=score)

# Fit on the training split only; the test split stays untouched for the
# final evaluation.
clf.fit(train_df[col], train_df['radiant_win'])

print("Best parameters set found on development set:")
print()
print(clf.best_estimator_)
print()
print("Grid scores on development set:")
print()
# One line per parameter combination: mean CV score, half the standard
# deviation across folds, and the parameters themselves.
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() / 2, params))

In [None]:
# Scratch cell: prints a fixed marker string and touches no notebook state.
print("hi")

In [None]:
#classifier = neighbors.KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
          #  metric_params=None, n_neighbors=3, p=2, weights='uniform')

#classifier = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
       #     max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
        #    min_samples_split=1000, min_weight_fraction_leaf=0.0,
         #   random_state=None, splitter='best')

# L1-regularised logistic regression. scikit-learn documents the penalty
# names in lowercase, so the value must be 'l1'.
classifier = linear_model.LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

# Train on the engineered team features, then score on the held-out split.
classifier.fit(train_df[col], train_df['radiant_win'])

y_true, y_pred = test_df['radiant_win'], classifier.predict(test_df[col])

print(classification_report(y_true, y_pred))

In [None]:
# Show the fitted coefficients of the classifier trained above.
fitted_coefficients = classifier.coef_
print(fitted_coefficients)