In [70]:
#import sys
#!{sys.executable} -m pip install SelectKBest

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [71]:
col_names = ['win', 'region', 'mode', 'type']
num_arr = np.arange(113).astype(str)
all_names = np.concatenate((col_names, num_arr), axis = 0)

dota_data = pd.read_csv("dota2Test.csv", header=None, names=all_names)

In [72]:
dota_data.head(5)

Unnamed: 0,win,region,mode,type,0,1,2,3,4,5,...,103,104,105,106,107,108,109,110,111,112
0,-1,223,8,2,0,-1,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,1,227,8,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
2,-1,136,2,2,1,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,227,2,2,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,184,2,3,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
dota_data.shape

(10294, 117)

In [74]:
#Promena vrednosti -1 u 2 u svim celijama
dota_data = dota_data.replace(-1, 2)

#Brisanje duplikata
dota_data = dota_data.drop_duplicates()

In [75]:
#Koliko se koji heroj pojavljuje u mecevima
#Moze se reci da se heroji pojavljuju skoro podjednako u timu 1 i 2, i da je njihovo ukupno pojavljivanje priblizno jednako sa izuzecima manjeg pojavljivanja
heroes = dota_data.iloc[:,4:]
heroes_counts=[]
for x in num_arr:
    heroes_counts.append(heroes[x].value_counts())
    
heroes_counts

[0    8662
 2     824
 1     808
 Name: 0, dtype: int64,
 0    8062
 1    1131
 2    1101
 Name: 1, dtype: int64,
 0    9988
 2     154
 1     152
 Name: 2, dtype: int64,
 0    9052
 2     621
 1     621
 Name: 3, dtype: int64,
 0    9153
 2     601
 1     540
 Name: 4, dtype: int64,
 0    7925
 1    1188
 2    1181
 Name: 5, dtype: int64,
 0    8986
 1     668
 2     640
 Name: 6, dtype: int64,
 0    7687
 2    1329
 1    1278
 Name: 7, dtype: int64,
 0    6760
 1    1800
 2    1734
 Name: 8, dtype: int64,
 0    9212
 2     558
 1     524
 Name: 9, dtype: int64,
 0    9145
 2     577
 1     572
 Name: 10, dtype: int64,
 0    9496
 2     416
 1     382
 Name: 11, dtype: int64,
 0    9728
 2     283
 1     283
 Name: 12, dtype: int64,
 0    7110
 1    1593
 2    1591
 Name: 13, dtype: int64,
 0    9845
 1     229
 2     220
 Name: 14, dtype: int64,
 0    9018
 1     649
 2     627
 Name: 15, dtype: int64,
 0    9003
 1     653
 2     638
 Name: 16, dtype: int64,
 0    8457
 2     919
 1

In [76]:
#Broj null podataka u svakoj koloni
dota_data.isnull().sum()

win       0
region    0
mode      0
type      0
0         0
         ..
108       0
109       0
110       0
111       0
112       0
Length: 117, dtype: int64

In [77]:
#provera tipova podataka
dota_data.dtypes

win       int64
region    int64
mode      int64
type      int64
0         int64
          ...  
108       int64
109       int64
110       int64
111       int64
112       int64
Length: 117, dtype: object

In [78]:
#Grupisanje regiona
dota_data.loc[dota_data["region"].isin([111,112,113,114]), "region"] = 1 #US West
dota_data.loc[dota_data["region"].isin([121,122,123,124]), "region"] = 2 #US East
dota_data.loc[dota_data["region"].isin([131,132,133,134,135,136,137,138]), "region"] = 3 #Europe West
dota_data.loc[dota_data["region"].isin([142,143,144,145]), "region"] = 4 #South Korea
dota_data.loc[dota_data["region"].isin([151,152,153,154,155,156]), "region"] = 5 #Southeast Asia
dota_data.loc[dota_data["region"].isin([161,163, 221,222,223,224,225,227,231]), "region"] = 6 #China
dota_data.loc[dota_data["region"].isin([171]), "region"] = 7 #Australia
dota_data.loc[dota_data["region"].isin([181,182,183,184, 185,186,187,188]), "region"] = 8 #Russia
dota_data.loc[dota_data["region"].isin([191,192]), "region"] = 9 #Europe East
dota_data.loc[dota_data["region"].isin([200,202,203,204]), "region"] = 10 #South America
dota_data.loc[dota_data["region"].isin([211,212,213]), "region"] = 11 #South Africa
dota_data.loc[dota_data["region"].isin([241,242]), "region"] = 12 #Chile
dota_data.loc[dota_data["region"].isin([251]), "region"] = 13 #Peru
dota_data.loc[dota_data["region"].isin([261]), "region"] = 14 #India
dota_data['region'].head(5)

0    6
1    6
2    3
3    6
4    8
Name: region, dtype: int64

In [79]:
#Uklanjanje instanci ciji je tip igre 'Tutorial' 
df_f1=dota_data[dota_data['type'] != 3]

#Uklanjanje instanci ciji je mod igre 'Tutorial' 
df_f2=df_f1[df_f1['mode'] != 10]

In [80]:
df_f2['win'].value_counts()

1    3332
2    2957
Name: win, dtype: int64

In [81]:
Y = df_f2.iloc[:,0]
X = df_f2.iloc[:,1:]

In [69]:
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Columns','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))  #print 10 best features

    Columns      Score
0    region  88.149865
66       63  14.237835
101      98  13.986666
55       52  12.594571
44       41  11.114287
112     109  11.041193
38       35   7.416157
82       79   6.401269
71       68   5.944453
92       89   5.756112
