In [1]:
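# SelectKBest is a class in scikit-learn (sklearn.feature_selection), not a
# standalone package. If scikit-learn is missing, install it into the
# kernel's environment with:  %pip install scikit-learn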

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [27]:
# Column labels for the header-less CSV: four named fields followed by 113
# further columns labelled with the strings '0'..'112'.
# NOTE(review): the numbered columns are presumably one indicator per hero;
# if the dataset's hero ids start at 1, these labels are offset by one --
# confirm before mapping labels back to hero names.
col_names = ['win', 'region', 'mode', 'type']
num_arr = np.arange(113).astype(str)
all_names = np.concatenate([col_names, num_arr])

# The file carries no header row, so every column name is supplied explicitly.
dota_data = pd.read_csv("dota2Test.csv", names=all_names, header=None)

In [28]:
# Preview the first five rows (five is the default row count of head()).
dota_data.head()

Unnamed: 0,win,region,mode,type,0,1,2,3,4,5,...,103,104,105,106,107,108,109,110,111,112
0,-1,223,8,2,0,-1,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,1,227,8,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
2,-1,136,2,2,1,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,227,2,2,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,184,2,3,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# (rows, columns) of the loaded frame.
dota_data.shape

(10294, 117)

In [41]:
# Replace the value -1 with 2 in every cell of the frame (all columns).
# NOTE(review): presumably done so that all values are non-negative for the
# chi-squared scoring used later -- confirm. Because the replacement is
# applied to every column, a -1 in 'region', 'mode' or 'type' would also
# become 2.
dota_data = dota_data.replace(to_replace=-1, value=2)

In [31]:
# Number of missing values in each column.
dota_data.isna().sum()

win       0
region    0
mode      0
type      0
0         0
         ..
108       0
109       0
110       0
111       0
112       0
Length: 117, dtype: int64

In [32]:
# Check the data type of each column.
dota_data.dtypes

win       int64
region    int64
mode      int64
type      int64
0         int64
          ...  
108       int64
109       int64
110       int64
111       int64
112       int64
Length: 117, dtype: object

In [33]:
# Group the raw region codes into 14 coarse geographic groups.
# Each key is the new group code; the list holds the raw codes it absorbs.
# Codes that appear in none of the lists are left unchanged.
region_groups = {
    1: [111, 112, 113, 114],                                 # US West
    2: [121, 122, 123, 124],                                 # US East
    3: [131, 132, 133, 134, 135, 136, 137, 138],             # Europe West
    4: [142, 143, 144, 145],                                 # South Korea
    5: [151, 152, 153, 154, 155, 156],                       # Southeast Asia
    6: [161, 163, 221, 222, 223, 224, 225, 227, 231],        # China
    7: [171],                                                # Australia
    8: [181, 182, 183, 184, 185, 186, 187, 188],             # Russia
    9: [191, 192],                                           # Europe East
    10: [200, 202, 203, 204],                                # South America
    11: [211, 212, 213],                                     # South Africa
    12: [241, 242],                                          # Chile
    13: [251],                                               # Peru
    14: [261],                                               # India
}

# The new codes (1-14) never occur in any raw-code list, so the order of
# the assignments does not matter and re-running the cell is harmless.
for group_code, raw_codes in region_groups.items():
    dota_data.loc[dota_data["region"].isin(raw_codes), "region"] = group_code

dota_data['region'].head(5)

0    6
1    6
2    3
3    6
4    8
Name: region, dtype: int64

In [34]:
# Drop rows whose game type is 'Tutorial' (type code 3).
df_f1 = dota_data.loc[dota_data['type'].ne(3)]

# Drop rows whose game mode is 'Tutorial' (mode code 10).
df_f2 = df_f1.loc[df_f1['mode'].ne(10)]

In [35]:
# Class balance of the target after filtering.
df_f2.win.value_counts()

1    3332
2    2957
Name: win, dtype: int64

In [36]:
# Balance the two outcome classes by downsampling the larger one to the
# size of the smaller one.
df_wins = df_f2[df_f2.win == 1]
df_losses = df_f2[df_f2.win == 2]

# Decide which class is larger from the data instead of assuming it is
# win == 1; on a tie, win == 1 is treated as the majority.
if len(df_wins) >= len(df_losses):
    df_majority, df_minority = df_wins, df_losses
else:
    df_majority, df_minority = df_losses, df_wins

# n_samples is derived from the minority class rather than hard-coded, so
# the cell keeps producing balanced classes (and resample without
# replacement cannot be asked for more rows than exist) if the upstream
# data or filters change.
df_majority_downsampled = resample(df_majority,
                                   replace=False,                # sample without replacement
                                   n_samples=len(df_minority),   # match the minority class
                                   random_state=0)               # reproducible results
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled.win.value_counts()

2    2957
1    2957
Name: win, dtype: int64

In [38]:
# Target: the 'win' column (the first column of the frame).
Y = df_downsampled['win']
# Features: every remaining column, in their original order.
X = df_downsampled.drop(columns='win')

In [40]:
# Score every feature against the target with the chi-squared statistic,
# using a selector configured to keep the 10 highest-scoring features.
# NOTE(review): chi2 is documented for non-negative features such as
# booleans or frequencies; the columns here hold integer category codes,
# so score magnitudes may partly reflect the size of the codes -- confirm
# this is intended before relying on the ranking.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, Y)
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

# Pair each feature name with its score in one two-column table.
featureScores = pd.DataFrame({'Columns': dfcolumns[0], 'Score': dfscores[0]})
print(featureScores.nlargest(n=10, columns='Score'))  # ten highest-scoring features

    Columns       Score
0    region  132.482376
101      98   14.410526
66       63   13.087464
55       52   11.953642
44       41   10.286613
112     109    9.589532
89       86    6.652174
78       75    5.951945
71       68    5.835735
64       61    5.675570
