In [65]:
#import sys
#!{sys.executable} -m pip install SelectKBest

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Učitavanje podataka i imenovanje kolona

In [2]:
col_names = ['win', 'region', 'mode', 'type']
num_arr = np.arange(113).astype(str)
all_names = np.concatenate((col_names, num_arr), axis = 0)

dota_data = pd.read_csv("dota2.csv", header=None, names=all_names)

In [3]:
dota_data.head(5)

Unnamed: 0,win,region,mode,type,0,1,2,3,4,5,...,103,104,105,106,107,108,109,110,111,112
0,-1,223,8,2,0,-1,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,1,227,8,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
2,-1,136,2,2,1,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,227,2,2,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,184,2,3,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Provera tipova podataka i postojanje null vrednosti

In [4]:
dota_data.shape

(102944, 117)

In [5]:
#Promena vrednosti -1 u 2 u svim celijama
#dota_data = dota_data.replace(-1, 2)

#Brisanje duplikata
dota_data = dota_data.drop_duplicates()

In [6]:
#Koliko se koji heroj pojavljuje u mecevima
#Moze se reci da se heroji pojavljuju skoro podjednako u timu 1 i 2, i da je njihovo ukupno pojavljivanje priblizno jednako sa izuzecima manjeg pojavljivanja
heroes = dota_data.iloc[:,4:]
heroes_counts=[]
for x in num_arr:
    heroes_counts.append(heroes[x].value_counts())
    
heroes_counts

[ 0    86339
 -1     8386
  1     8219
 Name: 0, dtype: int64,
  0    80448
 -1    11278
  1    11218
 Name: 1, dtype: int64,
  0    100114
  1      1446
 -1      1384
 Name: 2, dtype: int64,
  0    90000
 -1     6509
  1     6435
 Name: 3, dtype: int64,
  0    91753
 -1     5719
  1     5472
 Name: 4, dtype: int64,
  0    78875
  1    12185
 -1    11884
 Name: 5, dtype: int64,
  0    90360
 -1     6294
  1     6290
 Name: 6, dtype: int64,
  0    77010
 -1    13170
  1    12764
 Name: 7, dtype: int64,
  0    67362
  1    18101
 -1    17481
 Name: 8, dtype: int64,
  0    92055
  1     5499
 -1     5390
 Name: 9, dtype: int64,
  0    91385
  1     5864
 -1     5695
 Name: 10, dtype: int64,
  0    94858
 -1     4115
  1     3971
 Name: 11, dtype: int64,
  0    97815
  1     2571
 -1     2558
 Name: 12, dtype: int64,
  0    71475
  1    15779
 -1    15690
 Name: 13, dtype: int64,
  0    98314
  1     2331
 -1     2299
 Name: 14, dtype: int64,
  0    90655
  1     6213
 -1     6076
 Name: 1

In [7]:
#Broj null podataka u svakoj koloni
dota_data.isnull().sum()

win       0
region    0
mode      0
type      0
0         0
         ..
108       0
109       0
110       0
111       0
112       0
Length: 117, dtype: int64

In [8]:
#provera tipova podataka
dota_data.dtypes

win       int64
region    int64
mode      int64
type      int64
0         int64
          ...  
108       int64
109       int64
110       int64
111       int64
112       int64
Length: 117, dtype: object

## Preprocesiranje podataka

In [9]:
#Grupisanje regiona
dota_data.loc[dota_data["region"].isin([111,112,113,114]), "region"] = 1 #US West
dota_data.loc[dota_data["region"].isin([121,122,123,124]), "region"] = 2 #US East
dota_data.loc[dota_data["region"].isin([131,132,133,134,135,136,137,138]), "region"] = 3 #Europe West
dota_data.loc[dota_data["region"].isin([142,143,144,145]), "region"] = 4 #South Korea
dota_data.loc[dota_data["region"].isin([151,152,153,154,155,156]), "region"] = 5 #Southeast Asia
dota_data.loc[dota_data["region"].isin([161,163, 221,222,223,224,225,227,231]), "region"] = 6 #China
dota_data.loc[dota_data["region"].isin([171]), "region"] = 7 #Australia
dota_data.loc[dota_data["region"].isin([181,182,183,184, 185,186,187,188]), "region"] = 8 #Russia
dota_data.loc[dota_data["region"].isin([191,192]), "region"] = 9 #Europe East
dota_data.loc[dota_data["region"].isin([200,202,203,204]), "region"] = 10 #South America
dota_data.loc[dota_data["region"].isin([211,212,213]), "region"] = 11 #South Africa
dota_data.loc[dota_data["region"].isin([241,242]), "region"] = 12 #Chile
dota_data.loc[dota_data["region"].isin([251]), "region"] = 13 #Peru
dota_data.loc[dota_data["region"].isin([261]), "region"] = 14 #India
dota_data['region'].head(5)

0    6
1    6
2    3
3    6
4    8
Name: region, dtype: int64

In [10]:
#Uklanjanje instanci ciji je tip igre 'Tutorial' 
df_f1=dota_data[dota_data['type'] != 3]

#Uklanjanje instanci ciji je mod igre 'Tutorial' 
df_f2=df_f1[df_f1['mode'] != 10]

In [11]:
df_f2['win'].value_counts()

 1    33397
-1    29895
Name: win, dtype: int64

In [12]:
#balansiranje dataseta
g = df_f2.groupby('win')
g=pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)))
g['win'].value_counts()

 1    29895
-1    29895
Name: win, dtype: int64

## Razdvajanje feature i target kolona

In [13]:
Y = g.iloc[:,0]
X = g.iloc[:,1:]

## Razdvajanje train i test setova 80:20

In [67]:
from sklearn.model_selection import train_test_split
#random state = 1, kako bi se koristio int seed za RNG
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [68]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
svc_model = LinearSVC(random_state=0,max_iter = 100000, dual = False, penalty='l2')

pred = svc_model.fit(X_train, Y_train).predict(X_test)
print("LinearSVC accuracy : ",accuracy_score(Y_test, pred, normalize = True))
print("Confusion matrix:")
print(confusion_matrix(Y_test, pred))
print("Report:")
print(classification_report(Y_test, pred))

LinearSVC accuracy :  0.5981769526676701
Confusion matrix:
[[3561 2344]
 [2461 3592]]
Report:
              precision    recall  f1-score   support

          -1       0.59      0.60      0.60      5905
           1       0.61      0.59      0.60      6053

    accuracy                           0.60     11958
   macro avg       0.60      0.60      0.60     11958
weighted avg       0.60      0.60      0.60     11958



In [82]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=1000, dual=False)
logreg.fit(X_train,Y_train)
pred = logreg.predict(X_test)
print('Accuracy of Logistic regression classifier on training set: '
     ,logreg.score(X_train, Y_train))
print('Accuracy of Logistic regression classifier on test set: ',
     accuracy_score(Y_test, pred),'\n')
print("Confusion matrix:")
print(confusion_matrix(Y_test, pred))
print("Report:")
print(classification_report(Y_test, pred))

Accuracy of Logistic regression classifier on training set:  0.6017101521993644
Accuracy of Logistic regression classifier on test set:  0.597842448569995 

Confusion matrix:
[[3561 2344]
 [2465 3588]]
Report:
              precision    recall  f1-score   support

          -1       0.59      0.60      0.60      5905
           1       0.60      0.59      0.60      6053

    accuracy                           0.60     11958
   macro avg       0.60      0.60      0.60     11958
weighted avg       0.60      0.60      0.60     11958



In [83]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=100, random_state=0, n_estimators=200, criterion='entropy', min_samples_split=2
                            ,n_jobs=-1,bootstrap=True,min_samples_leaf=20)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print('Accuracy of Random forest classifier on test set: ',accuracy_score(Y_test, pred))
print("Confusion matrix:")
print(confusion_matrix(Y_test, pred))
print("Report:")
print(classification_report(Y_test, pred))

Accuracy of Random forest classifier on test set:  0.5866365612978759
Confusion matrix:
[[3569 2336]
 [2607 3446]]
Report:
              precision    recall  f1-score   support

          -1       0.58      0.60      0.59      5905
           1       0.60      0.57      0.58      6053

    accuracy                           0.59     11958
   macro avg       0.59      0.59      0.59     11958
weighted avg       0.59      0.59      0.59     11958



In [84]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='log', penalty='l1',fit_intercept=False, learning_rate='adaptive',eta0=1
                   ,power_t=0.5, max_iter=1000)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print('Accuracy of SGD classifier on test set: ',accuracy_score(Y_test, pred))
print("Confusion matrix:")
print(confusion_matrix(Y_test, pred))
print("Report:")
print(classification_report(Y_test, pred))

Accuracy of SGD classifier on test set:  0.5975915704967386
Confusion matrix:
[[3572 2333]
 [2479 3574]]
Report:
              precision    recall  f1-score   support

          -1       0.59      0.60      0.60      5905
           1       0.61      0.59      0.60      6053

    accuracy                           0.60     11958
   macro avg       0.60      0.60      0.60     11958
weighted avg       0.60      0.60      0.60     11958

