In [None]:

#!pip install opencv-contrib-python
#!pip install scikit-learn
#!pip install scikit-image
#!pip install imutils

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale

%matplotlib inline

In [None]:
# Load the match dataset from CSV.
#
# Alternative when the file sits next to the notebook:
#   data = pd.read_csv('BRA.csv', delimiter=',')
# NOTE(review): the path below is an absolute, machine-specific location;
# consider a relative or configurable path.
csv_path = '/home/marcelo-pinto/projects/python/DataFut/BRA.csv'
data = pd.read_csv(csv_path, sep=',')

# Show the first rows as a quick sanity check.
display(data.head())

In [None]:
# Explore the dataset: overall size and how the match results are distributed.

n_rows, n_cols = data.shape
matches = n_rows

# Column count minus one.
features = n_cols - 1

# Result codes as counted below: 1 = home win, 2 = away win, 0 = draw.
result_codes = data.Res
home_win = int((result_codes == 1).sum())
away_win = int((result_codes == 2).sum())
draw = int((result_codes == 0).sum())
val = [home_win, away_win, draw]

# Share of matches won by the home side, as a percentage.
win_rate = (home_win / matches) * 100

for caption, count in (
    ('Total de jogos:', matches),
    ('Total de colunas:', features),
    ('Total de jogos ganhos em casa:', home_win),
    ('Total de jogos ganhos pelo visitante:', away_win),
    ('Total de jogos empatados:', draw),
):
    print(caption, count)
print('Percentual de jogos ganhos em casa: {:.2f}%'.format(win_rate))



In [None]:
# Bar chart of the three result counts, in the order stored in `val`
# (home wins, away wins, draws).
x = np.arange(3)
fig, ax = plt.subplots()
ax.bar(x, val)
ax.set_xticks(x)
ax.set_xticklabels(('Home', 'Away', 'Draw'))
plt.show()


In [None]:
# Keep only the numeric variables by discarding the descriptive columns.
# `columns=` is used instead of a positional axis argument: pandas 2.x no
# longer accepts `drop(labels, 1)` and raises a TypeError.
non_numeric_columns = ['Country', 'League', 'Season', 'Date', 'Time', 'Home', 'Away']
num_data = data.drop(columns=non_numeric_columns)

display(num_data.head())

In [None]:
# Split the numeric frame into the candidate features (everything except the
# result column) and the label to predict ('Res').
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
features = num_data.drop(columns=['Res'])

labels = num_data['Res']

print('Features')
print(features.head())

print('=========')

print('Labels')
print(labels.head())


In [None]:
# Rank every candidate feature by its SelectKBest score against the label.

# Take the names from the frame that is actually scored, so that each score is
# paired with its own column. A hand-written name list (previously also sliced
# with [1:]) can silently drift out of alignment with `features`.
features_list = tuple(features.columns)

# k='all' keeps every feature; only the per-feature scores are needed here,
# so the selector is fitted without keeping a transformed copy of the data.
k_best_features = SelectKBest(k='all')
k_best_features.fit(features, labels)
k_best_features_scores = k_best_features.scores_

# (name, score) pairs ordered from the highest score to the lowest.
raw_pairs = zip(features_list, k_best_features_scores)
ordered_pairs = sorted(raw_pairs, key=lambda pair: pair[1], reverse=True)

# Keep at most the 15 best-scoring features.
k_best_features_final = dict(ordered_pairs[:15])
best_features = k_best_features_final.keys()
print('')
print('Melhores features:')
print(k_best_features_final)


In [None]:
# Final feature matrix: drop the label, the identifier columns and the
# features that were excluded from modelling.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
# NOTE(review): 'AG' is dropped but 'HG' stays in the feature set; if 'HG' is
# the home side's goal count for the same match it leaks the result -- confirm.
excluded_columns = ['Res', 'game_id', 'home_id', 'Away_id', 'AG', 'PD', 'PH']
features = num_data.drop(columns=excluded_columns)

labels = num_data['Res']

print('Features')
print(features.head())

print('=========')

print('Labels')
print(labels.head())


In [None]:
# Rescale every feature column with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, including the rows that are
# later sliced off for testing and prediction -- confirm this is intended.
scaler = MinMaxScaler()
features_scale = scaler.fit_transform(features)

print('Features: ', features_scale.shape)
print(features_scale)

In [None]:
# Positional hold-out split: rows [0, 1932) train the models and rows
# [1932, 2155) evaluate them.
train_rows = slice(None, 1932)
test_rows = slice(1932, 2155)

X_train, y_train = features_scale[train_rows], labels[train_rows]
X_test, y_test = features_scale[test_rows], labels[test_rows]

# Sanity check: features and labels must have matching lengths in each split.
print( len(X_train), len(y_train))

print( len(X_test), len(y_test))

In [None]:
# Baseline: logistic regression with default regularisation.
print ('LogisticRegression')

# `multi_class` is deprecated in recent scikit-learn releases; with the default
# solver a label with three or more classes is already fitted as a multinomial
# model, so the argument is omitted to stay compatible with newer versions.
clf_LR = LogisticRegression(max_iter=2000)
clf_LR.fit(X_train, y_train)
pred = clf_LR.predict(X_test)

lg_acc = accuracy_score(y_test, pred)
# NOTE: for single-label multiclass targets, micro-averaged F1 equals accuracy.
f1 = f1_score(y_test, pred, average='micro')
print ('Acurácia LogisticRegression:{}'.format(lg_acc))
print ('F1 Score:{}'.format(f1) )

In [None]:
# Grid search over the `C` parameter of LogisticRegression, then evaluate the
# best estimator found on the test rows.

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid).fit(X_train, y_train)

clf = search.best_estimator_
pred = clf.predict(X_test)

lg_acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred, average='macro')

print ('Acurácia LogisticRegression:{}'.format(lg_acc))
print ('F1 Score:{}'.format(f1) )

# Show the winning estimator and its parameters.
print (clf)

In [None]:
# Baseline: support vector classifier with default parameters.
print('SVC')

clf = SVC().fit(X_train, y_train)
pred = clf.predict(X_test)

svc_acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred, average='micro')

print ('Acurácia SVC:{}'.format(svc_acc))
print ('F1 Score:{}'.format(f1) )


In [None]:
# Grid search over the `C` parameter of SVC; the best estimator is kept in
# `clf_SVC`.

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

search = GridSearchCV(SVC(), param_grid)

search.fit(X_train, y_train)
clf_SVC = search.best_estimator_
pred = clf_SVC.predict(X_test)
acc = accuracy_score(y_test, pred)

f1 = f1_score(y_test, pred, average='micro')

print ('F1 Score:{}'.format(f1))

# The label names the model actually evaluated here (SVC); it previously read
# "LogisticRegression" and lacked the separating colon.
print ('Acurácia SVC:{}'.format(acc))

print (clf_SVC)


In [None]:
# Baseline: decision tree with default parameters.
print ('Decision Tree')

clf = DecisionTreeClassifier().fit(X_train, y_train)
pred = clf.predict(X_test)

dt_acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred, average='macro')

print ('Acurácia Tree:{}'.format(dt_acc))
print ('F1 Score:{}'.format(f1) )

# Candidate value lists; not referenced by any other visible cell.
n_estimators = [10, 50, 100, 200]
max_depth = [3, 10, 20, 40]

In [None]:
# Grid search over decision-tree hyper-parameters, then evaluate the best
# estimator found on the test rows.

print('Decision Tree')

param_grid = {
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12]
}

# Build a fresh search around a DecisionTreeClassifier. Without this line
# `search` would still be the grid search left behind by an earlier cell, so a
# different model would be refitted and reported under the "Decision Tree" label.
# random_state is fixed because max_features < n_features makes split
# selection random; this keeps the reported scores reproducible.
search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid)

search.fit(X_train, y_train)
clf = search.best_estimator_
pred = clf.predict(X_test)
dt_acc = accuracy_score(y_test, pred)

f1 = f1_score(y_test, pred, average='micro')

print ('Acurácia Decision Tree:{}'.format(dt_acc))
print ('F1 Score:{}'.format(f1))

# Show the winning estimator and its parameters.
print (clf)


In [None]:
# Train and evaluate a Gaussian Naive Bayes classifier.

print ('Naive Baeys')

clf = GaussianNB().fit(X_train, y_train)
pred = clf.predict(X_test)

nb_acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred, average='micro')

print('Acurácia Naive Baeys:{}'.format(nb_acc))
print('F1 Score:{}'.format(f1) )


In [None]:
# Run the tuned SVC on the rows from position 2155 onward and line the
# predictions up with the recorded results and game ids.

remaining_rows = slice(2155, None)

previsao = features_scale[remaining_rows]

game_id_full = data['game_id']
game_id = game_id_full[remaining_rows]

res_full = data['Res']
res = res_full[remaining_rows]

pred = clf_SVC.predict(previsao)

comparison = {'real': res, 'previsao': pred, 'game_id': game_id}
df = pd.DataFrame(comparison)

print(df)

In [None]:
# Confusion matrix of recorded vs. predicted results, drawn as a heatmap.

compare_cols = ['real', 'previsao']
df = pd.DataFrame(df, columns=compare_cols)

# Rows are the recorded results, columns are the predicted ones.
cf_matrix = pd.crosstab(
    index=df['real'],
    columns=df['previsao'],
    rownames=['real'],
    colnames=['previsao'],
)

sns.heatmap(data=cf_matrix, annot=True, cmap='Reds')