# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import scipy as sp
import itertools
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomTreesEmbedding, VotingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

# dtype map handed to pd.read_csv: integer column labels 0..21514 are read as int32.
types = dict.fromkeys(range(21515), np.int32)

# Datensätze

LineUp mit Spielernamen

In [None]:
# Line-ups given as player names. The file is read without a header row,
# so the columns get integer labels.
fifa_names = pd.read_csv(
    "data/data_names.csv",
    header=None,
    index_col=None,
    encoding='utf8',
)

In [None]:
# Preview the first rows of the name-based line-up table.
fifa_names.head()

In [None]:
# Display the table without column 0. The result is only shown, not assigned,
# so fifa_names itself stays unchanged.
fifa_names.drop(columns=[0])

In [None]:
# Columns 1 through 11 (label slices with .loc include both ends).
# NOTE(review): presumably the eleven players of the first team — confirm against the data file.
fifa_names.loc[:,1:11]

In [None]:
# Columns 12 through 22 (label slices with .loc include both ends).
# NOTE(review): presumably the eleven players of the second team — confirm against the data file.
fifa_names.loc[:,12:22]

In [None]:
# Column 23 on its own.
# NOTE(review): presumably the match outcome — confirm against the data file.
fifa_names.loc[:,23]

LineUp mit Spieler-IDs

In [None]:
# Line-ups given as player IDs. No header row; columns covered by `types`
# are parsed as int32.
fifa_simple = pd.read_csv(
    "data/data_simple_train.csv",
    header=None,
    index_col=None,
    dtype=types,
    encoding='utf8',
)

In [None]:
# Preview the first rows of the ID-based line-up table.
fifa_simple.head()

LineUp alle Spieler gegen alle Spieler (1: nimmt teil, 0: nimmt nicht teil)

In [2]:
# Line-ups as 0/1 participation columns. No header row; columns covered by
# `types` are parsed as int32.
fifa_complex = pd.read_csv(
    "data/data_complex_train.csv",
    header=None,
    index_col=None,
    dtype=types,
    encoding='utf8',
)

In [None]:
# Preview the first rows of the 0/1 participation table.
fifa_complex.head()

In [None]:
# Spot check: show columns 100 through 120 (both ends included) for all rows.
fifa_complex.loc[:,100:120]

In [None]:
# Spot check: show columns 5535 through 5558 (both ends included) for all rows.
fifa_complex.loc[:,5535:5558]

In [None]:
# Sanity check for a single match: count how many participation flags are set
# to 1 in the home half and in the away half of the row.
row = 0

# Columns 1..21514 hold the 0/1 flags (column 0 and column 21515 are excluded
# from the feature set elsewhere in this notebook). They split into two equal
# halves of 10757 columns: home first, away second.
n_players = 10757

# .loc label slices include both end points.
home_flags = fifa_complex.loc[row, 1:n_players]                   # columns 1..10757
# The away half starts directly after the last home column. The previous
# offset of 10756 began one column early, so the last home column was counted
# as "away" and the last away column (21514) was never inspected.
away_flags = fifa_complex.loc[row, n_players + 1:2 * n_players]   # columns 10758..21514

home = int((home_flags == 1).sum())
away = int((away_flags == 1).sum())

print("Row", row,":")
print("Homeplayer:", home)
print("Awayplayer:", away)

# Feature Set, Sparse Matrix und Train/Test Split

In [3]:
# Model inputs: every column except column 0 and the label column 21515.
feature = fifa_complex.drop(columns=[0, 21515]).values
# Labels: column 21515.
target = fifa_complex[21515].values

# The same input matrix in both compressed sparse layouts (row- and column-oriented).
feature_csr = sp.sparse.csr_matrix(feature)
feature_csc = sp.sparse.csc_matrix(feature)

In [None]:
# Text dump of the column-compressed matrix (prints its stored entries).
print(feature_csc)

In [None]:
# Text dump of the row-compressed matrix (prints its stored entries).
print(feature_csr)

In [4]:
# Hold out 20% of the matches for evaluation. The fixed random_state makes the
# split (and therefore every accuracy reported below) identical on each
# Restart & Run All; without it the rows are reshuffled on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    feature_csr, target, test_size=0.2, random_state=42
)
# Number of training and test rows.
print(X_train.shape[0])
print(X_test.shape[0])

5170
1293


# Classifier Vergleich

In [None]:
# One default-configured instance of every classifier family to compare.
classifiers = [
    estimator_class()
    for estimator_class in (
        LinearSVC,
        SVC,
        NuSVC,
        DecisionTreeClassifier,
        KNeighborsClassifier,
        RandomForestClassifier,
        AdaBoostClassifier,
        ExtraTreesClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
    )
]

In [None]:
# Fit each candidate on the training split and report its accuracy on the test split.
for clf in classifiers:
    clf.fit(X_train, y_train)
    # score() already returns a single accuracy value. The former `.mean()` on
    # it was redundant and raises AttributeError when a plain Python float is
    # returned.
    accuracy = clf.score(X_test, y_test)
    print("Richtigkeit: %0.3f [%s]" % (accuracy, type(clf).__name__))

# Bewertung und Darstellung der Ergebnisse

Decision Trees

In [None]:
# Decision-tree variants: one hyper-parameter changed per entry, followed by a
# sweep over the maximum tree depth.
classifierDecisionTree = [
    DecisionTreeClassifier(criterion='gini'),  # library default criterion
    DecisionTreeClassifier(criterion='entropy'),
    DecisionTreeClassifier(splitter='best'),  # library default splitter
    DecisionTreeClassifier(splitter='random'),
    DecisionTreeClassifier(class_weight='balanced'),
] + [
    DecisionTreeClassifier(max_depth=depth)
    for depth in (2, 4, 6, 8)
]

In [None]:
# Fit each decision-tree variant and report its accuracy on the test split.
# All entries share one class name, so a printed line can only be matched to
# its configuration by its position in classifierDecisionTree.
for clf in classifierDecisionTree:
    clf.fit(X_train, y_train)
    # score() already returns a single accuracy value. The former `.mean()` on
    # it was redundant and raises AttributeError when a plain Python float is
    # returned.
    accuracy = clf.score(X_test, y_test)
    print("Richtigkeit: %0.3f [%s]" % (accuracy, type(clf).__name__))

Support Vector Machines

In [None]:
# Support-vector variants: LinearSVC, then SVC and NuSVC with each kernel
# ('rbf' is the library default).
# kernel='precomputed' is left out for both (original note: no support for sparse input).
classifierSVMs = [LinearSVC()] + [
    svm_class(kernel=kernel_name)
    for svm_class in (SVC, NuSVC)
    for kernel_name in ('linear', 'poly', 'rbf', 'sigmoid')
]

In [None]:
# Fit each SVM variant and report its accuracy on the test split.
# Several entries share a class name, so a printed line is matched to its
# kernel by its position in classifierSVMs.
for clf in classifierSVMs:
    clf.fit(X_train, y_train)
    # score() already returns a single accuracy value. The former `.mean()` on
    # it was redundant and raises AttributeError when a plain Python float is
    # returned.
    accuracy = clf.score(X_test, y_test)
    print("Richtigkeit: %0.3f [%s]" % (accuracy, type(clf).__name__))

Ensemble Methods (Random Forest Classifier / Voting Classifier)

In [5]:
# Random-forest grid as (number of trees, maximum depth) pairs; None means
# unlimited depth. n_estimators=10 was the library default only in older
# scikit-learn releases.
classifierRandomForest = [
    RandomForestClassifier(n_estimators=n_trees, max_depth=depth)
    for n_trees, depth in (
        (10, None),
        (100, None),
        (100, 6),
        (10000, None),
        (10000, 6),
    )
]

In [6]:
# Fit each random-forest configuration and report its accuracy on the test split.
# Lines are printed in the order of classifierRandomForest.
for clf in classifierRandomForest:
    clf.fit(X_train, y_train)
    # score() already returns a single accuracy value. The former `.mean()` on
    # it was redundant and raises AttributeError when a plain Python float is
    # returned.
    accuracy = clf.score(X_test, y_test)
    print("Richtigkeit: %0.3f [%s]" % (accuracy, type(clf).__name__))

Richtigkeit: 0.481 [RandomForestClassifier]
Richtigkeit: 0.519 [RandomForestClassifier]
Richtigkeit: 0.441 [RandomForestClassifier]
Richtigkeit: 0.518 [RandomForestClassifier]
Richtigkeit: 0.441 [RandomForestClassifier]


In [None]:
# Base learners shared by the voting ensembles below.
# NOTE(review): `base_estimator=` is the older scikit-learn spelling of this
# argument; newer releases name it `estimator=` — confirm against the installed version.
clf1 = AdaBoostClassifier(algorithm='SAMME',base_estimator=NuSVC(kernel='linear', probability=True))
clf2 = RandomForestClassifier(n_estimators=10750)
clf21 = RandomForestClassifier(n_estimators=1000)
clf3 = BaggingClassifier(base_estimator=NuSVC(kernel='linear', probability=True))
clf4 = GradientBoostingClassifier()

# Ensemble variants to compare. Labels: ab = AdaBoost, rf = random forest,
# bc = bagging, gb = gradient boosting.
classifierVoting = [
    # This entry used to label the bagging estimator clf3 as 'rf'. The label now
    # matches the estimator; the estimators themselves are unchanged.
    # NOTE(review): if a random forest was intended here, use ('rf', clf21) instead.
    VotingClassifier(estimators=[('ab', clf1), ('bc', clf3)], voting='soft'),
    VotingClassifier(estimators=[('ab', clf1), ('rf', clf21), ('bc', clf3), ('gb', clf4)], voting='hard'),
    VotingClassifier(estimators=[('ab', clf1), ('rf', clf21), ('bc', clf3), ('gb', clf4)], voting='soft'),
    VotingClassifier(estimators=[('ab', clf1), ('rf', clf21), ('bc', clf3)], voting='soft'),
    VotingClassifier(estimators=[('ab', clf1), ('rf', clf2), ('bc', clf3), ('gb', clf4)], voting='soft')
]

In [None]:
# Fit each voting ensemble and report its accuracy on the test split.
# All entries are VotingClassifier instances, so a printed line is matched to
# its ensemble by its position in classifierVoting.
for clf in classifierVoting:
    clf.fit(X_train, y_train)
    # score() already returns a single accuracy value. The former `.mean()` on
    # it was redundant and raises AttributeError when a plain Python float is
    # returned.
    accuracy = clf.score(X_test, y_test)
    print("Richtigkeit: %0.3f [%s]" % (accuracy, type(clf).__name__))

Ergebnisse

In [None]:
# Soft-voting ensemble of AdaBoost, the 1000-tree random forest and bagging.
# It is also the model used for the final prediction.
clf_predict = VotingClassifier(
    voting='soft',
    estimators=[('ab', clf1), ('rf', clf21), ('bc', clf3)],
)

# The three models that get a detailed evaluation.
classifierResult = [NuSVC(kernel='linear'), RandomForestClassifier(n_estimators=10750), clf_predict]

In [None]:
# Adapted from the scikit-learn confusion-matrix example
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence of str
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, each row is divided by its row sum before plotting, and the
        cells are annotated with two decimals instead of integer counts.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The function only draws on the current pyplot figure; the caller
    creates the figure and calls ``plt.show()``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A row that sums to zero is left at 0 instead of being divided by
        # zero, which would fill the row with NaN and break the colouring.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are included in it.
    plt.tight_layout()

In [None]:
# Loop-invariant setup, hoisted out of the loop.
# NOTE(review): the label order assumes the target encodes draw / home win /
# away win in ascending order — confirm against the data.
class_names = ('Unentschieden', 'SiegerHeim', 'SiegerAuswärts')
np.set_printoptions(precision=2)

# Fit each final candidate, then show its raw and row-normalised confusion
# matrix together with the accuracy on the test split.
for clf in classifierResult:
    # fit() returns the estimator itself; the name `c` is kept because later
    # cells may refer to it.
    c = clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    accuracy_number = accuracy_score(y_test, y_predict, normalize=False)  # count of correct predictions
    accuracy_percent = accuracy_score(y_test, y_predict)                  # fraction of correct predictions

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_predict)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')

    print("Richtigkeit: %0.3f (%d) [%s]" % (accuracy_percent, accuracy_number, type(c).__name__))

    plt.show()

# WM 2018 KO-Runde

In [None]:
# World Cup 2018 knockout matches, read with the same settings as the training file.
fifa_wm_ko = pd.read_csv(
    "data/data_complex_test.csv",
    header=None,
    index_col=None,
    dtype=types,
    encoding='utf8',
)

# Labels: column 21515. Inputs: everything except column 0 and column 21515,
# stored as a row-compressed sparse matrix.
target_wm_ko = fifa_wm_ko[21515].values
feature_wm_ko = sp.sparse.csr_matrix(fifa_wm_ko.drop(columns=[0, 21515]).values)

In [None]:
# Preview the first rows of the knockout-round table.
fifa_wm_ko.head()

In [None]:
# Predict the knockout-round outcomes with the voting ensemble. clf_predict
# must already be fitted; that happens when the models in classifierResult
# are trained.
wm_predict = clf_predict.predict(feature_wm_ko)

accuracy_number = accuracy_score(target_wm_ko, wm_predict, normalize=False)  # count of correct predictions
accuracy_percent = accuracy_score(target_wm_ko, wm_predict)                  # fraction of correct predictions
# Name the model that actually produced the prediction, rather than reading a
# variable left over from an earlier loop.
print("Richtigkeit: %0.3f (%d) [%s]" % (accuracy_percent, accuracy_number, type(clf_predict).__name__))