In [None]:
from sklearn.datasets import make_classification
from sklearn import ensemble
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import duckdb as db
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from matplotlib.collections import QuadMesh
import matplotlib.font_manager as fm

In [None]:
# Load 'Full_match_outcome.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv('Full_match_outcome.csv')

In [None]:
# Show the loaded frame as this cell's output.
df

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for evaluation.
# NOTE(review): an earlier comment here described the data as a time series and
# the split as index-based, but shuffle=True draws the rows at random, so the
# split is NOT order-preserving. If chronological order matters, switch to
# shuffle=False (and drop random_state) -- confirm the intended behaviour.
# random_state pins the shuffle so the split is identical after a kernel restart.
train, test = train_test_split(df, test_size=0.4, shuffle=True, random_state=42)

In [None]:
# Feature columns passed to every classifier fitted below.
pred_vars = [
    't1_t1_entropy',
    't2_t2_entropy',
    't1_t2_entropy',
    't2_t1_entropy',
]

# Target column the classifiers are trained to predict.
y_var = 'home_win'

In [None]:
# Entropy-criterion decision tree capped at depth 10.
# random_state fixes the tree's internal randomisation so the fit is
# reproducible across kernel restarts.
dtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=42)
dtree.fit(train[pred_vars], train[y_var])

In [None]:
# Random forest of 200 trees.
# random_state fixes the bootstrap/feature sampling so the fitted forest (and
# every metric derived from it) is reproducible across kernel restarts.
rf = ensemble.RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(train[pred_vars], train[y_var])

In [None]:
# Multi-layer perceptron with two hidden layers of 20 units, at most 500
# iterations, stopping after 20 iterations without improvement.
# random_state fixes the weight initialisation so the fit is reproducible
# across kernel restarts.
mlp = MLPClassifier(
    hidden_layer_sizes=(20, 20),
    max_iter=500,
    n_iter_no_change=20,
    random_state=42,
)
mlp.fit(train[pred_vars], train[y_var])

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training rows.
nb = GaussianNB()
nb.fit(train.loc[:, pred_vars], train.loc[:, y_var])

In [None]:
# Logistic regression with default settings, fitted on the training rows.
lr = LogisticRegression()
lr.fit(train.loc[:, pred_vars], train.loc[:, y_var])

In [None]:
# k-nearest-neighbours classifier with default settings, fitted on the training rows.
knr = KNeighborsClassifier()
knr.fit(train.loc[:, pred_vars], train.loc[:, y_var])

In [None]:
# Fitted classifiers compared on the held-out rows.
fitted = [dtree, rf, mlp, nb, lr, knr]

# Columns of the summary table, in display order.
result_columns = ['classifier_name', 'fpr', 'tpr', 'auc', 'log_loss', 'clf_report']

# Held-out features and target, selected once instead of on every call.
X_test = test[pred_vars]
y_test = test[y_var]

# One record per classifier; the frame is built once after the loop.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records = []

for clf in fitted:
    clf_name = clf.__class__.__name__
    print(clf_name)

    # class probabilities and hard class predictions on the test rows
    yproba = clf.predict_proba(X_test)
    yclass = clf.predict(X_test)

    # NOTE(review): column 1 is used as the positive-class score; this assumes
    # a binary target whose positive label sorts second (e.g. 0/1) -- confirm.
    pos_proba = yproba[:, 1]

    # ROC curve points and area under the curve
    fpr, tpr, _ = metrics.roc_curve(y_test, pos_proba)
    auc = metrics.roc_auc_score(y_test, pos_proba)

    # log loss of the positive-class probabilities
    log_loss = metrics.log_loss(y_test, pos_proba)

    # precision / recall / f1 text report from the hard predictions
    clf_report = metrics.classification_report(y_test, yclass)

    records.append({'classifier_name': clf_name,
                    'fpr': fpr,
                    'tpr': tpr,
                    'auc': auc,
                    'log_loss': log_loss,
                    'clf_report': clf_report})

result_table = pd.DataFrame(records, columns=result_columns)

In [None]:
# Index the summary by classifier name.
# Guarded and reassigned (no inplace) so the cell can be re-run: once
# 'classifier_name' has become the index, an unconditional set_index would
# raise KeyError.
if 'classifier_name' in result_table.columns:
    result_table = result_table.set_index('classifier_name')
display(result_table)

In [None]:
# Print each classifier's text report followed by its log loss.
for clf_name, row in result_table.iterrows():
    print('\n---- statistics for', clf_name, "----\n")
    print(row['clf_report'])
    print("Model log loss:", row['log_loss'])

In [None]:
# Overlay the ROC curve of every classifier in result_table on one set of axes.
fig, ax = plt.subplots(figsize=(14, 12))

for clf_name, row in result_table.iterrows():
    curve_label = "{}, AUC={:.3f}".format(clf_name, row['auc'])
    ax.plot(row['fpr'], row['tpr'], label=curve_label)

# dashed diagonal from (0, 0) to (1, 1) as a visual reference
ax.plot([0, 1], [0, 1], color='orange', linestyle='--')

# identical 0.0 .. 1.0 tick grid (step 0.1) on both axes
tick_positions = np.arange(0.0, 1.1, step=0.1)
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xlabel("False Positive Rate", fontsize=15)
ax.set_ylabel("True Positive Rate", fontsize=15)

ax.set_title('ROC Curve Analysis', fontweight='bold', fontsize=15)
ax.legend(prop={'size': 13}, loc='lower right')

plt.show()

In [None]:
# NOTE: pp_matrix is defined in a later cell of this notebook; run that cell first.
predicted = dtree.predict(test.loc[:, pred_vars])
cm = metrics.confusion_matrix(y_true=test[y_var], y_pred=predicted, labels=[1, 0])
# confusion_matrix puts the TRUE labels on the rows and the PREDICTED labels on
# the columns, so the frame is labelled that way; pp_matrix (default
# pred_val_axis="y") transposes it so predictions end up on the y axis.
df_cm = pd.DataFrame(cm,
                     index=['Actual Win', 'Actual No Win'],
                     columns=['Predicted Win', 'Predicted No Win'])
pp_matrix(df_cm, cmap='copper', title='Decision Tree Confusion Matrix', file='dtree_cm.png')

In [None]:
# NOTE: pp_matrix is defined in a later cell of this notebook; run that cell first.
predicted = rf.predict(test.loc[:, pred_vars])
cm = metrics.confusion_matrix(y_true=test[y_var], y_pred=predicted, labels=[1, 0])
# confusion_matrix puts the TRUE labels on the rows and the PREDICTED labels on
# the columns, so the frame is labelled that way; pp_matrix (default
# pred_val_axis="y") transposes it so predictions end up on the y axis.
df_cm = pd.DataFrame(cm,
                     index=['Actual Win', 'Actual No Win'],
                     columns=['Predicted Win', 'Predicted No Win'])
pp_matrix(df_cm, cmap='copper', title='Random Forest Confusion Matrix', file='rf_cm.png')

In [None]:
# NOTE: pp_matrix is defined in a later cell of this notebook; run that cell first.
predicted = mlp.predict(test.loc[:, pred_vars])
cm = metrics.confusion_matrix(y_true=test[y_var], y_pred=predicted, labels=[1, 0])
# confusion_matrix puts the TRUE labels on the rows and the PREDICTED labels on
# the columns, so the frame is labelled that way; pp_matrix (default
# pred_val_axis="y") transposes it so predictions end up on the y axis.
df_cm = pd.DataFrame(cm,
                     index=['Actual Win', 'Actual No Win'],
                     columns=['Predicted Win', 'Predicted No Win'])
pp_matrix(df_cm, cmap='copper', title='MLP Confusion Matrix', file='MLP_cm.png')

In [None]:
# NOTE: pp_matrix is defined in a later cell of this notebook; run that cell first.
predicted = nb.predict(test.loc[:, pred_vars])
cm = metrics.confusion_matrix(y_true=test[y_var], y_pred=predicted, labels=[1, 0])
# confusion_matrix puts the TRUE labels on the rows and the PREDICTED labels on
# the columns, so the frame is labelled that way; pp_matrix (default
# pred_val_axis="y") transposes it so predictions end up on the y axis.
df_cm = pd.DataFrame(cm,
                     index=['Actual Win', 'Actual No Win'],
                     columns=['Predicted Win', 'Predicted No Win'])
pp_matrix(df_cm, cmap='copper', title='Naive Bayes Confusion Matrix', file='nb_cm.png')

In [None]:
# NOTE: pp_matrix is defined in a later cell of this notebook; run that cell first.
predicted = lr.predict(test.loc[:, pred_vars])
cm = metrics.confusion_matrix(y_true=test[y_var], y_pred=predicted, labels=[1, 0])
# confusion_matrix puts the TRUE labels on the rows and the PREDICTED labels on
# the columns, so the frame is labelled that way; pp_matrix (default
# pred_val_axis="y") transposes it so predictions end up on the y axis.
df_cm = pd.DataFrame(cm,
                     index=['Actual Win', 'Actual No Win'],
                     columns=['Predicted Win', 'Predicted No Win'])
# lr is a LogisticRegression model, so the title says "Logistic".
pp_matrix(df_cm, cmap='copper', title='Logistic Regression Confusion Matrix', file='lr_cm.png')

In [None]:
# NOTE: pp_matrix is defined in a later cell of this notebook; run that cell first.
predicted = knr.predict(test.loc[:, pred_vars])
cm = metrics.confusion_matrix(y_true=test[y_var], y_pred=predicted, labels=[1, 0])
# confusion_matrix puts the TRUE labels on the rows and the PREDICTED labels on
# the columns, so the frame is labelled that way; pp_matrix (default
# pred_val_axis="y") transposes it so predictions end up on the y axis.
df_cm = pd.DataFrame(cm,
                     index=['Actual Win', 'Actual No Win'],
                     columns=['Predicted Win', 'Predicted No Win'])
pp_matrix(df_cm, cmap='copper', title='KNearest Neighbors Confusion Matrix', file='knr_cm.png')

In [None]:
# NOTE: pp_matrix is defined in a later cell of this notebook; run that cell first.
# Predictions here are read from the 'Home_Win_Pred' column of the test rows
# instead of being produced by one of the fitted classifiers.
predicted = test['Home_Win_Pred']
cm = metrics.confusion_matrix(y_true=test[y_var], y_pred=predicted, labels=[1, 0])
# confusion_matrix puts the TRUE labels on the rows and the PREDICTED labels on
# the columns, so the frame is labelled that way; pp_matrix (default
# pred_val_axis="y") transposes it so predictions end up on the y axis.
df_cm = pd.DataFrame(cm,
                     index=['Actual Win', 'Actual No Win'],
                     columns=['Predicted Win', 'Predicted No Win'])
pp_matrix(df_cm, cmap='copper', title='SEI Confusion Matrix', file='SEI_cm.png')

In [None]:
def configcell_text_and_colors(
    array_df, lin, col, oText, facecolors, posi, fz, fmt, show_null_values=0
):
    """
    Configure the annotation text and face colour of one confusion-matrix cell.

    The last line and last column of ``array_df`` are treated as totals.
    Inner cells have their existing text artist updated in place; total cells
    have their text artist scheduled for removal and replaced by three
    stacked texts (count, percent correct, percent wrong).

    params:
      array_df          2-D numpy array of counts including the totals
                        line and column (it is sliced as ``array_df[:, col]``)
      lin, col          line / column index of the cell
      oText             text artist currently annotating the cell
      facecolors        per-cell RGBA colours; entry ``posi`` is overwritten
                        in place for diagonal and total cells
      posi              index of this cell inside ``facecolors``
      fz                font size of the replacement texts in total cells
      fmt               currently unused (@TODO: use fmt)
      show_null_values  text for empty inner cells: 0 -> empty string,
                        1 -> "0", anything else -> zero count and zero percent

    returns:
      (text_add, text_del): a list of dicts with keys ``x``, ``y``, ``text``
      and ``kw`` describing texts to create, and a list of text artists to
      remove.
    """
    text_add = []
    text_del = []
    cell_val = array_df[lin][col]
    tot_all = array_df[-1][-1]
    per = (float(cell_val) / tot_all) * 100

    # Index of the totals line/column; the same index is used for both
    # directions, i.e. the matrix is treated as square.
    last = len(array_df[:, col]) - 1
    in_last_col = col == last
    in_last_lin = lin == last

    if in_last_col or in_last_lin:
        # totals and percentages
        if cell_val != 0:
            if in_last_col and in_last_lin:
                # grand total: correct = sum of the main diagonal (totals excluded)
                tot_rig = 0
                for i in range(array_df.shape[0] - 1):
                    tot_rig += array_df[i][i]
            elif in_last_col:
                tot_rig = array_df[lin][lin]
            else:
                tot_rig = array_df[col][col]
            per_ok = (float(tot_rig) / cell_val) * 100
            per_err = 100 - per_ok
        else:
            per_ok = per_err = 0

        per_ok_s = "100%" if per_ok == 100 else "%.2f%%" % (per_ok)

        # the original annotation is replaced by the three texts built below
        text_del.append(oText)

        # public accessor instead of the private _x / _y attributes
        x, y = oText.get_position()

        font_prop = fm.FontProperties(weight="bold", size=fz)
        text_kwargs = dict(
            color="w",
            ha="center",
            va="center",
            gid="sum",
            fontproperties=font_prop,
        )
        # (text, y position, colour): count in white, percent correct in
        # green, percent wrong in red
        stacked = [
            ("%d" % (cell_val), y - 0.3, "w"),
            (per_ok_s, y, "g"),
            ("%.2f%%" % (per_err), y + 0.3, "r"),
        ]
        for text, y_pos, color in stacked:
            text_add.append(
                dict(x=x, y=y_pos, text=text, kw=dict(text_kwargs, color=color))
            )

        # background colour of total cells; the grand-total corner is darker
        if in_last_col and in_last_lin:
            facecolors[posi] = [0.17, 0.20, 0.17, 1.0]
        else:
            facecolors[posi] = [0.27, 0.30, 0.27, 1.0]

    else:
        if per > 0:
            txt = "%s\n%.2f%%" % (cell_val, per)
        elif show_null_values == 0:
            txt = ""
        elif show_null_values == 1:
            txt = "0"
        else:
            txt = "0\n0.0%"
        oText.set_text(txt)

        if col == lin:
            # main diagonal: white text on a green background
            oText.set_color("w")
            facecolors[posi] = [0.35, 0.8, 0.55, 1.0]
        else:
            oText.set_color("r")

    return text_add, text_del


def get_new_fig(fn, figsize=[9, 9]):
    """Return a ``(figure, axes)`` pair for figure ``fn`` with the axes cleared.

    ``fn`` and ``figsize`` are forwarded to ``plt.figure``; the figure's
    current axes are then fetched and wiped before being handed back.
    """
    figure = plt.figure(num=fn, figsize=figsize)
    axes = figure.gca()
    axes.cla()  # drop anything drawn on these axes earlier
    return figure, axes

def insert_totals(df_cm):
    """Add a ``sum_lin`` totals column and a ``sum_col`` totals line to ``df_cm``.

    The frame is modified in place and nothing is returned. The last entry of
    the ``sum_col`` line is the grand total (sum of all line totals).
    """
    # per-column and per-line totals, both taken before the frame is widened
    col_totals = [df_cm[label].sum() for label in df_cm.columns]
    lin_totals = [row.sum() for _, row in df_cm.iterrows()]

    df_cm["sum_lin"] = lin_totals
    df_cm.loc["sum_col"] = col_totals + [np.sum(lin_totals)]
    
def pp_matrix(
    df_cm,
    annot=True,
    cmap="Oranges",
    fmt=".2f",
    fz=11,
    lw=0.5,
    cbar=False,
    figsize=[8, 8],
    show_null_values=0,
    pred_val_axis="y",
    title="Confusion matrix",
    file=''
):
    """
    Plot a confusion matrix as an annotated heatmap with a totals line and column.

    params:
      df_cm             dataframe (pandas) without totals; it is transposed
                        when predictions go on the y axis, so pass it with
                        actual values in lines and predicted values in columns.
                        The caller's frame is not modified.
      annot             print text in each cell
      cmap              colormap name passed to seaborn (e.g. Oranges, Blues)
      fmt               annotation format string passed to seaborn
      fz                fontsize
      lw                linewidth
      cbar              show the colour bar
      figsize           figure size forwarded to get_new_fig
      show_null_values  how empty cells are rendered (see
                        configcell_text_and_colors)
      pred_val_axis     where to show the prediction values (x or y axis)
                        'col' or 'x': predicted values in columns (x axis)
                        'lin' or 'y': predicted values in lines   (y axis)
      title             plot title
      file              path to save the figure to; nothing is saved when empty
    """
    # Work on a private copy: insert_totals modifies its argument in place and
    # would otherwise add the totals to the caller's frame in the 'x' case.
    if pred_val_axis in ("col", "x"):
        xlbl = "Predicted"
        ylbl = "Actual"
        df_cm = df_cm.copy()
    else:
        xlbl = "Actual"
        ylbl = "Predicted"
        df_cm = df_cm.T.copy()

    # add the totals column and line
    insert_totals(df_cm)

    # always draw into the same named figure
    fig, ax1 = get_new_fig("Conf matrix default", figsize)

    ax = sn.heatmap(
        df_cm,
        annot=annot,
        annot_kws={"size": fz},
        linewidths=lw,
        ax=ax1,
        cbar=cbar,
        cmap=cmap,
        linecolor="w",
        fmt=fmt,
    )

    # set ticklabels rotation
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=10)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=25, fontsize=10)

    # Hide the tick marks. The tick lines are hidden through their artists;
    # assigning tick1On / tick2On has no effect on recent matplotlib releases.
    for axis in (ax.xaxis, ax.yaxis):
        for tick in axis.get_major_ticks():
            tick.tick1line.set_visible(False)
            tick.tick2line.set_visible(False)

    # face colours of the heatmap cells; configcell_text_and_colors overwrites
    # entries of this array
    quadmesh = ax.findobj(QuadMesh)[0]
    facecolors = quadmesh.get_facecolors()

    # walk the cell annotations; posi is the running index of the annotation,
    # used as the index into facecolors
    array_df = np.array(df_cm.to_records(index=False).tolist())
    text_add = []
    text_del = []
    for posi, t in enumerate(ax.texts):
        # annotation positions are offset by 0.5 from the integer cell indices
        pos = np.array(t.get_position()) - [0.5, 0.5]
        lin = int(pos[1])
        col = int(pos[0])

        new_texts, old_texts = configcell_text_and_colors(
            array_df, lin, col, t, facecolors, posi, fz, fmt, show_null_values
        )
        text_add.extend(new_texts)
        text_del.extend(old_texts)

    # remove the old ones
    for item in text_del:
        item.remove()
    # append the new ones
    for item in text_add:
        ax.text(item["x"], item["y"], item["text"], **item["kw"])

    # titles and legends
    ax.set_title(title)
    ax.set_xlabel(xlbl)
    ax.set_ylabel(ylbl)
    plt.tight_layout()  # set layout slim

    # Save before showing: some backends release the figure once show() returns.
    if file:
        fig.savefig(file)
    plt.show()