In [None]:
#First we import all packages we need
import numpy as np # Numerical computing
import pandas as pd # Dataframes
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score # CV function
from sklearn.metrics import make_scorer, classification_report, confusion_matrix # Metrics
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression # Logistic regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline 

In [None]:
# Load the cleaned data frame from disk
df = pd.read_csv("data/df_cleaned.csv")
# The models cannot be fitted on missing values, so display the largest
# per-column NaN count; it is expected to be 0
df.isna().sum().max()

In [None]:
# Split the data frame into predictors (X) and the target (y).
# All five playoff-outcome columns are removed from X so that none of the
# candidate targets can be used as a predictor.
target_columns = ['PO_1._R', 'PO_2._R', 'Conf._Fin', 'Finals', 'Champion']
X = df.drop(columns=target_columns)
# Choose the one target variable to model here
y = df["PO_2._R"]
# Hold out a third of the rows so the final model can be tested on unseen data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=72,
)

In [None]:
# All pre-processing steps and the classifier are bundled into one pipeline.
# Cross-validating the pipeline as a whole keeps the pre-processing inside each
# fold; doing these steps outside the pipeline risks data leakage and an
# inaccurate cross-validation estimate.
scaler = StandardScaler()
pca = PCA(n_components=10)
smt = SMOTE(random_state=0)
logistic = LogisticRegression()
pipe = Pipeline(
    steps=[
        ('scaler', scaler),
        ('pca', pca),
        ('smote', smt),
        ('classifier', logistic),
    ]
)

In [None]:
# npoly lists the polynomial degrees that are built and cross-validated
npoly = range(1,4)
# Mean cross-validation accuracy of the pipeline, one entry per degree in npoly
acc_list=[]

# Convert the target to a flat integer array once; this does not depend on the
# polynomial degree, so it is done before the loop rather than on every pass
y_train = np.asarray(y_train).astype(int).ravel()

# Build the polynomial features for every degree in npoly and cross-validate each one
for poly_degree in npoly:
    poly = PolynomialFeatures(degree=poly_degree, interaction_only=False, include_bias=False)
    X_poly = poly.fit_transform(X_train)

    # Run cross-validation. cross_val_score clones the pipeline and fits it on
    # each training fold itself, so no separate pipe.fit call is needed here.
    accuracy = cross_val_score(pipe, X_poly, y_train, scoring='accuracy', cv=10)

    # Store the mean of the cross-validation scores of this degree
    acc_list.append(accuracy.mean())

# Pick the polynomial degree with the highest mean cross-validation accuracy
best = np.argmax(acc_list)
final_model_poly = npoly[best]
print(final_model_poly)

In [None]:
# Rebuild the best model (polynomial degree chosen above) to collect its statistics
poly = PolynomialFeatures(degree=final_model_poly, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X_train)

# Fit on the polynomial features (not on the raw X_train) so that the fitted
# pipeline actually corresponds to the selected degree
pipe.fit(X_poly, y_train)


# Custom metrics for the false positive and false negative rates.
# NOTE(review): both assume binary labels coded as 0/1 — confirm for the chosen target.
def fpos_rate(y_true, y_pred):
    """Share of true-0 samples that were predicted as 1."""
    return np.mean(y_pred[np.where(y_true == 0)])


def fneg_rate(y_true, y_pred):
    """Share of true-1 samples that were predicted as 0."""
    return 1 - np.mean(y_pred[np.where(y_true == 1)])


# greater_is_better=False makes the scorers return the negated rate;
# the sign is flipped back below
fpos_scorer = make_scorer(fpos_rate, greater_is_better=False)
fneg_scorer = make_scorer(fneg_rate, greater_is_better=False)

# Estimate 10-fold cross-validation, return accuracy, FPR, FNR
nfolds = 10
cv_results = cross_validate(
    pipe,
    X_poly,
    y_train,
    return_train_score=True,
    cv=nfolds,
    scoring={"accuracy": "accuracy", "fpr": fpos_scorer, "fnr": fneg_scorer},
)

# Compute the per-fold metrics of the cross-validation
accuracy_cv = cv_results["test_accuracy"]
misclass_cv = 1 - accuracy_cv
false_pos_cv = -cv_results["test_fpr"]
false_neg_cv = -cv_results["test_fnr"]


def mean_and_se(x, n):
    """Return (mean, standard error of the mean) of x in percent, for n folds.

    Uses np.std with its default settings, divided by sqrt(n).
    """
    return (np.mean(100 * x), np.std(100 * x) / np.sqrt(n))


# Compute means and standard errors
misclass_mean, misclass_se = mean_and_se(misclass_cv, nfolds)
accuracy_mean, accuracy_se = mean_and_se(accuracy_cv, nfolds)
false_pos_mean, false_pos_se = mean_and_se(false_pos_cv, nfolds)
false_neg_mean, false_neg_se = mean_and_se(false_neg_cv, nfolds)

# Print the summary metrics
print(f"The mean missclassification rate is {misclass_mean:.2f}% (± {misclass_se:.2f}%)")
print(f"The mean accuracy is: {accuracy_mean:.2f}% (± {accuracy_se:.2f}%)")
print(f"The mean false positive rate is {false_pos_mean:.2f}% (± {false_pos_se:.2f}%)")
print(f"The mean false negative rate is {false_neg_mean:.2f}% (± {false_neg_se:.2f}%)")

# Precision/recall report and confusion matrix from out-of-fold predictions.
# These use the same polynomial features and fold count as cross_validate above,
# so all reported numbers describe the same model.
y_pred = cross_val_predict(pipe, X_poly, y_train, cv=nfolds)
class_report = classification_report(y_train, y_pred)
print(class_report)
conf_matrix = confusion_matrix(y_train, y_pred)
print(conf_matrix)

In [None]:
# Test the final model on the held-out test set for the target selected above.
# The pipeline is fitted on, and predicts from, the polynomial expansion of the
# degree chosen in cross-validation (final_model_poly); fitting on the raw
# features would evaluate a different model than the one that was selected.
poly = PolynomialFeatures(degree=final_model_poly, interaction_only=False, include_bias=False)
pipe.fit(poly.fit_transform(X_train), y_train)
# Apply the same expansion to the test features before predicting
y_pred = pipe.predict(poly.transform(X_test))
class_report = classification_report(y_test, y_pred)
print(class_report)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)