In [None]:
#First we import all packages we need
import numpy as np # Numerical computing
import pandas as pd # Dataframes
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_predict# CV function
from sklearn.metrics import make_scorer, classification_report, confusion_matrix# Metrics
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Read the cleaned data set into a DataFrame
df = pd.read_csv("data/df_cleaned.csv")
# The models need data without NaN's, so display the largest per-column NaN count
# (this should show 0 before continuing)
nan_counts_per_column = df.isna().sum()
nan_counts_per_column.max()

In [None]:
# Build the two subsets of the data set:
#   X - all variables used to evaluate the target variable
#   y - the target variable itself
# The columns removed from X are presumably the five possible target variables,
# so none of them can be used as a predictor.
target_columns = ['PO_1._R', 'PO_2._R', 'Conf._Fin', 'Finals', 'Champion']
X = df.drop(columns=target_columns)
# Enter one of the five target variables here
y = df["Champion"]
# Keep 33% of the rows aside so the final models can be tested; the fixed seed
# makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.33, random_state=72)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# All pre-processing steps and the classifier are bundled in one pipeline.
# The pipeline is needed to cross-validate the models properly: each step is
# fit only on the training part of every fold. Without the pipeline we risk
# data leakage and the cross-validation scores are not accurate.
smt = SMOTE(random_state=0)
scaler = StandardScaler()
# Seeded for reproducibility; the seed only matters when PCA uses a randomized solver
pca = PCA(n_components=10, random_state=0)
# Seed the classifier like SMOTE, so that re-running the notebook yields the
# same fitted model and the same grid-search winner
model = GradientBoostingClassifier(random_state=0)
pipe = Pipeline(
    steps=[
        ('scaler', scaler),
        ('pca', pca),
        ('smote', smt),
        ('classifier', model),
    ]
)

# Parameters of the algorithm that the grid search should optimize.
# The "classifier__" prefix addresses the pipeline step named 'classifier'.
n_estimators_grid = range(10, 250, 5)              # 10, 15, ..., 245
ccp_alpha_grid = np.power(10.0, -np.arange(1, 7))  # 1e-1 down to 1e-6
param_grid = {
    "classifier__n_estimators": n_estimators_grid,
    "classifier__ccp_alpha": ccp_alpha_grid,
}

# Grid search: every parameter combination is scored by its accuracy in a
# 10-fold cross-validation on the training data, and the best one wins
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid_search.fit(X_train, y_train)

# Store the best pipeline found by the search and show its configuration
best_model = grid_search.best_estimator_
print(best_model)

In [None]:
#Now we evaluate all relevant metrics for our best_model to compare its score to the other models

# Custom metrics for the false positive and false negative rates.
# Both assume binary labels coded as 0/1, so the mean of the predictions
# within one true class is the share of that class predicted as 1.
def fpos_rate(y_true, y_pred):
    """Return the false positive rate: the share of true 0s predicted as 1.

    Both inputs are converted to NumPy arrays so that the mask is applied by
    position; this keeps the result correct when pandas objects with a
    non-default index are passed. The result is NaN if ``y_true`` has no 0.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(y_pred[y_true == 0])


def fneg_rate(y_true, y_pred):
    """Return the false negative rate: the share of true 1s predicted as 0.

    Both inputs are converted to NumPy arrays so that the mask is applied by
    position; this keeps the result correct when pandas objects with a
    non-default index are passed. The result is NaN if ``y_true`` has no 1.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return 1 - np.mean(y_pred[y_true == 1])
# Wrap both rates as scikit-learn scorers. They are error rates (lower is
# better), so greater_is_better=False lets scikit-learn flip their sign.
fpos_scorer = make_scorer(fpos_rate, greater_is_better=False)
fneg_scorer = make_scorer(fneg_rate, greater_is_better=False)

# Run the 10-fold cross-validation that returns accuracy, FPR and FNR
nfolds = 10
cv_scoring = {"accuracy": "accuracy", "fpr": fpos_scorer, "fnr": fneg_scorer}
cv_results = cross_validate(
    best_model,
    X_train,
    y_train,
    scoring=cv_scoring,
    cv=nfolds,
    return_train_score=True,
)

# Per-fold test metrics taken from the cross-validation results
accuracy_cv = cv_results["test_accuracy"]
misclass_cv = 1 - accuracy_cv
# The FPR/FNR scorers were created with greater_is_better=False, so their
# stored scores are negated here to get the rates back
false_pos_cv = np.negative(cv_results["test_fpr"])
false_neg_cv = np.negative(cv_results["test_fnr"])

# Helper to compute a mean and the standard error thereof
def mean_and_se(x, n):
    """Return ``(mean, standard_error)`` of ``x`` after scaling it by 100.

    x: array-like of values; they are multiplied by 100, so fractions such as
       0.25 are reported as 25 (percent).
    n: number of observations used in the standard-error denominator
       (the callers pass the number of cross-validation folds).

    The standard error is the sample standard deviation (ddof=1) divided by
    sqrt(n). NumPy's default ddof=0 would understate it for the few values a
    cross-validation produces. With a single value the standard error is NaN.
    """
    # Convert first so that plain lists are scaled element-wise instead of repeated
    x_pct = 100 * np.asarray(x, dtype=float)
    return np.mean(x_pct), np.std(x_pct, ddof=1) / np.sqrt(n)
# Mean and standard error of every cross-validated metric, in one pass
(
    (misclass_mean, misclass_se),
    (accuracy_mean, accuracy_se),
    (false_pos_mean, false_pos_se),
    (false_neg_mean, false_neg_se),
) = [
    mean_and_se(fold_values, nfolds)
    for fold_values in (misclass_cv, accuracy_cv, false_pos_cv, false_neg_cv)
]

# Report the important metrics
print(f"The mean missclassification rate is {misclass_mean:.2f}% (± {misclass_se:.2f}%)")
print(f"The mean accuracy is: {accuracy_mean:.2f}% (± {accuracy_se:.2f}%)")
print(f"The mean false positive rate is {false_pos_mean:.2f}% (± {false_pos_se:.2f}%)")
print(f"The mean false negative rate is {false_neg_mean:.2f}% (± {false_neg_se:.2f}%)")

# Evaluate precision and recall with classification_report and print the
# confusion matrix, both based on out-of-fold predictions for the training data.
# cv=nfolds keeps these predictions on the same 10-fold setup as the metrics
# above; without it cross_val_predict falls back to its default of 5 folds.
ypred = cross_val_predict(best_model, X_train, y_train, cv=nfolds)
print(classification_report(y_train, ypred))
conf_matrix = confusion_matrix(y_train, ypred)
print(conf_matrix)