# KFold cross-validation for regression and classification

## Exercise 2: Model selection for classification

In [None]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# Import custom functions for the machine learning pipeline
from machine_learning_pipeline import split_dataset, KFold_split, evaluate, model_selection, best_model_evaluation, basic_pipeline, pipeline_with_KFold

### 2.1 Creating your own unbalanced dataset

In [None]:
def generate_binary_dataset(ratio, n_samples=10000, seed=264):
    """Create a synthetic two-class dataset with two features.

    Args:
        ratio: Weight given to the first class; it is passed to
            ``make_classification`` as the only entry of ``weights``.
        n_samples: Number of samples to generate.
        seed: Value forwarded as ``random_state`` so the dataset is
            reproducible.

    Returns:
        The ``(X, Y)`` pair produced by ``make_classification``.
    """
    generation_options = dict(
        n_samples=n_samples,
        n_classes=2,
        n_features=2,
        # No redundant or repeated features
        n_redundant=0,
        n_repeated=0,
        weights=[ratio],
        # flip_y=0: do not randomly reassign any label
        flip_y=0,
        random_state=seed,
    )
    features, labels = make_classification(**generation_options)
    return features, labels

def plot_confusion_matrix(confusion_matrix, ax=None):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        confusion_matrix: Matrix of counts to display. It must expose
            ``.round`` and hold integer values, since the cells are
            annotated with the ``'d'`` format.
        ax: Axes to draw on. When ``None``, seaborn draws on the
            currently active Axes.

    Returns:
        None. The plot is drawn as a side effect.
    """
    # sns.heatmap returns the Axes it actually drew on, so the title is set
    # on that Axes. Using plt.title here would title the *current* Axes,
    # which is not the target Axes when plotting into a grid of subplots.
    target_ax = sns.heatmap(
        data=confusion_matrix.round(2), annot=True, fmt='d',
        cmap=sns.color_palette("RdBu_r", 1000), ax=ax
    )
    target_ax.set_title("Confusion matrix")
    # Layout is adjusted on the current figure, as before.
    plt.tight_layout()
    
def plot_scores(ratios, test_accs, test_f1_scores):
    """Plot test accuracy and test f1 score against the class ratio.

    Args:
        ratios: x-axis values (ratio of 1st class instances).
        test_accs: Accuracy values, one per entry of ``ratios``.
        test_f1_scores: f1 score values, one per entry of ``ratios``.

    Returns:
        The ``(fig, ax)`` pair holding the plot. The figure is also shown.
    """
    fig, ax = plt.subplots(tight_layout=True)

    # Draw both curves on the same Axes, accuracy first
    curves = (
        (test_accs, "Test accuracy"),
        (test_f1_scores, "Test f1 scores"),
    )
    for scores, curve_label in curves:
        ax.plot(ratios, scores, label=curve_label)

    ax.set(xlabel="Ratio of 1st class instances", ylabel="Score")
    fig.suptitle("Comparison of accuracy and f1 score metrics on imbalanced datasets")
    fig.legend()
    plt.show()
    return fig, ax

In [None]:
# Class ratios to sweep, from mildly to heavily unbalanced
ratios = [0.6, 0.75, 0.9, 0.95, 0.98, 0.99]

# Model selection criterion: accuracy, to be maximised
perf = {"metric": accuracy_score, "minimize": False}

# Candidate models: one K-NN classifier per odd n_neighbors in 1..11
hyper_parameters = [{"n_neighbors": k} for k in range(1, 12, 2)]
n_models = len(hyper_parameters)
model_classes = [KNeighborsClassifier for _ in range(n_models)]

# Per-ratio results, filled in the same order as `ratios`
summaries, test_accs, test_f1_scores, test_confusion_matrices = [], [], [], []

for ratio in ratios:
    print(" ========== Current ratio: ", ratio)
    X, Y = generate_binary_dataset(ratio)

    # Select the best K-NN model for this ratio using the accuracy score
    models, i_best, best_model, summary = basic_pipeline(
        X, Y, model_classes, hyper_parameters, perf
    )

    # Keep the summary containing info about the entire pipeline
    summaries.append(summary)

    # Score the selected model on the test split with several metrics
    X_test, Y_test = summary["X_test"], summary["Y_test"]
    test_accs.append(evaluate(best_model, X_test, Y_test, accuracy_score))
    test_f1_scores.append(evaluate(best_model, X_test, Y_test, f1_score))
    test_confusion_matrices.append(
        evaluate(best_model, X_test, Y_test, confusion_matrix)
    )

In [None]:
# One confusion-matrix heatmap per class ratio, on a 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3)
for ax, matrix in zip(axes.flat, test_confusion_matrices):
    plot_confusion_matrix(matrix, ax)
plt.show()

# Accuracy and f1 score as functions of the class ratio
fig, ax = plot_scores(ratios, test_accs, test_f1_scores)

Let us focus on the confusion matrices first. With a binary dataset, the confusion matrix contains the true negatives in the upper-left square, the false positives in the upper-right square, the false negatives in the bottom-left square and the true positives in the bottom-right square, where the "positive" class is attributed to the minority class. Equivalently, the rows of the confusion matrix represent the actual class of each sample whereas the columns represent their predicted class.

Now if we look at the different confusion matrices, we notice that when the data is close to balanced, there is a rough symmetry between the true positives and the true negatives, and likewise between the false positives and the false negatives. But the more unbalanced the data, the more this symmetry collapses: true negatives (the correctly classified samples from the dominant class) converge to the total number of samples, false positives converge to 0, and true positives become less and less prevalent until there are more false negatives than true positives. This shows that for the very unbalanced datasets, our $K$-NN model totally failed to capture the underlying structure of the minority class.

If we consider the accuracy metric, it only captured the information that true negatives massively dominate all other categories in the presence of strong data imbalance, which makes for an overall increasing accuracy as the imbalance in the data rises. In other words, the accuracy metric is not suitable to correctly assess the performance of a model when the data is unbalanced.

### 2.2 Using a predefined dataset

In [None]:
def load_custom_unbalanced_dataset(filename='custom_unbalanced_dataset.pickle'):
    """Load an unbalanced binary dataset from a pickle file.

    Args:
        filename: Path of the pickle file to read. The file must contain a
            pickled pair that unpacks into ``(X, Y)``.

    Returns:
        The ``(X, Y)`` pair stored in the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # Open the path given by the caller (previously the default path was
    # hard-coded here and the `filename` argument was silently ignored).
    with open(filename, 'rb') as unbalanced_dataset:
        X, Y = pickle.load(unbalanced_dataset)
    return X, Y

# Load (X, Y) from 'custom_unbalanced_dataset.pickle'; both are used by the
# model-selection cell that follows.
X, Y = load_custom_unbalanced_dataset()

In [None]:
# Model selection criterion: f1 score, to be maximised
perf = {"metric": f1_score, "minimize": False}

# Candidate models, each paired with its hyper-parameters
# (an empty dict passes no hyper-parameters for that model)
candidates = [
    (KNeighborsClassifier, {"n_neighbors": 9}),
    (KNeighborsClassifier, {"n_neighbors": 19}),
    (DecisionTreeClassifier, {}),
    (LogisticRegression, {}),
]
model_classes = [model_class for model_class, _ in candidates]
hyper_parameters = [params for _, params in candidates]

# Select the best candidate (K-NN, decision tree or logistic regression)
# with the KFold pipeline (k=10), using the f1 score
models, i_best, best_model, summary = pipeline_with_KFold(
    X, Y, model_classes, hyper_parameters, perf, k=10
)

In [None]:
# Score the selected model on the test split with several metrics
X_test, Y_test = summary["X_test"], summary["Y_test"]
test_acc = evaluate(best_model, X_test, Y_test, accuracy_score)
test_f1_score = evaluate(best_model, X_test, Y_test, f1_score)
test_confusion_matrix = evaluate(best_model, X_test, Y_test, confusion_matrix)

# Show the confusion matrix, then report the scalar scores
plot_confusion_matrix(test_confusion_matrix)
print("Selected model test performances using different metrics:")
print("Accuracy:", test_acc)
print("f1 score:", test_f1_score)

To perform cross-validation, we can choose the F1-score metric, which takes into account both precision and recall. Precision is the fraction of predicted positives that are actually positive, $TP / (TP + FP)$. Recall is the fraction of actual positives that are correctly predicted, $TP / (TP + FN)$. In the $K$-NN example above, we can see that this metric reduces significantly as the imbalance in the data increases, so it is more suitable than the accuracy metric in our case, since we are dealing once again with a very unbalanced dataset (unbalance ratio of 0.9/0.1).