# KFold cross-validation for regression and classification

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, confusion_matrix

## Exercise 1: Model selection for regression

In [None]:
def KFold_split(X, Y, num_folds, seed=264):
    """
    Shuffle X and Y and split them into k=num_folds train/validation folds.

    Args:
        X: Samples. Must support indexing with an integer index array
            (e.g. a NumPy array).
        Y: Targets aligned with X along the first axis; indexed the same
            way as X.
        num_folds: Number of folds, forwarded to KFold as n_splits.
        seed: Random state used by KFold for the shuffle.

    Returns:
        A tuple of four lists, each of length num_folds:
        (X_train_folds, X_val_folds, Y_train_folds, Y_val_folds).
        Entry k of each list holds the corresponding part of the k-th split.
    """
    KFold_splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    X_train_folds = []
    X_val_folds = []
    Y_train_folds = []
    Y_val_folds = []
    for (kth_fold_train_idxs, kth_fold_val_idxs) in KFold_splitter.split(X, Y):
        # The same index arrays select matching rows of X and Y, so samples
        # and targets stay aligned inside every fold.
        X_train_folds.append(X[kth_fold_train_idxs])
        X_val_folds.append(X[kth_fold_val_idxs])
        Y_train_folds.append(Y[kth_fold_train_idxs])
        Y_val_folds.append(Y[kth_fold_val_idxs])
    return X_train_folds, X_val_folds, Y_train_folds, Y_val_folds

## Exercise 2.1: Choosing the right metrics when dealing with unbalanced data

In [None]:
def generate_binary_dataset(ratio, n_samples=10000, seed=264):
    """Generate a two-class, two-feature dataset with a chosen class ratio.

    Args:
        ratio: Class weight handed to make_classification as the single
            entry of its `weights` list.
        n_samples: Number of samples to generate.
        seed: Random state forwarded to make_classification.

    Returns:
        The (X, Y) pair produced by make_classification.
    """
    # No redundant or repeated features and no label flipping.
    settings = dict(
        n_samples=n_samples,
        n_classes=2,
        n_features=2,
        n_redundant=0,
        n_repeated=0,
        weights=[ratio],
        flip_y=0,
        random_state=seed,
    )
    features, labels = make_classification(**settings)
    return features, labels

def plot_confusion_matrix(confusion_matrix, ax=None):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        confusion_matrix: Matrix of counts to display. It must provide a
            `.round()` method (e.g. a NumPy array); cells are annotated
            with the integer format 'd'.
        ax: Axes to draw on. When None, seaborn draws on the current Axes.
    """
    # sns.heatmap returns the Axes it drew on. Title and layout are applied
    # through that Axes so they target the caller's `ax` rather than whatever
    # Axes/figure pyplot currently considers active.
    heatmap_ax = sns.heatmap(
        data=confusion_matrix.round(2), annot=True, fmt='d',
        cmap=sns.color_palette("RdBu_r", 1000), ax=ax
    )
    heatmap_ax.set_title("Confusion matrix")
    heatmap_ax.figure.tight_layout()

def plot_scores(ratios, test_accs, test_f1_scores):
    """Plot test accuracy and test f1 score against the class ratio.

    Args:
        ratios: X-axis values, shared by both curves.
        test_accs: Accuracy values, one per entry of `ratios`.
        test_f1_scores: F1 score values, one per entry of `ratios`.

    Returns:
        The (fig, ax) pair of the created plot; the figure is also shown.
    """
    figure, axes = plt.subplots(tight_layout=True)

    # Both curves share the same x values; only the series and label differ.
    curves = (
        (test_accs, "Test accuracy"),
        (test_f1_scores, "Test f1 scores"),
    )
    for scores, curve_label in curves:
        axes.plot(ratios, scores, label=curve_label)

    axes.set_xlabel("Ratio of 1st class instances")
    axes.set_ylabel("Score")
    figure.suptitle("Comparison of accuracy and f1 score metrics on imbalanced datasets")
    figure.legend()
    plt.show()
    return figure, axes

## Exercise 2.2: KFold cross-validation for classification on unbalanced data

In [None]:
def load_custom_unbalanced_dataset(filename='custom_unbalanced_dataset.pickle'):
    """Load an unbalanced binary dataset from a pickle file.

    Args:
        filename: Path of the pickle file to read. The file must contain
            a two-item object that unpacks into (X, Y).

    Returns:
        The (X, Y) pair stored in the file.
    """
    # NOTE: pickle.load can execute arbitrary code while deserializing;
    # only call this on files from a trusted source.
    with open(filename, 'rb') as unbalanced_dataset:
        X, Y = pickle.load(unbalanced_dataset)
    return X, Y