# Import packages

In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier

# Exercise 1: Template for model selection of a Ridge model over a list of hyper-parameter instances with KFold cross-validation

In [None]:
### Split+shuffle X and Y into k=num_folds different folds:
def KFold_split(X, Y, num_folds, seed):
    """Shuffle and split X and Y into num_folds train/validation folds.

    Args:
        X: Samples; must support indexing with an integer index array
            (e.g. a NumPy array).
        Y: Targets aligned with X; must support the same indexing.
        num_folds: Number of folds, passed to KFold as n_splits.
        seed: Random state of the shuffle, for reproducibility.

    Returns:
        Four lists with one entry per fold, in this order:
        X_train_folds, X_val_folds, Y_train_folds, Y_val_folds.
    """
    KFold_splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    X_train_folds = []
    X_val_folds = []
    Y_train_folds = []
    Y_val_folds = []
    for (kth_fold_train_idxs, kth_fold_val_idxs) in KFold_splitter.split(X, Y):
        # The same index arrays are applied to X and Y so samples and targets stay aligned:
        X_train_folds.append(X[kth_fold_train_idxs])
        X_val_folds.append(X[kth_fold_val_idxs])
        Y_train_folds.append(Y[kth_fold_train_idxs])
        Y_val_folds.append(Y[kth_fold_val_idxs])
    return X_train_folds, X_val_folds, Y_train_folds, Y_val_folds

In [None]:
### Select a Ridge model on a list of hyper-parameters instances, via Kfold cross-validation:
def KFold_model_selection(X, Y, hyper_parameters_instances, num_folds, seed, test_size=0.2):
    """Select a Ridge model among hyper-parameter instances via KFold cross-validation.

    A test set is first held out. The remaining data is split into folds and
    every hyper-parameter instance is cross-validated on them. The instance
    with the smallest mean validation MSE is then refitted on the whole
    train+validation set and evaluated once on the held-out test set.
    Progress and results are printed; nothing is returned.

    Args:
        X: Samples, indexable by an integer index array.
        Y: Targets aligned with X.
        hyper_parameters_instances: Sequence of candidates. Each candidate is
            assumed to unpack as a (degree, regularization) pair, matching the
            parameters of perform_KFold_CV and assess_Ridge -- TODO confirm
            against the caller.
        num_folds: Number of cross-validation folds.
        seed: Random state used for both the test split and the fold shuffle.
        test_size: Fraction of the data held out as the test set.

    Raises:
        ValueError: If hyper_parameters_instances is empty.
    """
    if len(hyper_parameters_instances) == 0:
        raise ValueError("hyper_parameters_instances must contain at least one instance")
    # Extract a test set:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)
    # Extract train and validation folds (the test set is never seen during selection):
    X_train_folds, X_val_folds, Y_train_folds, Y_val_folds = KFold_split(
        X_train_val, Y_train_val, num_folds, seed)
    # For each hyper-parameter instance, do KFold cross validation:
    mean_val_MSEs = []
    for hyper_parameters_instance in hyper_parameters_instances:
        print("\nNow preprocessing hyper-parameter instance", hyper_parameters_instance)
        degree, regularization = hyper_parameters_instance
        mean_val_MSE = perform_KFold_CV(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds,
                                        degree, regularization)
        print("Mean validation MSE:", mean_val_MSE)
        mean_val_MSEs.append(mean_val_MSE)
    # The hyper-parameter instance with the smallest mean validation MSE is our model of choice:
    best_instance_idx = int(np.argmin(mean_val_MSEs))
    best_hyper_parameters_instance = hyper_parameters_instances[best_instance_idx]
    print("\n\nBest hyper-parameter instance:", best_hyper_parameters_instance)
    # Train the best model on the whole train set then evaluate it on the test set:
    best_degree, best_regularization = best_hyper_parameters_instance
    best_model_test_MSE = assess_Ridge(X_train_val, X_test, Y_train_val, Y_test,
                                       best_degree, best_regularization)
    print("Test MSE:", best_model_test_MSE)

In [None]:
### KFold cross-validation of a Ridge model with given hyper-parameters:
def perform_KFold_CV(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds, degree, regularization):
    """Cross-validate a Ridge model with fixed hyper-parameters over the given folds.

    One surrogate model is fitted and evaluated per fold through assess_Ridge,
    and each fold's validation MSE is printed.

    Args:
        X_train_folds: Training samples, one entry per fold.
        X_val_folds: Validation samples, one entry per fold.
        Y_train_folds: Training targets, one entry per fold.
        Y_val_folds: Validation targets, one entry per fold.
        degree: Polynomial degree forwarded to assess_Ridge.
        regularization: Ridge penalty forwarded to assess_Ridge.

    Returns:
        The mean of the per-fold validation MSEs.

    Raises:
        ValueError: If no fold is provided.
    """
    num_folds = len(X_val_folds)
    if num_folds == 0:
        raise ValueError("at least one fold is required")
    val_fold_MSEs = []
    folds = zip(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds)
    # For each fold, assess a surrogate model with fixed hyper-parameters:
    for cmpt, (X_train_fold, X_val_fold, Y_train_fold, Y_val_fold) in enumerate(folds, start=1):
        val_fold_MSE = assess_Ridge(X_train_fold, X_val_fold, Y_train_fold, Y_val_fold,
                                    degree, regularization)
        print("Surrogate model", str(cmpt) + "/" + str(num_folds), "validation MSE:", val_fold_MSE)
        val_fold_MSEs.append(val_fold_MSE)
    # Compute the mean validation MSE between all the folds:
    mean_val_MSE = float(np.mean(val_fold_MSEs))
    return mean_val_MSE

In [None]:
### Fit and evaluate a Ridge model with given hyper-parameters:
def assess_Ridge(X_train, X_test, Y_train, Y_test, degree, regularization):
    """Fit a polynomial Ridge model on the train data and return its MSE on the test data.

    Args:
        X_train: Training samples.
        X_test: Evaluation samples.
        Y_train: Training targets.
        Y_test: Evaluation targets.
        degree: Degree of the polynomial feature expansion.
        regularization: Ridge penalty, passed to Ridge as alpha.

    Returns:
        The mean squared error between Y_test and the model predictions.
    """
    # Build the polynomial features; the transformer is fitted on the train
    # data only and then reused as-is on the test data:
    poly_features = PolynomialFeatures(degree=degree)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)
    # Fit the polynomial features with a Ridge model:
    model = Ridge(alpha=regularization)
    model.fit(X_train_poly, Y_train)
    # Evaluate the Ridge model on the test set:
    Y_test_pred = model.predict(X_test_poly)
    test_MSE = mean_squared_error(Y_test, Y_test_pred)
    return test_MSE

# Exercise 2 - 1): Choosing the right metrics when dealing with unbalanced data

In [None]:
# Fix random seed for reproducibility:
seed = 666
# Set up seaborn (for heatmaps):
sns.set()

### Train and evaluate a K-NN with K=10 on randomly generated binary datasets, with different ratios between
### the two classes. Use both accuracy and F1 score metrics, plus the confusion matrix:
ratios = [0.6, 0.75, 0.9, 0.95, 0.98, 0.99]
test_accuracies = []
test_f1_scores = []
test_confusion_matrices = []
for ratio in ratios:
    # weights=[ratio] fixes the proportion of class 0; class 1 receives the remainder,
    # so class 1 is the minority class for every ratio used here:
    X, Y = make_classification(n_samples=10000,
                               n_classes=2,
                               n_features=2,
                               n_redundant=0,
                               n_repeated=0,
                               weights=[ratio],
                               flip_y=0,
                               random_state=seed)

    # Hold out a test set, then fit and evaluate the K-NN model:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)
    model = KNeighborsClassifier(n_neighbors=10)
    model.fit(X_train, Y_train)
    Y_test_pred = model.predict(X_test)

    test_accuracies.append(accuracy_score(Y_test, Y_test_pred))
    # f1_score scores the positive label (1 by default), i.e. the minority class:
    test_f1_scores.append(f1_score(Y_test, Y_test_pred))
    # labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the test predictions:
    test_confusion_matrices.append(confusion_matrix(Y_test, Y_test_pred, labels=[0, 1]))

# Create the figure once; every subplot below is drawn on it:
plt.figure(1, figsize=(15, 12))
for idx, (test_confusion_matrix, ratio) in enumerate(zip(test_confusion_matrices, ratios)):
    plt.subplot(3, 3, idx + 1)
    plt.title("Confusion matrix, 1st class ratio = " + str(ratio))
    sns.heatmap(data=test_confusion_matrix.round(2), annot=True, fmt='d', cmap=sns.color_palette("RdBu_r", 1000))
plt.suptitle("Assessment of a K-NN model (K=10) on randomly generated binary datasets, with different ratios between the two classes")
# The six heatmaps fill cells 1-6 of the 3x3 grid; the metric curves go in cell 8:
plt.subplot(3, 3, 8)
plt.title("Test accuracies + test F1-scores of minority class as functions of the 1st class ratio")
plt.plot(ratios, test_accuracies, c='g')
plt.plot(ratios, test_f1_scores, c='r')
plt.legend(["Accuracy", "F1-score"], loc='best')
plt.xlabel('1st class ratio')
plt.ylabel('Quality measures')
plt.show()

# Exercise 2 - 2): Model selection with KFold cross-validation for classification on unbalanced data

In [None]:
### Split+shuffle X and Y into k=num_folds different folds:
def KFold_split(X, Y, num_folds, seed):
    """Shuffle and split X and Y into num_folds train/validation folds.

    Args:
        X: Samples; must support indexing with an integer index array.
        Y: Labels aligned with X; must support the same indexing.
        num_folds: Number of folds, passed to KFold as n_splits.
        seed: Random state of the shuffle, for reproducibility.

    Returns:
        Four lists with one entry per fold, in this order:
        X_train_folds, X_val_folds, Y_train_folds, Y_val_folds.
    """
    # NOTE(review): plain KFold does not preserve the class ratio inside each fold;
    # consider StratifiedKFold if folds end up without minority samples.
    KFold_splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    X_train_folds, X_val_folds, Y_train_folds, Y_val_folds = [], [], [], []
    for train_idxs, val_idxs in KFold_splitter.split(X, Y):
        X_train_folds.append(X[train_idxs])
        X_val_folds.append(X[val_idxs])
        Y_train_folds.append(Y[train_idxs])
        Y_val_folds.append(Y[val_idxs])
    return X_train_folds, X_val_folds, Y_train_folds, Y_val_folds

### Select a model via Kfold cross-validation:
def KFold_model_selection(X, Y, models, num_folds, seed, test_size=0.2):
    """Select a classifier among `models` via KFold cross-validation.

    A test set is first held out. Every candidate is cross-validated on the
    remaining data, and the one with the highest mean validation F1-score is
    refitted on the whole train+validation set and evaluated on the test set.
    Progress and results are printed; nothing is returned.

    The F1-score is used as the selection metric because the data is
    unbalanced -- this choice of metric is an assumption, TODO confirm.

    Args:
        X: Samples, indexable by an integer index array.
        Y: Binary labels aligned with X.
        models: Mapping from model index to display name; the indices must be
            ones that assess_model understands.
        num_folds: Number of cross-validation folds.
        seed: Random state used for both the test split and the fold shuffle.
        test_size: Fraction of the data held out as the test set.

    Raises:
        ValueError: If models is empty.
    """
    if len(models) == 0:
        raise ValueError("models must contain at least one model")
    # Extract a test set:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)
    # Extract train and validation folds:
    X_train_folds, X_val_folds, Y_train_folds, Y_val_folds = KFold_split(
        X_train_val, Y_train_val, num_folds, seed)
    # For each model, do KFold cross validation:
    mean_val_f1_scores = {}
    for model_idx, model_name in models.items():
        print("\nNow processing model", model_name)
        mean_val_f1_score = perform_KFold_CV(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds,
                                             model_idx)
        print("Mean validation F1-score:", mean_val_f1_score)
        mean_val_f1_scores[model_idx] = mean_val_f1_score
    # The model with the highest mean validation F1-score is our model of choice:
    best_model_idx = max(mean_val_f1_scores, key=mean_val_f1_scores.get)
    print("\n\nBest model:", models[best_model_idx])
    # Train the best model on the whole train set then evaluate it on the test set:
    best_model_test_f1_score = assess_model(X_train_val, X_test, Y_train_val, Y_test, best_model_idx)
    print("Test F1-score:", best_model_test_f1_score)

### KFold cross-validation of a model:
def perform_KFold_CV(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds, model_idx):
    """Cross-validate the model identified by model_idx over the given folds.

    Args:
        X_train_folds: Training samples, one entry per fold.
        X_val_folds: Validation samples, one entry per fold.
        Y_train_folds: Training labels, one entry per fold.
        Y_val_folds: Validation labels, one entry per fold.
        model_idx: Model index forwarded to assess_model.

    Returns:
        The mean of the per-fold validation F1-scores.

    Raises:
        ValueError: If no fold is provided.
    """
    num_folds = len(X_val_folds)
    if num_folds == 0:
        raise ValueError("at least one fold is required")
    val_fold_f1_scores = []
    folds = zip(X_train_folds, X_val_folds, Y_train_folds, Y_val_folds)
    # For each fold, assess a surrogate model:
    for fold_number, (X_train_fold, X_val_fold, Y_train_fold, Y_val_fold) in enumerate(folds, start=1):
        val_fold_f1_score = assess_model(X_train_fold, X_val_fold, Y_train_fold, Y_val_fold, model_idx)
        print("Surrogate model", str(fold_number) + "/" + str(num_folds),
              "validation F1-score:", val_fold_f1_score)
        val_fold_f1_scores.append(val_fold_f1_score)
    # Compute the mean validation F1-score between all the folds:
    return float(np.mean(val_fold_f1_scores))

### Fit and evaluate a model:
def assess_model(X_train, X_test, Y_train, Y_test, model_idx):
    """Fit the classifier identified by model_idx and return its F1-score on the test data.

    Args:
        X_train: Training samples.
        X_test: Evaluation samples.
        Y_train: Training labels.
        Y_test: Evaluation labels.
        model_idx: 0 for K-NN, 1 for logistic regression, 2 for a decision
            tree. Each estimator is built with its default settings --
            TODO confirm the intended hyper-parameters.

    Returns:
        The F1-score of the predictions for the positive label (1 by default
        in f1_score); assumes binary labels where 1 is the class of interest.

    Raises:
        ValueError: If model_idx is not one of the supported indices.
    """
    model_classes = {0: KNeighborsClassifier,
                     1: LogisticRegression,
                     2: DecisionTreeClassifier}
    if model_idx not in model_classes:
        raise ValueError("Unknown model index: " + repr(model_idx))
    model = model_classes[model_idx]()
    model.fit(X_train, Y_train)
    Y_test_pred = model.predict(X_test)
    return f1_score(Y_test, Y_test_pred)

In [None]:
### Model selection of a classification model on unbalanced data with KFold cross-validation:
# Load an unbalanced binary dataset.
# NOTE: pickle.load can execute arbitrary code while loading; only use a dataset file from a trusted source.
with open('custom_unbalanced_dataset.pickle', 'rb') as unbalanced_dataset:
    X, Y = pickle.load(unbalanced_dataset)
# The file is only needed for loading, so everything below runs after it is closed.
# Models to be cross-validated:
models = {0: "K-NN",
          1: "Logistic regression",
          2: "Decision Tree"}
# Select model with KFold cross-validation (use 10 folds); `seed` is the
# module-level random seed defined earlier in this file:
KFold_model_selection(X, Y, models, num_folds=10, seed=seed)