In [11]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score, log_loss, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from keras.layers import TextVectorization
from keras.layers import Embedding

### K-Fold Cross Validation

Here, you can find the implementation of the K-Fold Cross Validation.

The **parameters** of the function are:

- *k_folds* : number of folds
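- *feature* : input samples on which to perform the CV (must support indexing with arrays of integer positions, e.g. a NumPy array)
- *label* : class labels of the samples, used to stratify the folds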
- *neural_network* : a Neural Network architecture
- *hyper_params* : a list of dictionaries, one per hyperparameter combination to try; each dictionary must contain at least the keys `batch_size` and `epoch` (the combinations need to be consistent with the NN architecture layers and properties)

In [None]:
def kfoldCrossValidation(k_folds, feature, label, neural_network, hyper_params):
    """Run a stratified K-fold cross validation for each hyperparameter combination.

    For every combination in ``hyper_params`` the data is split into ``k_folds``
    stratified folds (shuffled, ``random_state = 19``) and ``neural_network`` is
    fitted on each training split, with the held-out split passed to ``fit`` as
    ``validation_data``.

    Parameters
    ----------
    k_folds : int
        Number of folds.
    feature : array-like
        Input samples. pandas objects are converted to NumPy arrays so that the
        fold indices select rows by position.
    label : array-like
        Class labels, used both as targets and to stratify the folds.
    neural_network : object
        Model exposing ``fit(x, y, batch_size, epochs, validation_data)``
        (presumably a compiled Keras model -- TODO confirm with the caller).
        If it also exposes ``get_weights`` / ``set_weights``, the weights it has
        on entry are restored before every fold.
    hyper_params : iterable of dict
        Each dict must provide the keys ``'batch_size'`` and ``'epoch'``.

    Returns
    -------
    dict
        Currently always empty: the validation step that should populate it is
        still a placeholder.
    """

    # Stratified K-fold Cross Validation
    stratified_kfold = StratifiedKFold(n_splits = k_folds, random_state = 19, shuffle = True)

    # The split yields integer *positions*. Indexing a pandas Series/DataFrame
    # with them would select by label instead, so fall back to plain arrays.
    if isinstance(feature, (pd.Series, pd.DataFrame)):
        feature = feature.to_numpy()
    if isinstance(label, (pd.Series, pd.DataFrame)):
        label = label.to_numpy()

    # Snapshot of the starting weights. Without restoring it, every fold would
    # continue training from weights that have already seen its validation rows.
    # An empty snapshot (e.g. a model that is not built yet) cannot be restored
    # and is skipped. Optimizer state is NOT reset by this; rebuild the model in
    # the placeholder below if fully independent folds are required.
    can_reset = hasattr(neural_network, 'get_weights') and hasattr(neural_network, 'set_weights')
    initial_weights = neural_network.get_weights() if can_reset else None

    results = {}

    for hyper_params_combination in hyper_params:

        print(hyper_params_combination)

        # List with evaluation metric (performance for each iteration)
        evaluation_metric = []

        # Neural Network architecture with hyperparameters combination
        # ... #

        # Splitting in training and validation set
        for train, val in stratified_kfold.split(feature, label):

            # Start every fold from the same initial weights
            if initial_weights:
                neural_network.set_weights(initial_weights)

            # Training (fit Neural Network)
            neural_network.fit(
                x = feature[train], y = label[train],
                batch_size = hyper_params_combination['batch_size'],
                epochs = hyper_params_combination['epoch'],
                validation_data = (feature[val], label[val])
            )

            # Validation
            # ... #

    return results

In [None]:
# Function which performs both final training and testing
def trainingAndTesting(n_estimators, training_set, testing_set = None):
    """Fit a Random Forest on the training set and evaluate it on the testing set.

    Parameters
    ----------
    n_estimators : int
        Number of trees passed to ``RandomForestClassifier``.
    training_set : pandas.DataFrame
        Training data; the column ``'clicker'`` is the label and every other
        column is used as a feature.
    testing_set : pandas.DataFrame, optional
        Testing data with the same layout. When omitted, the module-level
        variable ``testing_set`` is used (the behaviour of earlier versions of
        this function).

    Returns
    -------
    dict
        ``'recall'`` and ``'f1_score'`` (rounded to 3 decimals), ``'log_loss'``
        (rounded to 3 decimals), ``'deciles_data'`` (per-decile frequency and
        ratio of the predicted probability of the true class, separately for
        observations with label 1 and label 0) and ``'ROC'``, the list
        ``[fp_ratio, tp_ratio, thresholds, roc_auc]``.

    Raises
    ------
    NameError
        If ``testing_set`` is neither passed nor defined at module level.
    """

    if testing_set is None:
        # Backward compatibility: older callers relied on a global `testing_set`.
        testing_set = globals().get('testing_set')
        if testing_set is None:
            raise NameError("name 'testing_set' is not defined")

    # Label 'clicker' for both training and testing set
    label_train = training_set['clicker'].to_numpy()
    label_test = testing_set['clicker'].to_numpy()

    # Features: the scaler is fitted on the training set ONLY and then applied to
    # the testing set, so both live on the same scale. Fitting a second scaler on
    # the test data would shift the features relative to the thresholds the trees
    # have learned. Test values may therefore fall slightly outside [0, 1].
    scaler = MinMaxScaler()
    feature_train = scaler.fit_transform(training_set.drop('clicker', axis = 1).to_numpy())
    feature_test = scaler.transform(testing_set.drop('clicker', axis = 1).to_numpy())

    # Training
    rf_classifier = RandomForestClassifier(
        criterion = 'entropy', n_estimators = n_estimators,
        class_weight = 'balanced_subsample', random_state = 19
    )
    rf_classifier.fit(feature_train, label_train)

    # Testing
    test_pred_prob = rf_classifier.predict_proba(feature_test)
    test_pred_class = rf_classifier.predict(feature_test)

    testing_data = pd.DataFrame({

        'actual': label_test.tolist(),
        'pred_class': test_pred_class,
        'pred_prob_neg': test_pred_prob[:, 0],
        'pred_prob_pos': test_pred_prob[:, 1]

    })

    # Dataframe with only positive and negative class observations
    positive_class = testing_data[testing_data['actual'] == 1].copy()
    negative_class = testing_data[testing_data['actual'] == 0].copy()

    # Deciles of the predicted probabilities.
    # The bins are left-closed ([0.9, 1.0) is the last one), so a probability of
    # exactly 1.0 would fall outside every bin and be silently dropped. Clip to
    # the largest float below 1.0 so that it is counted in the last decile.
    deciles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    just_below_one = np.nextafter(1.0, 0.0)
    positive_class['deciles'] = pd.cut(
        positive_class['pred_prob_pos'].clip(upper = just_below_one),
        deciles, include_lowest = True, right = False
    )
    negative_class['deciles'] = pd.cut(
        negative_class['pred_prob_neg'].clip(upper = just_below_one),
        deciles, include_lowest = True, right = False
    )

    # Deciles data with frequency and ratio of observation of each class in the deciles
    deciles_data = pd.DataFrame({
        'pos': positive_class['deciles'].value_counts(sort = False),
        'neg': negative_class['deciles'].value_counts(sort = False)
    }).reset_index()

    deciles_data['ratio_pos'] = (deciles_data['pos'] / len(positive_class)).round(3)
    deciles_data['ratio_neg'] = (deciles_data['neg'] / len(negative_class)).round(3)

    deciles_data.columns = ['decile_interval', 'freq_pos', 'freq_neg', 'ratio_pos', 'ratio_neg']

    # Performance metrics computation
    logloss = round(log_loss(y_true = testing_data['actual'],
                             y_pred = np.column_stack((testing_data['pred_prob_neg'], testing_data['pred_prob_pos']))), 3)

    # ROC / AUC need the continuous score of the positive class; hard class
    # predictions would collapse the curve to a single operating point.
    fp_ratio, tp_ratio, thresholds = roc_curve(testing_data['actual'], testing_data['pred_prob_pos'])
    roc_auc = roc_auc_score(testing_data['actual'], testing_data['pred_prob_pos'])

    return {
        'recall': round(recall_score(testing_data['actual'], testing_data['pred_class']), 3),
        'f1_score': round(f1_score(testing_data['actual'], testing_data['pred_class']), 3),
        'deciles_data': deciles_data,
        'log_loss': logloss,
        'ROC': [ fp_ratio, tp_ratio, thresholds, roc_auc ]
    }