In [None]:
def cross_validate_train_data_nltkNaiveBayes(samples, labels, num_word_features, get_features, k=5):
    """
        Description: A function which applies stratified k-fold cross-validation to the training split of the
        poem dataset, trains an nltk Naive Bayes classifier on each fold, and reports the classifier's mean
        performance scores across the folds, as well as the variability in macro-F1 (standard deviation and
        range) across the folds. A one-line summary of the F1 variability is also printed.
        Inputs:
            - samples ==> a list-of-lists where each sub-list/sample is a list of tokens.
            - labels ==> a list of labels corresponding to each training sample.
            - num_word_features ==> number of most frequent words (counted on each fold's training part only)
              to use as word features. None keeps every word seen in the fold's training part.
            - get_features ==> a function (e.g. doc_features) called as get_features(tokens, word_features)
              that turns a token list into a feature set.
            - k ==> an integer representing the number of folds to iterate over for k-fold cross-validation
        Outputs:
            - a dictionary with the keys "mean_accuracy", "mean_macro_precision", "mean_macro_recall",
              "mean_macro_f1", "std_macro_f1" and "range_macro_f1", all computed across the folds.
    """
    # Per-fold metric collectors
    accuracies = []
    macro_avg_precisions = []
    macro_avg_recalls = []
    macro_avg_f1s = []

    # StratifiedKFold yields (train_indices, val_indices) pairs.
    # Shuffling reduces the impact of the sample ordering; the fixed random_state
    # keeps the folds identical between calls so results stay comparable.
    SKFGenerator = StratifiedKFold(n_splits=k, shuffle=True, random_state=3)

    for train_indices, val_indices in SKFGenerator.split(samples, labels):

        # Build (sample, label) pairs for this fold from the generated indices.
        train_set = [(samples[i], labels[i]) for i in train_indices]
        val_set = [(samples[i], labels[i]) for i in val_indices]

        # The vocabulary is built from the training part only, so nothing from the
        # validation part influences the choice of word features.
        train_tokenlists = [tokenlist for (tokenlist, _) in train_set]
        train_vocabulary_list = flatten_list_of_lists(train_tokenlists)
        all_words = nltk.FreqDist(train_vocabulary_list)

        # Iterating a FreqDist yields words in first-seen order, not by frequency,
        # so slicing list(all_words) would pick arbitrary early words. most_common()
        # returns (word, count) pairs sorted by descending count.
        word_features = [word for (word, _) in all_words.most_common(num_word_features)]

        train_featuresets = [(get_features(doc, word_features), label) for (doc, label) in train_set]
        val_featuresets = [(get_features(doc, word_features), label) for (doc, label) in val_set]

        # Train the Naive Bayes Classifier
        NBclassifier = nltk.NaiveBayesClassifier.train(train_featuresets)

        # Get the true labels and the predicted labels from the classifier
        true = [label for (_, label) in val_featuresets]
        pred = [NBclassifier.classify(features) for (features, _) in val_featuresets]

        # Accuracy for this fold.
        # NOTE(review): `accuracy` is defined outside this function; presumably
        # nltk.classify.accuracy (classifier, gold) -- confirm against the imports.
        acc = accuracy(NBclassifier, val_featuresets)
        accuracies.append(acc)

        # Macro-averaged precision, recall and f1 for this fold.
        # zero_division=0 scores an undefined metric as 0 instead of warning.
        precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average='macro', zero_division=0)
        macro_avg_precisions.append(precision)
        macro_avg_recalls.append(recall)
        macro_avg_f1s.append(f1)

    # Calculate the mean for all metrics across the folds.
    mean_accuracy = np.mean(accuracies)
    mean_macro_precision = np.mean(macro_avg_precisions)
    mean_macro_recall = np.mean(macro_avg_recalls)
    mean_macro_f1 = np.mean(macro_avg_f1s)

    # Standard deviation of macro-F1 across the folds; ddof=1 applies Bessel's
    # correction because the folds are a small sample, not a population.
    std_macro_f1 = np.std(macro_avg_f1s, ddof=1)
    # Range of macro-F1 scores (to put the standard deviation into context)
    range_macro_f1 = np.max(macro_avg_f1s) - np.min(macro_avg_f1s)
    print(f"Number of word features: {num_word_features} --- Macro-Avg F1 standard deviation: {std_macro_f1} --- Macro-Avg F1 Range: {range_macro_f1}")

    return {
        "mean_accuracy": mean_accuracy,
        "mean_macro_precision": mean_macro_precision,
        "mean_macro_recall": mean_macro_recall,
        "mean_macro_f1": mean_macro_f1,
        "std_macro_f1": std_macro_f1,
        "range_macro_f1": range_macro_f1
    }

