In [None]:
def cross_validate_train_data_nltkNaiveBayes(samples, labels, num_word_features, get_features, k=5):
    """
        Description: Runs stratified k-fold cross-validation of an nltk Naive Bayes classifier on the
        given samples and reports the mean scores across the folds, plus the spread (sample standard
        deviation and range) of the per-fold macro-average F1.
        For every fold the word features are selected from that fold's training split only, so the
        validation split never influences the vocabulary.
        Inputs:
            - samples ==> a list-of-lists where each sub-list/sample is a list of tokens.
            - labels ==> a list of labels corresponding to each training sample.
            - num_word_features ==> how many of the most frequent training-split words to use as features.
            - get_features ==> a function called as get_features(tokens, word_features) that returns the
              feature representation of one sample.
            - k ==> an integer representing the number of folds to iterate over for k-fold cross-validation.
        Outputs:
            - a dictionary with the keys "mean_accuracy", "mean_macro_precision", "mean_macro_recall",
              "mean_macro_f1", "std_macro_f1" and "range_macro_f1", all computed across the folds.
        Side effects:
            - prints one summary line with the F1 standard deviation and range.
    """
    # Per-fold metric values
    accuracies = []
    macro_avg_precisions = []
    macro_avg_recalls = []
    macro_avg_f1s = []

    # Stratified folds with shuffling; the fixed random_state keeps the splits
    # identical between runs so that results stay comparable.
    SKFGenerator = StratifiedKFold(n_splits=k, shuffle=True, random_state=3)

    for train_indices, val_indices in SKFGenerator.split(samples, labels):

        # (tokens, label) pairs for this fold's training and validation splits
        train_set = [(samples[i], labels[i]) for i in train_indices]
        val_set = [(samples[i], labels[i]) for i in val_indices]

        # Build the vocabulary from the training split only.
        train_tokenlists = [tokenlist for (tokenlist, _label) in train_set]
        train_vocabulary_list = flatten_list_of_lists(train_tokenlists)
        all_words = nltk.FreqDist(train_vocabulary_list)
        # Iterating a FreqDist yields words in insertion order, so slicing it does
        # not pick the most frequent words; most_common() ranks by count first.
        word_features = [word for (word, _count) in all_words.most_common(num_word_features)]

        train_featuresets = [(get_features(doc, word_features), label) for (doc, label) in train_set]
        val_featuresets = [(get_features(doc, word_features), label) for (doc, label) in val_set]

        # Train the Naive Bayes Classifier
        NBclassifier = nltk.NaiveBayesClassifier.train(train_featuresets)

        # Get the true labels and the predicted labels from the classifier
        true = [label for (_features, label) in val_featuresets]
        pred = [NBclassifier.classify(features) for (features, _label) in val_featuresets]

        # Fraction of validation samples predicted correctly, computed from the
        # predictions above so the validation split is not classified a second time.
        acc = sum(t == p for t, p in zip(true, pred)) / len(true)
        accuracies.append(acc)

        # Macro-average precision, recall and F1 for this fold;
        # zero_division=0 scores classes without predictions as 0 instead of warning.
        precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average='macro', zero_division=0)
        macro_avg_precisions.append(precision)
        macro_avg_recalls.append(recall)
        macro_avg_f1s.append(f1)

    # Calculate the mean for all metrics across the folds.
    mean_accuracy = np.mean(accuracies)
    mean_macro_precision = np.mean(macro_avg_precisions)
    mean_macro_recall = np.mean(macro_avg_recalls)
    mean_macro_f1 = np.mean(macro_avg_f1s)

    # ddof=1 applies Bessel's correction: the folds are a small sample, not a population.
    std_macro_f1 = np.std(macro_avg_f1s, ddof=1)
    # Range of the macro F1 scores (puts the standard deviation into context)
    range_macro_f1 = np.max(macro_avg_f1s) - np.min(macro_avg_f1s)
    print(f"Number of word features: {num_word_features} --- Macro-Avg F1 standard deviation: {std_macro_f1} --- Macro-Avg F1 Range: {range_macro_f1}")

    return {
        "mean_accuracy": mean_accuracy,
        "mean_macro_precision": mean_macro_precision,
        "mean_macro_recall": mean_macro_recall,
        "mean_macro_f1": mean_macro_f1,
        "std_macro_f1": std_macro_f1,
        "range_macro_f1": range_macro_f1
    }



In [None]:
def compute_metrics(pred):
    """
        Description: Computes accuracy and macro-averaged precision/recall/F1 for a batch of predictions.
        Inputs:
            - pred ==> an object exposing `label_ids` (the true labels) and `predictions` (per-class scores);
              presumably a transformers EvalPrediction -- confirm against the caller.
        Outputs:
            - a dictionary with the keys 'eval_accuracy', 'eval_precision', 'eval_recall' and 'eval_f1'.
    """
    gold = pred.label_ids
    # The predicted class is the index of the largest score along the last axis.
    guesses = pred.predictions.argmax(-1)

    acc = accuracy_score(gold, guesses)
    # zero_division=0.0: a class that is never predicted contributes 0 rather than raising a warning.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, guesses, average='macro', zero_division=0.0
    )

    return dict(
        eval_accuracy=acc,
        eval_precision=macro_precision,
        eval_recall=macro_recall,
        eval_f1=macro_f1,
    )

In [None]:
import os
import ray
from ray import tune

def train_fn(config):
    """
        Description: Ray Tune trainable which fine-tunes a fresh DistilBERT sequence classifier (4 labels)
        with the hyperparameters of one trial, evaluates it, and reports the macro F1 back to Ray.
        Inputs:
            - config ==> a dictionary with the keys "per_device_train_batch_size",
              "per_device_eval_batch_size", "warmup_steps", "weight_decay", "num_train_epochs"
              and "learning_rate".
        Outputs:
            - a dictionary {"eval_f1": <macro F1 on eval_dataset>}; the same dictionary is also
              reported through ray.train.report.
        Notes:
            - reads train_dataset, eval_dataset and compute_metrics from the enclosing module scope.
    """
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

    training_args = TrainingArguments(
        output_dir='./original_dataset_results',
        per_device_train_batch_size=config["per_device_train_batch_size"],
        per_device_eval_batch_size=config["per_device_eval_batch_size"],
        # A continuous search space yields a float here; warm-up is a whole number
        # of optimizer steps, so round it to an int before handing it over.
        warmup_steps=int(round(config["warmup_steps"])),
        weight_decay=config["weight_decay"],
        logging_dir='./logs',
        logging_steps=10,
        num_train_epochs=config["num_train_epochs"],
        learning_rate=config["learning_rate"]
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    # Start training
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate(eval_dataset=eval_dataset)
    print(eval_results)

    # Build the reported metrics once and use the same dict for reporting and returning.
    metrics = dict(eval_f1=eval_results["eval_f1"])

    ray.train.report(metrics)

    return metrics


# Hyperparameter search space handed to Ray Tune. The batch sizes are fixed;
# every other entry is sampled uniformly from the interval given to tune.uniform.
search_space = dict(
    num_train_epochs=tune.uniform(6, 11),       # uniform between 6 and 11
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=tune.uniform(0, 1000),         # uniform between 0 and 1000
    weight_decay=tune.uniform(0.001, 0.1),      # uniform between 0.001 and 0.1
    learning_rate=tune.uniform(1e-5, 5e-5),     # uniform between 1e-5 and 5e-5
)

# Bayesian-optimisation searcher that maximises the reported "eval_f1".
bayesopt = BayesOptSearch(
    metric="eval_f1",
    mode="max",
    utility_kwargs=dict(
        kind="ucb",    # Upper Confidence Bound acquisition function
        kappa=2.5,     # UCB exploration-exploitation trade-off
        xi=0.0,        # Expected Improvement parameter; NOTE(review): presumably unused with kind="ucb" -- confirm
    ),
)

# ASHA scheduler for stopping weak trials early.
# NOTE(review): max_t and grace_period are counted in the scheduler's time
# attribute, which may not correspond to training epochs -- confirm.
asha_scheduler = ASHAScheduler(
    max_t=20,             # upper bound on time units per trial
    grace_period=5,       # minimum time units before a trial may be stopped
    reduction_factor=3,   # factor by which the number of trials is reduced each round
)

# Define a trial_dirname_creator to shorten the path
def short_trial_dirname_creator(trial):
    """Return a short directory name for a trial, built from its trial_id."""
    return "tune_trial_{}".format(trial.trial_id)

# Restart Ray so the search starts from a clean runtime, then launch the
# hyperparameter search with Ray Tune.
ray.shutdown()
ray.init(ignore_reinit_error=True)

analysis = tune.run(
    train_fn,
    # What to optimise
    metric="eval_f1",
    mode="max",
    # How configurations are proposed and how trials are scheduled
    config=search_space,
    search_alg=bayesopt,        # BayesOptSearch proposes the configurations
    scheduler=asha_scheduler,   # ASHA scheduler manages trial execution
    num_samples=10,
    resources_per_trial={"cpu": 4},
    # Output and bookkeeping
    progress_reporter=tune.CLIReporter(metric_columns=["eval_f1"]),
    trial_dirname_creator=short_trial_dirname_creator,
    # NOTE(review): newer Ray releases may expect storage_path instead of local_dir -- confirm for the installed version.
    local_dir='./ray',
)

In [None]:
def generate_and_show_confusion_matrix(
        true_labels, predicted_labels,
        label_names, # informative class names go here
        classifier_description, matrix_color=plt.cm.Greens, # default: green colour-coded confusion matrix
        above_threshold_text_color="yellow" # color in which to show the text of counts above the threshold
    ):
    """
        Description: Computes the confusion matrix of the given labels and displays it as a
        colour-coded, annotated matplotlib figure.
        Inputs:
            - true_labels ==> the gold labels.
            - predicted_labels ==> the classifier's predictions, aligned with true_labels.
            - label_names ==> class names used as tick labels on both axes; assumed to be listed in the
              same order as the rows/columns of the confusion matrix -- confirm against the caller.
            - classifier_description ==> the title of the figure.
            - matrix_color ==> the matplotlib colormap used for the cells.
            - above_threshold_text_color ==> text colour for counts larger than half of the largest count;
              all other counts are drawn in black.
        Outputs:
            - None; the figure is shown with plt.show().
    """
    matrix = confusion_matrix(true_labels, predicted_labels)
    plt.figure(figsize=(6, 4))
    plt.imshow(matrix, interpolation='nearest', cmap=matrix_color)
    plt.title(classifier_description)

    # Show the color-coding legend
    plt.colorbar()
    # Add labels for each class to the x- and y-axes
    ticks = np.arange(len(label_names)) # numbers from 0 to nr of names - 1
    plt.xticks(ticks, label_names, rotation=30) # rotate x-axis labels for easier readability
    plt.yticks(ticks, label_names)

    # Counts above half of the largest cell value sit on dark cells, so they are
    # drawn in above_threshold_text_color instead of black (for visibility).
    threshold = matrix.max() / 2.
    for i in range(matrix.shape[0]): # iterate over nr rows
        for j in range(matrix.shape[1]): # iterate over nr cols
            # x is the column and y is the row; the count is centred in its cell
            plt.text(
                j, i, format(matrix[i, j], 'd'),
                ha="center", va="center",
                color=above_threshold_text_color if matrix[i, j] > threshold else "black"
            )

    # label the axes
    plt.xlabel("Predicted")
    plt.ylabel("True")
    # Fit everything into the figure area only after the axis labels exist,
    # so that the labels are taken into account by the layout.
    plt.tight_layout()
    # display the current figure/matrix
    plt.show()