In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np

# Settings shared by every run, gathered here so they can be tuned in one place.
checkpoint = "distilbert-base-uncased"
MAX_LENGTH = 512
BATCH_SIZE = 8
NUM_EPOCHS = 3
SEED = 42
initial_learning_rate = 5e-5
# Label ids and the display names used in the report, listed in the same order.
LABEL_IDS = [0, 1, 2]
LABEL_NAMES = ['Negative', 'Neutral', 'Positive']

# One dict per dataset flavour: {"run", "flavour", "loss", "accuracy"}.
test_datasets_accuracy = []
dataset_flavours = ["sentences_allagree", "sentences_75agree", "sentences_66agree", "sentences_50agree"]

# The tokenizer depends only on the checkpoint, so it is loaded once
# instead of once per dataset flavour.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    """Tokenize the "sentence" column of a batch of examples.

    Every sentence is truncated/padded to exactly MAX_LENGTH tokens.
    NOTE(review): fixed-length padding is costly if the sentences are much
    shorter than MAX_LENGTH; dynamic per-batch padding may be worth trying.
    """
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH
    )


def to_tf_dataset(split, shuffle=False):
    """Convert a tokenized split into a batched tf.data.Dataset.

    Yields (features, labels) pairs where features holds "input_ids" and
    "attention_mask" and labels comes from the "label" column.
    """
    return split.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols=["label"],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
        collate_fn=None
    )


for i, dataset_flavour in enumerate(dataset_flavours):
    print("Running for financial_phrasebank :", dataset_flavour)

    # Drop Keras state left over from the previous flavour's model and
    # re-seed TensorFlow to reduce run-to-run variation.
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)

    # Load and tokenize the dataset
    dataset = load_dataset("financial_phrasebank", dataset_flavour)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # 80% train; the held-out 20% is halved into validation and test.
    train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=SEED)
    val_test_split = train_val_split['test'].train_test_split(test_size=0.5, seed=SEED)

    # Only the training data needs shuffling; evaluation order is irrelevant
    # for validation and must stay fixed for the test set (see prediction below).
    tf_train_dataset = to_tf_dataset(train_val_split['train'], shuffle=True)
    tf_validation_dataset = to_tf_dataset(val_test_split['train'], shuffle=False)
    tf_test_dataset = to_tf_dataset(val_test_split['test'], shuffle=False)

    # Define and compile model; the DistilBERT body is frozen so only the
    # layers on top of it are trained.
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
    model.distilbert.trainable = False

    # Decay the learning rate linearly to zero over the steps that will
    # actually be run, rather than over an unrelated fixed step count.
    num_train_steps = len(tf_train_dataset) * NUM_EPOCHS
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=initial_learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0,
        power=1.0
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )

    model.summary()

    # Training
    model.fit(tf_train_dataset, epochs=NUM_EPOCHS, validation_data=tf_validation_dataset)

    # Evaluation
    eval_loss, eval_accuracy = model.evaluate(tf_test_dataset)
    print(f"For {dataset_flavour}, Evaluated Test Loss: {eval_loss}, Evaluated Test Accuracy: {eval_accuracy}")

    # Prediction and Manual Calculation of Metrics.
    # The test dataset is not shuffled, so the labels gathered here line up
    # with the rows returned by the single predict() call below.
    # input_ids is not used in this cell; it is kept for later inspection.
    input_ids, y_true = [], []
    for features, labels in tf_test_dataset:
        input_ids.extend(features['input_ids'].numpy())
        y_true.extend(labels.numpy())

    # One predict() over the whole dataset instead of one call per batch.
    y_pred_logits = model.predict(tf_test_dataset, verbose=0)["logits"]
    predicted_class = np.argmax(y_pred_logits, axis=1)

    # Calculate and print manual accuracy
    manual_accuracy = np.mean(np.array(y_true) == predicted_class)
    print(f"Manually Calculated Test Accuracy: {manual_accuracy}")

    # Additional reporting. Passing labels explicitly keeps the report and the
    # matrix at three classes even if one is absent from this test split.
    report = classification_report(y_true, predicted_class, labels=LABEL_IDS, target_names=LABEL_NAMES)
    print("Classification Report:\n", report)

    cm = confusion_matrix(y_true, predicted_class, labels=LABEL_IDS)
    print("Confusion Matrix:\n", cm)

    test_datasets_accuracy.append({
        "run": i,
        "flavour": dataset_flavour,
        "loss": eval_loss,
        "accuracy": eval_accuracy
    })

In [None]:
import matplotlib.pyplot as plt

# Unpack the per-flavour results collected in test_datasets_accuracy into
# parallel lists, one element per dataset flavour, in run order.
flavours = [entry["flavour"] for entry in test_datasets_accuracy]
accuracies = [entry["accuracy"] * 100 for entry in test_datasets_accuracy]  # scaled for the percent axis
losses = [entry["loss"] for entry in test_datasets_accuracy]

# Plotting accuracy vs. dataset flavour.
# The x values are flavour names, so the axis is labelled accordingly.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(flavours, accuracies, marker='o', linestyle='-', color='b', label='Accuracy (%)')
ax.set_title('Model Accuracy vs. Dataset Flavour')
ax.set_xlabel('Dataset Flavour')
ax.set_ylabel('Accuracy (%)')
ax.grid(True)
ax.legend()
plt.show()

# Plotting loss vs. dataset flavour
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(flavours, losses, marker='o', linestyle='-', color='r', label='Loss')
ax.set_title('Model Loss vs. Dataset Flavour')
ax.set_xlabel('Dataset Flavour')
ax.set_ylabel('Loss')
ax.grid(True)
ax.legend()
plt.show()
