In [8]:
import pandas as pd
from transformers import AutoTokenizer, DistilBertForSequenceClassification
import torch

# Load the trained model
tokenizer = AutoTokenizer.from_pretrained('./faq_scope_detection')
model = DistilBertForSequenceClassification.from_pretrained('./faq_scope_detection')

# Load the test dataset
test_data = pd.read_csv('scope_training_datasets/testdata.csv')

# Rename the text column to match the training data
test_data = test_data.rename(columns={'Question': 'text'})

# Tokenize the test dataset
tokenized_test_dataset = tokenizer(test_data['text'].tolist(), truncation=True, padding=True, return_tensors='pt')

#functions
def calculate_accuracy(preds, labels):
    correct = (preds == labels).sum().item()
    total = labels.size(0)
    return correct / total

def calculate_f1_score(preds, labels):
    tp = ((preds == 1) & (labels == 1)).sum().item()
    tn = ((preds == 0) & (labels == 0)).sum().item()
    fp = ((preds == 1) & (labels == 0)).sum().item()
    fn = ((preds == 0) & (labels == 1)).sum().item()

    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    return f1

# Make predictions on the test dataset
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_test_dataset)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Get the labels for the test dataset
labels = torch.tensor(test_data['Label'].tolist())

# Calculate the accuracy and F1 score
accuracy = calculate_accuracy(predictions, labels)
f1_score = calculate_f1_score(predictions, labels)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1_score}")

# Print the inaccurately labeled questions
incorrect_idx = (predictions != labels).nonzero().squeeze()
incorrect_predictions = predictions[incorrect_idx].tolist()
true_labels = labels[incorrect_idx].tolist()
incorrect_questions = test_data.iloc[incorrect_idx]['text'].tolist()

print("\nInaccurately labeled questions:")
for q, p, l in zip(incorrect_questions, incorrect_predictions, true_labels):
    print(f"Question: {q}")
    print(f"Predicted label: {p}")
    print(f"True label: {l}")
    print()

Accuracy: 0.5416666666666666
F1 Score: 0.4210526278116344

Inaccurately labeled questions:
Question: Is it your birthday?
Predicted label: 0
True label: 1

Question: Please recite the lyrics to the hit song baby shark.
Predicted label: 0
True label: 1

Question: Do my friends hate me?
Predicted label: 0
True label: 1

Question: Do you hate me?
Predicted label: 0
True label: 1

Question: What is love?
Predicted label: 0
True label: 1

Question: Can loneliness be conquered?
Predicted label: 0
True label: 1

Question: What is your favourite animal?
Predicted label: 0
True label: 1

Question: Will you become sentient and eat me?
Predicted label: 0
True label: 1

Question: What’s a good pizza place?
Predicted label: 0
True label: 1

Question: I am in your walls.
Predicted label: 0
True label: 1

Question: What is wrong with you?
Predicted label: 0
True label: 1

