In [10]:
import spacy
import random
import pandas as pd
import matplotlib.pyplot as plt
from spacy.training import Example
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    roc_curve,
    auc,
    ConfusionMatrixDisplay
)

random.seed(42)

## Load Data

In [11]:
df_train = pd.read_csv("output_train.csv")
df_valid = pd.read_csv("output_valid.csv")
df_test  = pd.read_csv("output_test.csv")

# Model Training

## Preparing data and model

In [12]:
def make_data(df, text_col, label_col): # prepares data for SpaCy training
    texts = df[text_col].tolist()
    labels = df[label_col].tolist()
    cats = [{"cats": {"1": bool(l), "0": not bool(l)}} for l in labels]
    return list(zip(texts, cats)), texts, labels

train_data, _, _= make_data(df_train, "statement", "label_binary")
valid_data, valid_texts, valid_labels = make_data(df_valid, "statement", "label_binary")
test_data, test_texts, test_labels = make_data(df_test, "statement", "label_binary")

# build spacy model with BOW classifier
nlp = spacy.blank("en")
textcat = nlp.add_pipe(
    "textcat",
    last=True,
    config={"model": {
            "@architectures": "spacy.TextCatBOW.v3",
            "exclusive_classes": True,
            "ngram_size": 1,
            "no_output_layer": False}})
# add classification labels
textcat.add_label("0")
textcat.add_label("1")

1

## Model Training and Validation

In [13]:
n_iter = 10
train_losses = []
val_f1s = []
optimizer = nlp.begin_training()

for epoch in range(n_iter):
    random.shuffle(train_data)
    losses = {}
    # training
    for batch in spacy.util.minibatch(train_data, size=8):
        examples = []
        for text, ann in batch:
            doc = nlp.make_doc(text)
            examples.append(Example.from_dict(doc, ann))
        nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
    train_losses.append(losses["textcat"])

    # validation eval
    preds = [nlp(txt).cats for txt in valid_texts]
    pred_bin = [int(p["1"] >= 0.5) for p in preds]
    f1 = f1_score(valid_labels, pred_bin)*100
    val_f1s.append(f1)

    print(
        f"Epoch {epoch+1}/{n_iter} — "
        f"Train Loss: {losses['textcat']:.4f} — "
        f"Val F1: {f1:.4f}%")

# test evaluation
test_preds= [nlp(t).cats for t in test_texts]
test_bin = [int(p["1"] >= 0.5) for p in test_preds]
scores = {
    "accuracy": accuracy_score(test_labels, test_bin)*100,
    "f1": f1_score(test_labels, test_bin)*100,
    "precision": precision_score(test_labels, test_bin)*100,
    "recall": recall_score(test_labels, test_bin)*100
}
print("Test scores:", {k: f"{v:.2f}%" for k,v in scores.items()})

Epoch 1/10 — Train Loss: 474.8942 — Val F1: 64.4275%
Epoch 2/10 — Train Loss: 375.8721 — Val F1: 62.0582%
Epoch 3/10 — Train Loss: 321.1152 — Val F1: 60.9201%
Epoch 4/10 — Train Loss: 287.0336 — Val F1: 61.4345%
Epoch 5/10 — Train Loss: 261.0785 — Val F1: 61.7184%
Epoch 6/10 — Train Loss: 241.4476 — Val F1: 62.1000%
Epoch 7/10 — Train Loss: 226.1418 — Val F1: 62.5061%
Epoch 8/10 — Train Loss: 213.5697 — Val F1: 58.4764%
Epoch 9/10 — Train Loss: 201.9112 — Val F1: 59.7927%
Epoch 10/10 — Train Loss: 192.3624 — Val F1: 60.5317%
Test scores: {'accuracy': '67.03%', 'f1': '60.68%', 'precision': '61.34%', 'recall': '60.02%'}


## Plots

In [14]:
# Training + Validation curve
plt.figure()
plt.plot(range(1, n_iter+1), train_losses, marker="o", label="Train Loss")
plt.plot(range(1, n_iter+1), val_f1s,      marker="s", linestyle="--", label="Val F1 (%)")
plt.xlabel("Epoch")
plt.title("Training Loss & Validation F1 (%)")
plt.legend()
plt.tight_layout()
plt.savefig("training_validation_curve_noXML.png")
plt.close()

# Confusion matrix
cm   = confusion_matrix(test_labels, test_bin)
disp = ConfusionMatrixDisplay(cm, display_labels=["0","1"])
disp.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.savefig("confusion_matrix_noXML.png")
plt.close()

# ROC curve
probs   = [p["1"] for p in test_preds]
fpr, tpr, _ = roc_curve(test_labels, probs)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0,1], [0,1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("roc_curve_noXML.png")
plt.close()

# Save results
pd.DataFrame([scores]).to_csv("spacy_results_noXML.csv", index=False)