In [7]:
import pandas as pd
import spacy
import random
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from spacy.util import minibatch
from spacy.training import Example

# Seed Python's global `random` generator so the shuffles driven by it are
# repeatable from a fresh kernel. `random.seed` must be *called*: assigning to
# it (`random.seed = 42`) only replaces the function with an int and seeds
# nothing.
# NOTE(review): this covers the stdlib `random` module only; model weight
# initialisation may draw from other RNGs (spaCy documents
# `spacy.util.fix_random_seed` for that) — confirm whether that is needed here.
random.seed(42)

In [8]:
# Config
# One dict per experiment: a run name plus the number of training epochs and
# the minibatch size to use for that run.
experiment_configs = [
    dict(name="spacy_no_xml_tags", epochs=7, batch_size=16),
]

# Load data
def _load_split(csv_name):
    """Read one CSV split and return a copy without the 'A_raw_entities' column."""
    return pd.read_csv(csv_name).drop(columns=['A_raw_entities']).copy()


# Author's note on the original train line: "spacy tagged only".
df_train = _load_split("output_train.csv")
df_test = _load_split("output_test.csv")

In [9]:
# prepare data for textcat
def prepare_spacy_data(df):
    """Convert labelled statements into (text, annotation) pairs for textcat.

    Reads the 'statement' and 'label_binary' columns of ``df`` in parallel and
    returns a list with one tuple per row:
    ``(statement, {"cats": {"TRUE": label == 1, "FALSE": label == 0}})``.
    A label that is neither 1 nor 0 therefore yields False for both categories.
    """
    records = []
    for statement, binary_label in zip(df['statement'], df['label_binary']):
        categories = {"TRUE": binary_label == 1, "FALSE": binary_label == 0}
        records.append((statement, {"cats": categories}))
    return records
# Build the (text, annotation) lists for the training split, then the test split.
train_data, test_data = map(prepare_spacy_data, (df_train, df_test))

# run experiments
# For every entry in `experiment_configs`: build a blank English pipeline with
# a `textcat` component, train it on `train_data` for cfg['epochs'] epochs,
# score it on `test_data` after each epoch, save a loss/F1 curve as a PNG, and
# append the metrics of the last epoch to `all_results`. The collected rows
# are written to one CSV after the loop.
all_results = []

for cfg in experiment_configs:
    nlp = spacy.blank("en") # empty Eng pipeline for SpaCy
    textcat = nlp.add_pipe("textcat", last=True) # text clf
    # register classes -- the same two keys appear in the "cats" dicts of the data
    textcat.add_label("TRUE")
    textcat.add_label("FALSE")

    # initialize the pipeline; the returned object is passed as `sgd` to nlp.update below
    optimizer = nlp.initialize()

    # per-epoch history for this experiment (used for the plot below)
    losses_list, f1s = [], [] # losses and f1 scores

    for epoch in range(cfg['epochs']):
        # in-place shuffle of the shared `train_data` list, driven by the
        # global `random` state
        random.shuffle(train_data) # shuffle training data
        batches = minibatch(train_data, size=cfg['batch_size']) # create mini batches
        epoch_loss = 0.0 # running sum of the per-batch "textcat" losses

        for batch in batches:
            texts, annotations = zip(*batch) # unzip into texts and annotations
            examples = [ # Example objects built from a tokenized doc + its annotation dict
                Example.from_dict(nlp.make_doc(txt), ann)
                for txt, ann in zip(texts, annotations)]
            # fresh dict for every batch, so only this batch's "textcat" entry
            # is added to the epoch total (0.0 if the key is absent)
            losses = {} # feed into the model, train
            nlp.update(examples, sgd=optimizer, losses=losses)
            epoch_loss += losses.get("textcat", 0.0)

        # evaluate on test set
        # NOTE(review): the same test split is used for this per-epoch
        # monitoring and for the final reported metrics below.
        preds, true_labels = [], [] # predictions and true labels
        for txt, ann in test_data:
            doc = nlp(txt)
            # predict positive when the "TRUE" score exceeds 0.5
            pred = doc.cats["TRUE"] > 0.5
            preds.append(int(pred))
            true_labels.append(int(ann["cats"]["TRUE"]))

        f1 = f1_score(true_labels, preds)
        losses_list.append(epoch_loss)
        f1s.append(f1)
        print(f"Epoch {epoch+1}/{cfg['epochs']} — Loss: {epoch_loss:.4f} — F1: {f1:.4f}")

    # plot metrics
    # Both series share one y-axis; F1 is multiplied by 100 before plotting
    # (presumably so it is readable next to the summed loss). The figure is
    # saved to "<name>_curve.png" and closed rather than shown inline.
    plt.figure()
    plt.plot(range(1, cfg['epochs']+1), losses_list, marker='o', label='Loss')
    plt.plot(range(1, cfg['epochs']+1), [f * 100 for f in f1s], marker='s', linestyle='--', label='F1')
    plt.xlabel("Epoch")
    plt.title("SpaCy Training Curve")
    plt.legend()
    plt.savefig(f"{cfg['name']}_curve.png")
    plt.close()

    # final metrics
    # NOTE(review): `true_labels`, `preds` and `f1` are whatever the last epoch
    # left behind; with cfg['epochs'] == 0 they would be undefined (or stale
    # from a previous config).
    acc  = accuracy_score(true_labels, preds)
    prec = precision_score(true_labels, preds)
    rec  = recall_score(true_labels, preds)

    # one summary row per experiment, metrics rounded to two decimals
    all_results.append({
        "name": cfg["name"],
        "epochs": cfg["epochs"],
        "batch_size": cfg["batch_size"],
        "accuracy": round(acc, 2),
        "f1": round(f1, 2),
        "precision": round(prec, 2),
        "recall": round(rec, 2)
    })

# save results -- one CSV row per experiment, without the DataFrame index
pd.DataFrame(all_results).to_csv("spacy_without_xml_results.csv", index=False)


Epoch 1/7 — Loss: 242.7255 — F1: 0.5844
Epoch 2/7 — Loss: 206.3521 — F1: 0.6570
Epoch 3/7 — Loss: 168.7026 — F1: 0.6005
Epoch 4/7 — Loss: 124.7230 — F1: 0.6247
Epoch 5/7 — Loss: 91.0247 — F1: 0.6108
Epoch 6/7 — Loss: 68.2143 — F1: 0.5845
Epoch 7/7 — Loss: 58.5799 — F1: 0.6160
