In [26]:
from datasets import load_dataset
from tqdm import tqdm

## Load dataset

In [None]:
import xml.etree.ElementTree as ET
from pathlib import Path

# Directory that holds the dataset XML files (the "articles-*.xml" and
# "ground-truth-*.xml" files globbed by load_split). The path is relative,
# so it is resolved against the current working directory.
DATA_DIR = Path("Dataset")

def parse_articles(path):
    """Parse an articles XML file into a dict keyed by article id.

    Args:
        path: Filename or file object accepted by ``ET.parse``.

    Returns:
        A dict mapping each ``<article>``'s ``id`` attribute to
        ``{"id": ..., "title": ..., "text": ...}``. ``title`` falls back to
        ``""`` when the attribute is missing. If two articles share an id,
        the later one wins.

    Only ``<article>`` elements that are direct children of the root are
    read. A progress bar is shown through ``tqdm`` while iterating.
    """
    root = ET.parse(path).getroot()

    articles = {}
    for article in tqdm(root.findall("article")):
        aid = article.get("id")
        title = article.get("title") or ""

        # Prefer an explicit <text> child; when there is none, take the body
        # from the <article> element itself.
        # NOTE(review): the hyperpartisan XML appears to place <p> paragraphs
        # directly under <article> -- confirm against the dataset files.
        text_node = article.find("text")
        body_node = article if text_node is None else text_node

        # itertext() walks nested markup (<p>, <a>, ...) as well, whereas
        # ``.text`` alone stops at the first child element and would drop
        # everything after it.
        text = "".join(body_node.itertext())

        articles[aid] = {"id": aid, "title": title, "text": text}
    return articles


def parse_labels(path):
    """Read a ground-truth XML file and return ``{article_id: 0 or 1}``.

    Args:
        path: Filename or file object accepted by ``ET.parse``.

    Returns:
        A dict keyed by each ``<article>``'s ``id`` attribute. The value is
        ``1`` when the ``hyperpartisan`` attribute is exactly ``"true"`` and
        ``0`` for anything else, including a missing attribute.
    """
    root = ET.parse(path).getroot()
    # int(True) == 1 and int(False) == 0, i.e. the "true"/"false" -> 1/0 mapping.
    return {
        entry.get("id"): int(entry.get("hyperpartisan") == "true")
        for entry in root.findall("article")
    }


def _latest_match(pattern):
    """Return the lexicographically last file in DATA_DIR matching *pattern*, or None."""
    # glob() yields matches in no guaranteed order; sorting makes the choice
    # deterministic, and the same rule is applied to both file kinds so the
    # articles and ground-truth files are picked consistently.
    matches = sorted(DATA_DIR.glob(pattern))
    return matches[-1] if matches else None


def load_split(name):
    """Load one dataset split and attach its labels to the articles.

    Args:
        name: Split name, one of ``"training"``, ``"validation"``, ``"test"``.

    Returns:
        A list of dicts, one per parsed article, each holding the article's
        fields plus a ``"label"`` key. The label is ``None`` when the
        ground-truth file has no entry for that article id.

    Raises:
        FileNotFoundError: If the articles file or the ground-truth file for
            the split cannot be found in ``DATA_DIR``.

    The *byarticle* variant is used. Only for ``"validation"`` does the
    function fall back to *bypublisher* when no byarticle articles file
    exists. When several dated files match, the lexicographically last file
    name is chosen.
    """
    modifier = "byarticle"
    # Only the validation split is known to lack a byarticle file, so the
    # fallback is deliberately limited to it; other splits still fail loudly.
    if name == "validation" and _latest_match(f"articles-{name}-{modifier}-*.xml") is None:
        print(f"Note: 'byarticle' split not found for '{name}', falling back to 'bypublisher'.")
        modifier = "bypublisher"

    articles_file = _latest_match(f"articles-{name}-{modifier}-*.xml")
    gt_file = _latest_match(f"ground-truth-{name}-{modifier}-*.xml")
    if articles_file is None or gt_file is None:
        raise FileNotFoundError(f"Could not find dataset files for split '{name}' in {DATA_DIR.absolute()}")

    print(f"Loading {name} split from: {articles_file.name}")
    articles = parse_articles(articles_file)
    labels = parse_labels(gt_file)

    # labels.get() keeps articles without a ground-truth entry (label None)
    # instead of dropping them or raising.
    return [{**art, "label": labels.get(aid)} for aid, art in articles.items()]




# Load the three splits in order: training, validation, test.
train, val, test = (
    load_split(split_name) for split_name in ("training", "validation", "test")
)

# Report split sizes; headings are left-aligned to 21 columns so the counts line up.
print(f"{'Training set size:':<21}{len(train)}")
print(f"{'Validation set size:':<21}{len(val)}")
print(f"{'Test set size:':<21}{len(test)}")

# Show which fields each merged record carries.
print("Sample training item keys:", train[0].keys())



Loading training split from: articles-training-byarticle-20181122.xml
Note: 'byarticle' split not found for 'validation', falling back to 'bypublisher'.
Loading validation split from: articles-validation-bypublisher-20181122.xml
Loading test split from: articles-test-byarticle-20181207.xml
Training set size:   645
Validation set size: 150000
Test set size:       628
Sample training item keys: dict_keys(['id', 'title', 'text', 'label'])
