In [None]:
import re
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [29]:
# -------------------------
# Config
# -------------------------
# Serialized artifacts loaded with joblib in the main pipeline, keyed by role.
MODEL_PATHS = dict(
    svm="model/svm_model.pkl",
    scaler="model/length_scaler.pkl",
)

# CSV read as pipeline input, and CSV the predictions are written to.
DATA_PATH = "dataset/validation_data.csv"
OUTPUT_PATH = "5_Results.csv"

# Model name handed to SentenceTransformer(...).
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

In [None]:
# -------------------------
# Regex patterns
# -------------------------
# Boilerplate removed by clean_text(). Every pattern is applied with
# re.sub(pattern, "", text, flags=re.IGNORECASE), so matching is
# case-insensitive and each match is deleted outright.

# Patterns anchored at the beginning of the text (^).
START_PATTERNS = [
    # A leading literal "text" plus any whitespace that follows it.
    r"^text\s*",
    # Datelines: "<CITY> (Reuters) -" and, lazily, everything up to and
    # including the next " - ".
    r"^WASHINGTON \(Reuters\) -.*? - ",
    r"^LONDON \(Reuters\) -.*? - ",
]

# Patterns anchored at the end of the text ($).
# NOTE: clean_text() does not pass re.DOTALL, so "." stops at a newline; a
# ".*$" pattern therefore only matches when its marker sits on the final line
# of the text.
END_PATTERNS = [
    # A trailing literal "text" plus any whitespace before the end.
    r"text\s*$",
    # "-- Source link:" and the rest of that line.
    r"-- Source link:.*$",
    # Reuters staff credit, optionally followed by periods. The pattern starts
    # mid-word ("ntly") — presumably so any adverb ending in "ntly" before
    # "produced" matches; TODO confirm.
    r"ntly produced by the staff of Reuters News Agency\.*$",
    # "Featured image" and the rest of that line.
    r"Featured image.*$",
]

In [None]:
# -------------------------
# Preprocessing
# -------------------------
def clean_text(text: str) -> str:
    """Strip known boilerplate from an article and collapse its whitespace.

    Every regex in START_PATTERNS, then every regex in END_PATTERNS, is
    deleted from ``text`` (case-insensitively, in list order). Afterwards each
    run of whitespace becomes a single space and the ends are trimmed.

    Args:
        text: Raw article text.

    Returns:
        The cleaned, single-spaced text.
    """
    cleaned = text
    # Start patterns are applied before end patterns, each group in list order.
    for pattern_group in (START_PATTERNS, END_PATTERNS):
        for pattern in pattern_group:
            cleaned = re.sub(pattern, "", cleaned, flags=re.IGNORECASE)

    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()

In [None]:
# -------------------------
# Embedding
# -------------------------
def embed_long_text(
    text: str, model: SentenceTransformer, max_tokens: int = 512, stride: int = 256
) -> np.ndarray:
    """Embed text of any length by averaging embeddings of overlapping chunks.

    The text is split on whitespace and cut into windows of at most
    ``max_tokens`` words whose start positions advance by ``stride`` words.
    Each window is passed to ``model.encode`` and the resulting vectors are
    averaged element-wise.

    Args:
        text: Text to embed. Empty or whitespace-only text is embedded as the
            empty string so the result still has the model's output shape.
        model: Object exposing ``encode(str)`` that returns a vector.
        max_tokens: Maximum number of whitespace-separated words per window.
        stride: Number of words the window start advances each step.

    Returns:
        The element-wise mean of the window embeddings.

    Raises:
        ValueError: If ``max_tokens`` or ``stride`` is not positive.
    """
    # A non-positive stride never advances the window (possible infinite loop)
    # and a non-positive max_tokens yields only empty windows.
    if max_tokens <= 0 or stride <= 0:
        raise ValueError("max_tokens and stride must be positive integers")

    # NOTE(review): windows are measured in whitespace-separated words, not in
    # the encoder's own tokens. If the encoder's maximum sequence length is
    # shorter than a window, it may truncate the window — confirm against the
    # model's limit.
    tokens = text.split()
    chunks = []

    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunks.append(" ".join(tokens[start:end]))
        # Stop once a window reaches the last word; any further window would
        # only repeat a suffix that is already covered.
        if end == len(tokens):
            break
        start += stride

    # Without this fallback, empty input would average an empty list and
    # return a NaN scalar instead of a vector.
    if not chunks:
        chunks.append("")

    return np.mean([model.encode(chunk) for chunk in chunks], axis=0)


def embed_corpus(texts: pd.Series, model: SentenceTransformer) -> np.ndarray:
    """Embed every text in ``texts`` and stack the vectors row-wise.

    Each text is embedded with ``embed_long_text``; a tqdm progress bar
    labelled "Embedding articles" tracks the iteration.

    Args:
        texts: Texts to embed, iterated in order.
        model: Encoder forwarded to ``embed_long_text``.

    Returns:
        The per-text embeddings stacked with ``np.vstack``, one row per text.
    """
    rows = []
    for article in tqdm(texts, desc="Embedding articles"):
        rows.append(embed_long_text(article, model))
    return np.vstack(rows)

In [None]:
# -------------------------
# Main pipeline
# -------------------------
# Load models
# NOTE: joblib.load unpickles these files, which can execute arbitrary code;
# only point MODEL_PATHS at artifacts you trust.
svm_model = joblib.load(MODEL_PATHS["svm"])
length_scaler = joblib.load(MODEL_PATHS["scaler"])
embedder = SentenceTransformer(EMBEDDING_MODEL)

# Load data
df = pd.read_csv(DATA_PATH)
# Untouched copy: the saved file keeps only the input columns plus "label",
# without the helper columns added to `df` below.
df_result = df.copy()

# Preprocess
# Missing titles/bodies are read as NaN, and NaN + str stays NaN, which would
# make clean_text fail on a float. Treat missing fields as empty strings.
df["combined"] = (
    df["title"].fillna("").astype(str) + " " + df["text"].fillna("").astype(str)
)
df["combined_clean"] = df["combined"].apply(clean_text)
# Length is taken from the raw (uncleaned) text — presumably matching how the
# scaler was fitted; TODO confirm against the training notebook.
df["combined_len"] = df["combined"].str.len()

# Embed
embeddings = embed_corpus(df["combined_clean"], embedder)

# Features
# Use transform(), not fit_transform(): the scaler was loaded already fitted,
# and refitting it here would replace those statistics with ones computed
# from this dataset, so the SVM would see a differently scaled length feature.
length_scaled = length_scaler.transform(df[["combined_len"]])
X = np.hstack((embeddings, length_scaled))

# Predict
predictions = svm_model.predict(X)
# Overwrites any "label" column already present in the input file.
df_result["label"] = predictions

# Save
df_result.to_csv(OUTPUT_PATH, index=False)
print(f"Results saved to {OUTPUT_PATH}")
