In [None]:
from lxml.etree import iterparse
import xml

import os
import numpy as np
import pandas as pd
import nltk
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from nltk.tokenize import word_tokenize
from collections import Counter
from tqdm import tqdm

from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace


from utils import *

## Load & Preprocess Dataset

In [None]:
def read_glove(path, dim):
    '''
    Read the GloVe 6B vectors of dimension ``dim`` from directory ``path``.

    Parameters
    ----------
    path : str
        Directory containing ``glove.6B.<dim>d.txt``. A trailing path
        separator is optional.
    dim : int or str
        Embedding dimension; only used to build the file name.

    Returns
    -------
    dict
        Maps each token (first column of the file) to a 1-D numpy array
        holding the remaining columns of that row.
    '''
    file_path = os.path.join(path, 'glove.6B.' + str(dim) + 'd.txt')
    # na_filter=False keeps tokens such as "null", "nan" or "NA" as literal
    # strings; with pandas' default NA detection they would be read as NaN
    # and collapse into a single NaN key.
    # quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary tokens.
    df = pd.read_csv(file_path, sep=" ", quoting=3, header=None,
                     index_col=0, na_filter=False)
    # Pair every index label with its row instead of transposing the frame
    # and iterating over columns.
    glove = dict(zip(df.index, df.to_numpy()))
    return glove

### Text Cleaning

In [None]:
import html
import re

def cleanQuotations(text):
    """Normalize typographic quotation marks to plain ASCII quotes.

    The first pass maps backticks and curly/bracket-style single quotes to
    ``'``; the second maps curly double quotes, two consecutive apostrophes
    and two consecutive commas to ``"``. The order matters: a pair of
    single quotes produced by the first pass is folded into a double quote
    by the second.
    """
    substitutions = (
        (r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'"),
        (r'[„“”]|(\'\')|(,,)', '"'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def cleanText(text):
    """Strip web/boilerplate noise from article text.

    Removes, in order: URLs and the literal ``href``; spans enclosed in
    ``{}``, ``[]`` or ``()``; a fixed list of boilerplate phrases (photo
    credits, "follow us on Twitter", "MORE:", "ADVERTISEMENT", "VIDEO");
    ``@mentions``, ``#hashtags`` and runs of two or more dots. Each removed
    span is replaced by a space.

    Newlines are then turned into spaces (not deleted) so that words on
    either side of a line break are not fused together, leading whitespace
    is dropped, and runs of spaces are collapsed to one.
    """
    text = re.sub(r'(www\S+)|(https?\S+)|(href)', ' ', text)
    text = re.sub(r'\{[^}]*\}|\[[^]]*\]|\([^)]*\)', ' ', text)
    text = re.sub(r'Getty [Ii]mages?|Getty|[Ff]ollow us on [Tt]witter|MORE:|ADVERTISEMENT|VIDEO', ' ', text)
    text = re.sub(r'@\S+|#\S+|\.{2,}', ' ', text)
    # Replace line breaks with a space; deleting them would glue
    # "end.\nStart" into "end.Start".
    text = text.replace('\n', ' ').lstrip()
    text = re.sub(r'  +', ' ', text)
    return text

def fixup(text):
    """Repair HTML entities and escape artefacts in raw text.

    Well-formed entities (``&amp;``, ``&quot;``, ``&#39;`` ...) are decoded
    first. The legacy replacements that follow handle entities whose leading
    ``&`` was lost (``amp;``, ``#39;`` ...), literal ``\\n`` / ``\\"``
    escapes, ``<br />`` tags and the ``@.@`` / ``@-@`` / ``<unk>`` markers.
    A final ``html.unescape`` decodes anything those replacements
    re-assembled.
    """
    # Decode proper entities before the substring replacements run;
    # otherwise "&amp;" would become "&&" and "&quot;" would become "&'".
    text = html.unescape(text)
    text = text.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'") \
               .replace('nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n") \
               .replace('quot;', "'").replace('<br />', "\n").replace('\\"', '"') \
               .replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(' @-@ ', '-') \
               .replace('\\', ' \\ ')
    return html.unescape(text)

def textCleaning(title, text):
    """Clean an article and return it as a single ``"<title>. <body>"`` string.

    The title only has its quotation marks normalized. The body additionally
    goes through ``fixup`` and ``cleanText``. When the title is empty or
    whitespace-only, just the cleaned body is returned, so the result never
    starts with a dangling ". ".
    """
    title = cleanQuotations(title)
    text  = cleanQuotations(text)
    text  = cleanText(fixup(text))
    if not title.strip():
        # No usable title: avoid emitting ". <body>".
        return text.strip()
    return (title + ". " + text).strip()


In [4]:
import os
import re
from lxml import etree as ET

import torch
from torch.utils.data import IterableDataset, DataLoader

def parse_articles(article_path):
    """Stream ``(article_id, title, text)`` tuples from an articles XML file.

    The file is parsed incrementally; only ``<article>`` end events are
    processed. ``text`` is the newline-joined content of the article's
    direct ``<p>`` children. The full text of each paragraph is collected,
    including text inside and after inline child elements such as links;
    paragraphs that are empty after stripping are skipped.

    Parameters
    ----------
    article_path : str
        Path to the XML file.

    Yields
    ------
    tuple
        ``(id attribute, title attribute or "", paragraph text)``.
    """
    for _, elem in ET.iterparse(article_path, events=("end",)):
        if elem.tag != "article":
            continue

        article_id = elem.get("id")
        title = elem.get("title", "") or ""

        paragraphs = []
        for p in elem.findall("p"):
            # p.text alone stops at the first child element; itertext()
            # also walks nested elements and their tails.
            content = "".join(p.itertext()).strip()
            if content:
                paragraphs.append(content)

        text = "\n".join(paragraphs)

        yield article_id, title, text
        # Drop the article's children once the consumer is done with it.
        elem.clear()

def parse_labels(label_path):
    """Read the ground-truth XML and index the labels by article id.

    Only ``<article>`` elements are considered. For each one the
    ``hyperpartisan`` and ``bias`` attributes are stored as they appear in
    the file (``None`` when an attribute is missing).

    Returns
    -------
    dict
        ``{article_id: {"hyperpartisan": ..., "bias": ...}}``
    """
    wanted = ("hyperpartisan", "bias")
    by_id = {}

    for _event, node in ET.iterparse(label_path, events=("end",)):
        if node.tag == "article":
            by_id[node.get("id")] = {attr: node.get(attr) for attr in wanted}
            # Free the element once its attributes have been copied out.
            node.clear()

    return by_id

def preprocess_article(title, text):
    """Return the model-ready string for one article.

    Runs ``textCleaning`` on the title/body pair, lower-cases the result,
    collapses every run of whitespace to a single space and trims both ends.
    """
    lowered = textCleaning(title, text).lower()
    single_spaced = re.sub(r"\s+", " ", lowered)
    return single_spaced.strip()



def load_dataset(article_path, label_path, preprocess=None):
    """Stream labelled articles as dictionaries.

    Labels are loaded in full via ``parse_labels``; articles are streamed via
    ``parse_articles``. Articles without a label entry are skipped.

    Parameters
    ----------
    article_path : str
        Path to the articles XML file.
    label_path : str
        Path to the ground-truth XML file.
    preprocess : callable or bool, optional
        When a callable, it is invoked as ``preprocess(title, text)`` and its
        result replaces ``text``. Any other truthy value (e.g. ``True``)
        selects ``preprocess_article``. Falsy values leave the text as parsed.

    Yields
    ------
    dict
        Keys: ``"id"``, ``"title"``, ``"text"``, ``"bias"``,
        ``"hyperpartisan"``.
    """
    labels = parse_labels(label_path)

    # Resolve the transform once. Use the callable that was actually passed
    # rather than always falling back to the module-level default.
    if not preprocess:
        transform = None
    elif callable(preprocess):
        transform = preprocess
    else:
        transform = preprocess_article

    for article_id, title, text in parse_articles(article_path):
        label = labels.get(article_id)
        if label is None:
            continue

        if transform is not None:
            text = transform(title, text)

        yield {
            "id": article_id,
            "title": title,
            "text": text,
            "bias": label["bias"],
            "hyperpartisan": label["hyperpartisan"],
        }


In [5]:
class ArticleDataset(IterableDataset):
    """Iterable dataset that streams preprocessed, labelled articles.

    Every call to ``__iter__`` starts a fresh pass through ``load_dataset``
    with ``preprocess_article`` as the preprocessing step.
    """

    def __init__(self, article_path, label_path):
        # Only the file locations are stored; nothing is read here.
        self.article_path, self.label_path = article_path, label_path

    def __iter__(self):
        samples = load_dataset(
            self.article_path,
            self.label_path,
            preprocess=preprocess_article,
        )
        for sample in samples:
            yield sample

### Tokenization

In [None]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace

# Load the pretrained tokenizer, then replace its pre-tokenizer with the
# plain Whitespace splitter; hf_tokenize only uses the pre-tokenization step.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
tokenizer.pre_tokenizer = Whitespace()


def hf_tokenize(text):
    """Pre-tokenize ``text`` with the module-level tokenizer's pre-tokenizer.

    Returns whatever ``pre_tokenize_str`` produces; callers in this notebook
    unpack each item as a ``(piece, offsets)`` pair.
    """
    splitter = tokenizer.pre_tokenizer
    return splitter.pre_tokenize_str(text)


def build_vocab_from_stream(dataset_stream, min_freq=3, max_size=None, total=None):
    """Build a token -> index vocabulary from a stream of samples.

    Parameters
    ----------
    dataset_stream : iterable of dict
        Each sample must have a ``"text"`` entry; it is tokenized with
        ``hf_tokenize``.
    min_freq : int
        Tokens seen fewer than ``min_freq`` times are left out.
    max_size : int, optional
        Upper bound on the vocabulary size, counting the two special tokens.
        ``None`` (or 0) means no bound.
    total : int, optional
        Expected number of samples, used only for the progress bar. With
        ``None`` tqdm shows a running count (or uses ``len()`` when the
        stream has one) instead of a percentage against a fixed size.

    Returns
    -------
    dict
        ``{"<pad>": 0, "<unk>": 1, token: index, ...}`` with tokens indexed
        in descending frequency order.
    """
    counter = Counter()

    for sample in tqdm(dataset_stream, desc="Building vocab", total=total):
        counter.update(w for w, _ in hf_tokenize(sample["text"]))

    vocab = {"<pad>": 0, "<unk>": 1}

    # most_common() is sorted by descending count, so the first token below
    # min_freq means every later one is below it too.
    for token, freq in counter.most_common():
        if freq < min_freq:
            break
        if max_size and len(vocab) >= max_size:
            break

        vocab[token] = len(vocab)

    return vocab


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 16

# Directory that holds the article and ground-truth XML files.
dataPath = "./Dataset"

# Training split: articles and their by-publisher labels.
train_article_path, train_label_path = (
    os.path.join(dataPath, "train-articles.xml"),
    os.path.join(dataPath, "ground-truth-training-bypublisher-20181122.xml"),
)

# Validation split: articles and their by-publisher labels.
val_article_path, val_label_path = (
    os.path.join(dataPath, "val-articles.xml"),
    os.path.join(dataPath, "ground-truth-validation-bypublisher-20181122.xml"),
)

# Streaming datasets; the files are read when the datasets are iterated.
train_dataset = ArticleDataset(train_article_path, train_label_path)
val_dataset = ArticleDataset(val_article_path, val_label_path)

In [8]:
print("Building vocab...")

# One full pass over the preprocessed training articles; tokens seen fewer
# than 3 times are dropped and the vocabulary is capped at 50000 entries.
vocab_stream = load_dataset(train_article_path, train_label_path, preprocess_article)
vocab = build_vocab_from_stream(vocab_stream, min_freq=3, max_size=50000)

print("Vocab size:", len(vocab))


Building vocab...


Building vocab:  80%|████████  | 600000/750000 [08:00<02:00, 1249.71it/s]


Vocab size: 50000


In [10]:
def numericalize(tokens, vocab):
    """Map each token to its vocabulary index.

    Tokens missing from ``vocab`` get the index stored under ``"<unk>"``.
    ``vocab`` must contain the ``"<unk>"`` key.
    """
    fallback = vocab["<unk>"]
    indices = []
    for token in tokens:
        indices.append(vocab.get(token, fallback))
    return indices

In [11]:
import torch

def collate_batch(batch, vocab, max_len=256):
    """Turn a list of article samples into padded id and label tensors.

    Each sample's ``"text"`` is tokenized with ``hf_tokenize`` — the same
    tokenizer ``build_vocab_from_stream`` uses — so that tokens line up with
    the vocabulary entries. Sequences are truncated to ``max_len`` tokens and
    right-padded with the ``"<pad>"`` id to exactly ``max_len``.

    Parameters
    ----------
    batch : list of dict
        Samples with ``"text"`` and ``"hyperpartisan"`` entries.
    vocab : dict
        Token -> index mapping containing ``"<pad>"`` and ``"<unk>"``.
    max_len : int
        Fixed output sequence length.

    Returns
    -------
    (torch.LongTensor, torch.LongTensor)
        Token ids of shape ``(len(batch), max_len)`` and labels of shape
        ``(len(batch),)``; a label is 1 when ``"hyperpartisan"`` equals the
        string ``"true"`` and 0 otherwise.
    """
    pad_id = vocab["<pad>"]
    texts = []
    labels = []

    for sample in batch:
        tokens = [piece for piece, _ in hf_tokenize(sample["text"])]
        ids = numericalize(tokens[:max_len], vocab)

        # ids holds at most max_len entries, so the pad count is never negative.
        ids.extend([pad_id] * (max_len - len(ids)))
        texts.append(ids)

        # choose target — here: hyperpartisan binary
        labels.append(1 if sample["hyperpartisan"] == "true" else 0)

    return (
        torch.tensor(texts, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long)
    )


In [12]:
import torch

def collate_batch(batch, vocab, max_len=256):
    """Turn a list of article samples into padded id and label tensors.

    Each sample's ``"text"`` is tokenized with ``hf_tokenize`` — the same
    tokenizer ``build_vocab_from_stream`` uses — so that tokens line up with
    the vocabulary entries. Sequences are truncated to ``max_len`` tokens and
    right-padded with the ``"<pad>"`` id to exactly ``max_len``.

    Parameters
    ----------
    batch : list of dict
        Samples with ``"text"`` and ``"hyperpartisan"`` entries.
    vocab : dict
        Token -> index mapping containing ``"<pad>"`` and ``"<unk>"``.
    max_len : int
        Fixed output sequence length.

    Returns
    -------
    (torch.LongTensor, torch.LongTensor)
        Token ids of shape ``(len(batch), max_len)`` and labels of shape
        ``(len(batch),)``; a label is 1 when ``"hyperpartisan"`` equals the
        string ``"true"`` and 0 otherwise.
    """
    pad_id = vocab["<pad>"]
    texts = []
    labels = []

    for sample in batch:
        tokens = [piece for piece, _ in hf_tokenize(sample["text"])]
        ids = numericalize(tokens[:max_len], vocab)

        # ids holds at most max_len entries, so the pad count is never negative.
        ids.extend([pad_id] * (max_len - len(ids)))
        texts.append(ids)

        # choose target — here: hyperpartisan binary
        labels.append(1 if sample["hyperpartisan"] == "true" else 0)

    return (
        torch.tensor(texts, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long)
    )


In [14]:
from functools import partial



# Both loaders share one collate function: sequences are cut/padded to 256
# token ids using the vocabulary built above.
_collate = partial(collate_batch, vocab=vocab, max_len=256)

# Data is loaded in the main process (num_workers=0).
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=0,
    persistent_workers=False,
    collate_fn=_collate,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=0,
    persistent_workers=False,
    collate_fn=_collate,
)



In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class TextCNN(nn.Module):
    """Convolutional text classifier with parallel kernel widths.

    Token ids are embedded, passed through one ``Conv1d`` branch per entry of
    ``kernel_sizes``, max-pooled over the sequence axis, concatenated,
    regularized with dropout (p=0.5) and projected to ``num_classes`` logits.

    Parameters
    ----------
    vocab_size, embed_dim : int
        Shape of the embedding table.
    num_classes : int
        Number of output logits.
    kernel_sizes : sequence of int
        Width of each convolution branch.
    num_filters : int
        Output channels per branch.
    padding_idx : int
        Index passed to ``nn.Embedding`` as ``padding_idx``.
    pretrained_embeddings : tensor, optional
        Copied into the embedding weights when given.
    freeze_embeddings : bool
        When true, the embedding weights are excluded from gradient updates.
    """

    def __init__(self, vocab_size, embed_dim, num_classes=2,
                 kernel_sizes=(3,4,5), num_filters=100,
                 padding_idx=0, pretrained_embeddings=None,
                 freeze_embeddings=False):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)

        # Optional warm start from pretrained vectors (e.g. GloVe).
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)

        # Optionally keep the embedding table fixed during training.
        if freeze_embeddings:
            self.embedding.weight.requires_grad = False

        # One convolution branch per kernel width.
        branches = [
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=width)
            for width in kernel_sizes
        ]
        self.convs = nn.ModuleList(branches)

        self.dropout = nn.Dropout(0.5)

        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def _pooled_features(self, conv, embedded):
        """Apply one conv branch, ReLU, and max-pool over the whole sequence."""
        activated = F.relu(conv(embedded))                   # (batch, num_filters, L_out)
        pooled = F.max_pool1d(activated, activated.size(2))  # (batch, num_filters, 1)
        return pooled.squeeze(2)

    def forward(self, x):
        """Return class logits for a ``(batch, seq_len)`` tensor of token ids."""
        # (batch, seq, embed) -> (batch, embed, seq), the layout Conv1d expects.
        embedded = self.embedding(x).transpose(1, 2)

        per_branch = [self._pooled_features(conv, embedded) for conv in self.convs]

        features = torch.cat(per_branch, dim=1)  # (batch, num_filters * n_kernels)
        return self.fc(self.dropout(features))


In [16]:
# Embedding width and number of output classes for the classifier.
embed_dim = 300
num_classes = 2  # partisan / not partisan

# Randomly initialized, trainable embeddings; the <pad> row is the padding index.
model = TextCNN(
    len(vocab),
    embed_dim,
    num_classes=num_classes,
    padding_idx=vocab["<pad>"],
    pretrained_embeddings=None,   # or pass GloVe tensor later
    freeze_embeddings=False,
)


In [17]:
# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)

# Cross-entropy over the class logits, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)


In [18]:
def train_one_epoch(loader):
    """Run one optimization pass over ``loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``device``. ``loader`` must yield ``(tokens, labels)`` tensor pairs.

    Returns
    -------
    (float, float)
        Sample-weighted average loss and accuracy over all batches seen.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0, 0, 0

    for tokens, labels in loader:
        tokens, labels = tokens.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(tokens)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so a smaller final batch does
        # not skew the epoch average.
        batch_n = labels.size(0)
        loss_sum += loss.item() * batch_n
        n_correct += (logits.argmax(1) == labels).sum().item()
        n_seen += batch_n

    return loss_sum / n_seen, n_correct / n_seen


In [19]:
def eval_epoch(loader):
    """Evaluate the module-level ``model`` on ``loader`` without gradients.

    Uses the module-level ``criterion`` and ``device``. ``loader`` must yield
    ``(tokens, labels)`` tensor pairs.

    Returns
    -------
    (float, float)
        Sample-weighted average loss and accuracy over all batches seen.
    """
    with torch.no_grad():
        model.eval()
        loss_sum, n_correct, n_seen = 0, 0, 0

        for tokens, labels in loader:
            tokens, labels = tokens.to(device), labels.to(device)

            logits = model(tokens)
            loss = criterion(logits, labels)

            batch_n = labels.size(0)
            loss_sum += loss.item() * batch_n
            n_correct += (logits.argmax(1) == labels).sum().item()
            n_seen += batch_n

        return loss_sum / n_seen, n_correct / n_seen


In [None]:
# Number of full passes over the training data.
epochs = 5

# Each epoch: one training pass, then one evaluation pass on the validation
# loader, followed by a short report of both loss/accuracy pairs.
for epoch in range(epochs):
    train_loss, train_acc = train_one_epoch(train_loader)
    val_loss, val_acc = eval_epoch(val_loader)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"  Train loss: {train_loss:.4f}  acc: {train_acc:.4f}")
    print(f"  Val   loss: {val_loss:.4f}  acc: {val_acc:.4f}")
