# Dataset Exploration - SemEval-2019 Task 4
Exploring the hyperpartisan news detection dataset before and after preprocessing.

In [None]:
from lxml import etree
from collections import Counter
import config

## 1. Raw Data Exploration

In [None]:
def parse_raw_articles(xml_path, limit=5):
    """Stream the first ``limit`` ``<article>`` elements out of an XML file.

    The file is read incrementally with ``etree.iterparse`` and parsing stops
    as soon as enough articles have been collected.

    Args:
        xml_path: Source handed straight to ``etree.iterparse``.
        limit: Maximum number of articles to return. ``None`` returns every
            article in the file; zero or a negative value returns ``[]``.

    Returns:
        A list of dicts with the keys ``'id'``, ``'title'``, ``'published'``
        (taken from the element's ``id``, ``title`` and ``published-at``
        attributes) and ``'raw_xml'`` (the pretty-printed element, truncated
        to 2000 characters).
    """
    articles = []
    if limit is not None and limit <= 0:
        return articles

    context = etree.iterparse(xml_path, events=('end',), tag='article')
    for _event, elem in context:
        articles.append({
            'id': elem.get('id'),
            'title': elem.get('title', ''),
            'published': elem.get('published-at', ''),
            'raw_xml': etree.tostring(elem, encoding='unicode', pretty_print=True)[:2000]
        })
        # clear() only empties the element; it stays attached to its parent.
        # Dropping the already-processed siblings keeps memory flat on big files.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
        # Check the quota here so the parser is never asked for an article
        # that would just be thrown away.
        if limit is not None and len(articles) >= limit:
            break
    return articles

def parse_raw_labels(xml_path):
    """Read every ``<article>`` element of a labels XML file into a dict.

    The file is streamed with ``etree.iterparse``; processed elements are
    released as the parse advances so memory use does not grow with file size.

    Args:
        xml_path: Source handed straight to ``etree.iterparse``.

    Returns:
        A dict keyed by the element's ``id`` attribute. Each value is a dict
        with ``'hyperpartisan'`` and ``'labeled_by'`` (raw attribute values,
        ``None`` when the attribute is absent) and ``'url'`` (``''`` when
        absent). If an id occurs more than once, the last occurrence wins.
    """
    labels = {}
    context = etree.iterparse(xml_path, events=('end',), tag='article')
    for _event, elem in context:
        labels[elem.get('id')] = {
            'hyperpartisan': elem.get('hyperpartisan'),
            'labeled_by': elem.get('labeled-by'),
            'url': elem.get('url', '')
        }
        # clear() empties the element but leaves it attached to its parent;
        # also delete the earlier siblings so the partial tree stays small.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    return labels

In [None]:
# Pull a three-article sample of the raw training XML and the complete
# training label table.
print("Loading raw training data...")
raw_articles = parse_raw_articles(xml_path=config.ARTICLES_TRAIN, limit=3)
raw_labels = parse_raw_labels(xml_path=config.LABELS_TRAIN)
print(f"Total labels: {len(raw_labels)}")

In [None]:
# Dump each sampled raw article next to the label recorded for its id.
banner = "=" * 80
print(banner)
print("SAMPLE RAW ARTICLES")
print(banner)
for article in raw_articles:
    # Articles without a label entry fall back to an empty dict.
    info = raw_labels.get(article['id'], {})
    title = article['title']
    print(f"\nID: {article['id']}")
    # Titles longer than 100 characters are cut and marked with an ellipsis.
    if len(title) > 100:
        print(f"Title: {title[:100]}...")
    else:
        print(f"Title: {title}")
    print(f"Published: {article['published']}")
    print(f"Hyperpartisan: {info.get('hyperpartisan')}")
    print("\nRaw XML (truncated):")
    print(article['raw_xml'][:1000])
    print("-" * 80)

## 2. Label Distribution

In [None]:
# Tally the raw 'hyperpartisan' attribute across all training labels and
# report each value's share of the total.
label_counts = Counter()
for entry in raw_labels.values():
    label_counts[entry['hyperpartisan']] += 1
n_train_labels = len(raw_labels)
print("Training set label distribution:")
for label, count in label_counts.items():
    share = count / n_train_labels * 100
    print(f"  {label}: {count} ({share:.1f}%)")

In [None]:
# Same tally as for the training labels, computed on the test labels.
test_labels = parse_raw_labels(config.LABELS_TEST)
test_counts = Counter()
for entry in test_labels.values():
    test_counts[entry['hyperpartisan']] += 1
n_test_labels = len(test_labels)
print("Test set label distribution:")
for label, count in test_counts.items():
    share = count / n_test_labels * 100
    print(f"  {label}: {count} ({share:.1f}%)")

## 3. Run Preprocessing

In [None]:
from preprocess import preprocess_and_cache, load_cached_data

# Preprocessing is skipped when the training pickle is already on disk.
# NOTE(review): only the train cache file is checked here; presumably
# preprocess_and_cache() writes every split together -- confirm.
train_cache_path = config.CACHE_DIR / "train_data.pkl"
if train_cache_path.exists():
    print("Cache exists, loading...")
else:
    print("Cache not found, running preprocessing...")
    preprocess_and_cache()
In [None]:
# Load both cached splits (train first, then test) and report their sizes.
train_data, test_data = load_cached_data('train'), load_cached_data('test')
for split_name, split_data in (("Train", train_data), ("Test", test_data)):
    print(f"{split_name} samples: {len(split_data)}")

## 4. Preprocessed Data Exploration

In [None]:
# Print the first three preprocessed training samples field by field.
rule = "=" * 80
print(rule)
print("SAMPLE PREPROCESSED ARTICLES")
print(rule)
for item in train_data[:3]:
    title = item['title']
    tokens = item['tokens']
    # Any truthy label is described as hyperpartisan.
    label_name = 'hyperpartisan' if item['label'] else 'not hyperpartisan'
    print(f"\nID: {item['id']}")
    # Titles longer than 100 characters are cut and marked with an ellipsis.
    if len(title) > 100:
        print(f"Title: {title[:100]}...")
    else:
        print(f"Title: {title}")
    print(f"Label: {item['label']} ({label_name})")
    print(f"Num tokens: {len(tokens)}")
    print(f"Num hyperlinks: {len(item['hyperlinks'])}")
    print(f"\nFirst 50 tokens: {tokens[:50]}")
    print(f"\nCleaned text (first 500 chars): {item['text'][:500]}...")
    print("-" * 80)

## 5. Token Statistics

In [None]:
from statistics import median

# Number of tokens in every training sample.
token_lengths = [len(d['tokens']) for d in train_data]
print("Token length statistics (train):")
print(f"  Min: {min(token_lengths)}")
print(f"  Max: {max(token_lengths)}")
print(f"  Mean: {sum(token_lengths)/len(token_lengths):.1f}")
# statistics.median averages the two middle values when the sample count is
# even; indexing sorted(...)[n // 2] would report the upper-middle value.
print(f"  Median: {median(token_lengths)}")

In [None]:
# Flatten every training sample's tokens into one list, in sample order,
# then count how often each token occurs.
all_tokens = [token for sample in train_data for token in sample['tokens']]
word_freq = Counter(all_tokens)
print(f"\nTotal tokens: {len(all_tokens)}")
print(f"Unique tokens: {len(word_freq)}")
print("\nTop 20 most common words:")
for token, occurrences in word_freq.most_common(20):
    print(f"  {token}: {occurrences}")

## 6. Hyperlink Statistics

In [None]:
# Number of hyperlinks attached to every training sample.
hyperlink_counts = [len(sample['hyperlinks']) for sample in train_data]
articles_with_links = len([n for n in hyperlink_counts if n > 0])
total_links = sum(hyperlink_counts)
print("Hyperlink statistics (train):")
print(f"  Articles with links: {articles_with_links}")
print(f"  Total links: {total_links}")
print(f"  Mean per article: {total_links/len(hyperlink_counts):.1f}")
print(f"  Max: {max(hyperlink_counts)}")

In [None]:
# Count hyperlinks by their 'type' field across the whole training split.
link_types = Counter(
    link['type']
    for sample in train_data
    for link in sample['hyperlinks']
)
print(f"\nLink types: {dict(link_types)}")

## 7. Compare Hyperpartisan vs Non-Hyperpartisan

In [None]:
def _mean_or_nan(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    return sum(values) / len(values) if values else float('nan')


# Split the training samples by label. Samples whose label compares equal to
# neither 1 nor 0 end up in neither group.
hyper = [d for d in train_data if d['label'] == 1]
non_hyper = [d for d in train_data if d['label'] == 0]

print(f"Hyperpartisan articles: {len(hyper)}")
print(f"Non-hyperpartisan articles: {len(non_hyper)}")

hyper_len = [len(d['tokens']) for d in hyper]
non_hyper_len = [len(d['tokens']) for d in non_hyper]

# An empty class prints "nan" instead of raising ZeroDivisionError.
print(f"\nAvg tokens (hyperpartisan): {_mean_or_nan(hyper_len):.1f}")
print(f"Avg tokens (non-hyperpartisan): {_mean_or_nan(non_hyper_len):.1f}")

hyper_links = [len(d['hyperlinks']) for d in hyper]
non_hyper_links = [len(d['hyperlinks']) for d in non_hyper]

print(f"\nAvg hyperlinks (hyperpartisan): {_mean_or_nan(hyper_links):.1f}")
print(f"Avg hyperlinks (non-hyperpartisan): {_mean_or_nan(non_hyper_links):.1f}")