# Dataset Exploration - SemEval-2019 Task 4
This notebook explores the hyperpartisan news detection dataset, first as raw XML and then after preprocessing.

In [1]:
import statistics
from collections import Counter

from lxml import etree

import config

## 1. Raw Data Exploration

In [2]:
def parse_raw_articles(xml_path, limit=5):
    """Stream the first ``limit`` ``<article>`` elements out of an XML file.

    Args:
        xml_path: XML source handed to ``etree.iterparse``.
        limit: Maximum number of articles to return. ``None`` returns every
            article; zero or a negative value returns an empty list.

    Returns:
        A list of dicts, one per article, with keys ``'id'``, ``'title'``
        (``''`` when the attribute is absent), ``'published'`` (taken from the
        ``published-at`` attribute, ``''`` when absent) and ``'raw_xml'`` (the
        pretty-printed serialisation of the element, cut to 2000 characters).
    """
    articles = []
    context = etree.iterparse(xml_path, events=('end',), tag='article')
    if limit is not None and limit <= 0:
        return articles

    for _event, elem in context:
        articles.append({
            'id': elem.get('id'),
            'title': elem.get('title', ''),
            'published': elem.get('published-at', ''),
            'raw_xml': etree.tostring(elem, encoding='unicode', pretty_print=True)[:2000]
        })

        # clear() empties the element but leaves it attached to its parent, so
        # also drop the already-processed siblings to keep memory flat.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]

        # Stop right after the last wanted article instead of parsing one more.
        if limit is not None and len(articles) >= limit:
            break
    return articles

def parse_raw_labels(xml_path):
    """Read every ``<article>`` element of a label XML file into a dict.

    Args:
        xml_path: XML source handed to ``etree.iterparse``.

    Returns:
        A dict keyed by each element's ``id`` attribute. Every value is a dict
        with ``'hyperpartisan'`` and ``'labeled_by'`` (the raw ``hyperpartisan``
        and ``labeled-by`` attribute values, ``None`` when absent) and ``'url'``
        (``''`` when absent).
    """
    labels = {}
    context = etree.iterparse(xml_path, events=('end',), tag='article')
    for _event, elem in context:
        labels[elem.get('id')] = {
            'hyperpartisan': elem.get('hyperpartisan'),
            'labeled_by': elem.get('labeled-by'),
            'url': elem.get('url', '')
        }

        # clear() empties the element but leaves it attached to its parent, so
        # also drop the already-processed siblings to keep memory flat.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
    return labels

In [3]:
# Pull a three-article sample of the raw training XML and the full label table.
print("Loading raw training data...")
raw_articles = parse_raw_articles(xml_path=config.ARTICLES_TRAIN, limit=3)
raw_labels = parse_raw_labels(xml_path=config.LABELS_TRAIN)
print("Total labels:", len(raw_labels))

Loading raw training data...
Total labels: 645


In [4]:
# Print each sampled article beside its label and a capped preview of its XML.
print("=" * 80, "SAMPLE RAW ARTICLES", "=" * 80, sep="\n")
for art in raw_articles:
    # Articles without a label entry fall back to an empty dict.
    label_info = raw_labels.get(art['id'], {})
    print(f"\nID: {art['id']}")
    # Titles longer than 100 characters are cut and marked with an ellipsis.
    if len(art['title']) > 100:
        print(f"Title: {art['title'][:100]}...")
    else:
        print(f"Title: {art['title']}")
    print(f"Published: {art['published']}")
    print(f"Hyperpartisan: {label_info.get('hyperpartisan')}")
    print("\nRaw XML (truncated):")
    print(art['raw_xml'][:1000])
    print("-" * 80)

SAMPLE RAW ARTICLES

ID: 0000000
Title: Kucinich: Reclaiming the money power
Published: 2017-09-10
Hyperpartisan: true

Raw XML (truncated):
<article id="0000000" published-at="2017-09-10" title="Kucinich: Reclaiming the money power">
From flickr.com: Money {MID-161793} <p>Money ( <a href="https://farm8.static.flickr.com/7020/6551534889_9c8ae52997.jpg" type="external">Image</a> by <a href="https://www.flickr.com/people/68751915@N05/" type="external">401(K) 2013</a>) <a href="https://creativecommons.org/licenses/by-sa/2.0/" type="external">Permission</a> <a type="internal">Details</a> <a type="internal">DMCA</a></p> No Pill Can Stop Tinnitus, But This 1 Weird Trick Can <p>The walls are closing in on Congress.</p> <p>Terrifying walls of water from Hurricanes Harvey and Irma, which, when the damage is totaled, could rise to a half trillion dollars. The Walls of War: The multi-trillion dollar ongoing cost of Afghanistan, Iraq and other interventions. The crumbling walls of the U.S. infrast

## 2. Label Distribution

In [5]:
# Count how often each `hyperpartisan` value occurs among the training labels.
label_counts = Counter(info['hyperpartisan'] for info in raw_labels.values())
print("Training set label distribution:")
for label, count in label_counts.items():
    print("  {}: {} ({:.1f}%)".format(label, count, count/len(raw_labels)*100))

Training set label distribution:
  true: 238 (36.9%)
  false: 407 (63.1%)


In [6]:
# Read the test labels and count how often each `hyperpartisan` value occurs.
test_labels = parse_raw_labels(xml_path=config.LABELS_TEST)
test_counts = Counter(info['hyperpartisan'] for info in test_labels.values())
print("Test set label distribution:")
for label, count in test_counts.items():
    print("  {}: {} ({:.1f}%)".format(label, count, count/len(test_labels)*100))

Test set label distribution:
  false: 314 (50.0%)
  true: 314 (50.0%)


## 3. Run Preprocessing

In [7]:
from preprocess import preprocess_and_cache, load_cached_data

# Only the training cache file is checked; preprocessing runs when it is missing.
if (config.CACHE_DIR / "train_data.pkl").exists():
    print("Cache exists, loading...")
else:
    print("Cache not found, running preprocessing...")
    preprocess_and_cache()

Cache exists, loading...


In [8]:
# Load both cached splits, train first, then report their sizes.
train_data, test_data = (load_cached_data(split) for split in ('train', 'test'))
print("Train samples:", len(train_data))
print("Test samples:", len(test_data))

Train samples: 645
Test samples: 628


## 4. Preprocessed Data Exploration

In [9]:
# Show the first three preprocessed training samples.
print("=" * 80, "SAMPLE PREPROCESSED ARTICLES", "=" * 80, sep="\n")
for sample in train_data[:3]:
    print(f"\nID: {sample['id']}")
    # Titles longer than 100 characters are cut and marked with an ellipsis.
    if len(sample['title']) > 100:
        print(f"Title: {sample['title'][:100]}...")
    else:
        print(f"Title: {sample['title']}")
    # A truthy label is reported as hyperpartisan.
    if sample['label']:
        print(f"Label: {sample['label']} (hyperpartisan)")
    else:
        print(f"Label: {sample['label']} (not hyperpartisan)")
    print(f"Num tokens: {len(sample['tokens'])}")
    print(f"Num hyperlinks: {len(sample['hyperlinks'])}")
    print(f"\nFirst 50 tokens: {sample['tokens'][:50]}")
    print(f"\nCleaned text (first 500 chars): {sample['text'][:500]}...")
    print("-" * 80)

SAMPLE PREPROCESSED ARTICLES

ID: 0000000
Title: Kucinich: Reclaiming the money power
Label: 1 (hyperpartisan)
Num tokens: 1465
Num hyperlinks: 6

First 50 tokens: ['kucinich', 'reclaiming', 'the', 'money', 'power', 'from', 'flickr', 'com', 'money', 'mid', 'money', 'image', 'by', 'k', 'permission', 'details', 'dmca', 'no', 'pill', 'can', 'stop', 'tinnitus', 'but', 'this', 'weird', 'trick', 'can', 'the', 'walls', 'are', 'closing', 'in', 'on', 'congress', 'terrifying', 'walls', 'of', 'water', 'from', 'hurricanes', 'harvey', 'and', 'irma', 'which', 'when', 'the', 'damage', 'is', 'totaled', 'could']

Cleaned text (first 500 chars): kucinich reclaiming the money power from flickr com money mid money image by k permission details dmca no pill can stop tinnitus but this weird trick can the walls are closing in on congress terrifying walls of water from hurricanes harvey and irma which when the damage is totaled could rise to a half trillion dollars the walls of war the multi trillion dollar o

## 5. Token Statistics

In [10]:
# Number of tokens in each training sample.
token_lengths = [len(d['tokens']) for d in train_data]
print("Token length statistics (train):")
print(f"  Min: {min(token_lengths)}")
print(f"  Max: {max(token_lengths)}")
print(f"  Mean: {sum(token_lengths)/len(token_lengths):.1f}")
# statistics.median averages the two middle values when the sample count is
# even; indexing sorted(...)[n // 2] would report the upper-middle value.
print(f"  Median: {statistics.median(token_lengths)}")

Token length statistics (train):
  Min: 24
  Max: 5649
  Mean: 583.6
  Median: 425


In [11]:
# Flatten every training sample's tokens into one list, then count them.
all_tokens = [token for sample in train_data for token in sample['tokens']]
word_freq = Counter(all_tokens)
print(f"\nTotal tokens: {len(all_tokens)}")
print(f"Unique tokens: {len(word_freq)}")
print("\nTop 20 most common words:")
for word, count in word_freq.most_common(20):
    print("  {}: {}".format(word, count))


Total tokens: 376425
Unique tokens: 21090

Top 20 most common words:
  the: 20247
  to: 10720
  of: 8781
  and: 8360
  a: 7659
  in: 6516
  that: 5524
  s: 4700
  is: 4198
  for: 3152
  on: 2968
  it: 2959
  he: 2817
  was: 2690
  trump: 2688
  this: 2350
  with: 2249
  as: 2222
  i: 1923
  his: 1904


## 6. Hyperlink Statistics

In [12]:
# Number of hyperlinks recorded for each training sample.
hyperlink_counts = [len(sample['hyperlinks']) for sample in train_data]
print("Hyperlink statistics (train):")
# Summing booleans counts the samples that have at least one link.
print(f"  Articles with links: {sum(n > 0 for n in hyperlink_counts)}")
print(f"  Total links: {sum(hyperlink_counts)}")
print(f"  Mean per article: {sum(hyperlink_counts)/len(hyperlink_counts):.1f}")
print(f"  Max: {max(hyperlink_counts)}")

Hyperlink statistics (train):
  Articles with links: 406
  Total links: 2776
  Mean per article: 4.3
  Max: 277


In [13]:
# Tally the `type` field of every hyperlink across the training samples.
link_types = Counter(
    link['type']
    for sample in train_data
    for link in sample['hyperlinks']
)
print(f"\nLink types: {dict(link_types)}")


Link types: {'external': 2776}


## 7. Compare Hyperpartisan vs Non-Hyperpartisan

In [14]:
# Split the training samples by label: 1 is hyperpartisan, 0 is not.
hyper = [sample for sample in train_data if sample['label'] == 1]
non_hyper = [sample for sample in train_data if sample['label'] == 0]

# Per-sample token counts and hyperlink counts for each group.
hyper_len = [len(sample['tokens']) for sample in hyper]
non_hyper_len = [len(sample['tokens']) for sample in non_hyper]
hyper_links = [len(sample['hyperlinks']) for sample in hyper]
non_hyper_links = [len(sample['hyperlinks']) for sample in non_hyper]

print(f"Hyperpartisan articles: {len(hyper)}")
print(f"Non-hyperpartisan articles: {len(non_hyper)}")

print(f"\nAvg tokens (hyperpartisan): {sum(hyper_len)/len(hyper_len):.1f}")
print(f"Avg tokens (non-hyperpartisan): {sum(non_hyper_len)/len(non_hyper_len):.1f}")

print(f"\nAvg hyperlinks (hyperpartisan): {sum(hyper_links)/len(hyper_links):.1f}")
print(f"Avg hyperlinks (non-hyperpartisan): {sum(non_hyper_links)/len(non_hyper_links):.1f}")

Hyperpartisan articles: 238
Non-hyperpartisan articles: 407

Avg tokens (hyperpartisan): 820.4
Avg tokens (non-hyperpartisan): 445.1

Avg hyperlinks (hyperpartisan): 4.5
Avg hyperlinks (non-hyperpartisan): 4.2
