In [1]:
import os
import json
from pathlib import Path
from collections import defaultdict, Counter
import pandas as pd
from datetime import datetime
from typing import Dict, List, Tuple

In [2]:
# Dataset location, resolved relative to the notebook's working directory.
root_path = Path("all-rnr-annotated-threads")

# One dict per discovered thread folder: path, event, category, thread_id.
threads_info = list()

# Parsed JSON samples, keyed by file name, directory name, or 'threads'.
sample_data = defaultdict(list)

# Human-readable problem descriptions, keyed by issue type.
issues = defaultdict(list)

# Two-level counters: stats[group][name] starts at 0 on first access.
stats = defaultdict(lambda: defaultdict(int))

In [3]:
def _json_files_clean(dir_path: Path):
    """Return only real JSONs (skip macOS AppleDouble like '._*')."""
    return [p for p in dir_path.glob("*.json") if not p.name.startswith("._")]


In [4]:
def map_structure() -> None:
    """Walk ``root_path`` and record every thread folder found.

    Expected layout: ``<root>/<event>/<rumours|non-rumours>/<thread_id>/``.

    Side effects on module-level state:
      * ``stats['events']``, ``stats['categories']``, ``stats['total']`` are
        incremented with thread counts.
      * ``threads_info`` gains one dict per thread folder.
      * ``issues['critical']`` / ``issues['missing_dirs']`` receive problems.

    Prints a short summary unless the root path is missing, in which case it
    records a critical issue and returns early.
    """
    if not root_path.exists():
        issues['critical'].append(f"Root path does not exist: {root_path}")
        return

    # iterdir() yields children in arbitrary order. Sort so that threads_info
    # (and therefore which threads get sampled later) is reproducible.
    event_dirs = sorted(d for d in root_path.iterdir() if d.is_dir())

    for event_dir in event_dirs:
        event_name = event_dir.name
        stats['events'][event_name] = 0

        for category in ('rumours', 'non-rumours'):
            category_path = event_dir / category

            # is_dir() rather than exists(): a stray *file* with this name
            # would otherwise make iterdir() below raise NotADirectoryError.
            if not category_path.is_dir():
                issues['missing_dirs'].append(f"{event_name}/{category}")
                continue

            thread_folders = sorted(
                d for d in category_path.iterdir() if d.is_dir()
            )
            thread_count = len(thread_folders)

            stats['events'][event_name] += thread_count
            stats['categories'][category] += thread_count
            stats['total']['threads'] += thread_count

            # Keep enough context per thread for later sampling.
            threads_info.extend(
                {
                    'path': thread_folder,
                    'event': event_name,
                    'category': category,
                    'thread_id': thread_folder.name,
                }
                for thread_folder in thread_folders
            )

    print(f"   Found {len(event_dirs)} events")
    print(f"   Total threads: {stats['total']['threads']}")
    print(f"   Rumours: {stats['categories']['rumours']}")
    print(f"   Non-rumours: {stats['categories']['non-rumours']}")

In [5]:
def _read_json(path: Path):
    """Parse ``path`` as JSON, decoding with 'utf-8-sig' so a leading BOM is dropped."""
    with open(path, 'r', encoding='utf-8-sig') as f:
        return json.load(f)


def examine_thread(thread_info: Dict) -> None:
    """Inspect one thread folder and record what it contains.

    Args:
        thread_info: Dict with keys 'path' (thread folder), 'thread_id',
            'event' and 'category', as built by ``map_structure``.

    Side effects on module-level state:
      * ``stats['files']`` / ``stats['dir_counts']`` are incremented.
      * ``sample_data`` keeps up to 3 parsed samples per expected file or
        directory, and up to 10 per-thread summaries under 'threads'.
      * ``issues`` receives 'missing_files', 'missing_dirs' and
        'json_errors' entries.
    """
    thread_path = thread_info['path']
    thread_id = thread_info['thread_id']

    thread_data = {
        'thread_id': thread_id,
        'event': thread_info['event'],
        'category': thread_info['category'],
        'files': {}
    }

    # --- Expected top-level files: every one present is parsed, so malformed
    # JSON is reported even after the sample quota is full.
    for file_name in ('annotation.json', 'structure.json'):
        file_path = thread_path / file_name
        if not file_path.is_file():
            thread_data['files'][file_name] = 'missing'
            issues['missing_files'].append(f"{thread_id}/{file_name}")
            continue

        thread_data['files'][file_name] = 'present'
        stats['files'][file_name] += 1

        try:
            data = _read_json(file_path)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            issues['json_errors'].append(f"{thread_id}/{file_name}: {e}")
            continue

        if len(sample_data[file_name]) < 3:
            sample_data[file_name].append({
                'thread_id': thread_id,
                'data': data
            })

    # --- Expected sub-directories: count files, parse at most one per thread.
    for dir_name in ('reactions', 'source-tweets'):
        dir_path = thread_path / dir_name
        if not dir_path.is_dir():
            thread_data['files'][dir_name] = 'missing'
            issues['missing_dirs'].append(f"{thread_id}/{dir_name}")
            continue

        json_files = _json_files_clean(dir_path)
        count = len(json_files)
        thread_data['files'][dir_name] = f"{count} files"
        stats['dir_counts'][dir_name] += count

        if json_files and len(sample_data[dir_name]) < 3:
            # Pick by name rather than by position so the sampled file does
            # not depend on directory listing order.
            jf = min(json_files, key=lambda p: p.name)
            try:
                data = _read_json(jf)
            except (OSError, ValueError) as e:
                issues['json_errors'].append(f"{thread_id}/{dir_name}/{jf.name}: {e}")
            else:
                sample_data[dir_name].append({
                    'thread_id': thread_id,
                    'file': jf.name,
                    'data': data
                })

    # Keep a handful of per-thread summaries for the report.
    if len(sample_data['threads']) < 10:
        sample_data['threads'].append(thread_data)

In [6]:
def sample_threads(sample_size: int):
    """Examine up to ``sample_size`` threads per event/category combination.

    Threads are taken from the module-level ``threads_info`` in the order
    they were recorded; each selected thread is passed to ``examine_thread``.

    Args:
        sample_size: Maximum number of threads to examine for each
            (event, category) pair. Values <= 0 examine nothing.
    """
    # Group once instead of rescanning threads_info per event/category pair.
    grouped = defaultdict(list)
    for thread in threads_info:
        grouped[(thread['event'], thread['category'])].append(thread)

    # A negative slice bound would mean "all but the last N"; clamp instead.
    limit = max(sample_size, 0)

    # Sorted rather than set iteration: set order of strings changes between
    # interpreter runs, which would change which threads fill the capped
    # samples in sample_data.
    for event_name in sorted({event for event, _ in grouped}):
        for category in ('rumours', 'non-rumours'):
            for thread in grouped.get((event_name, category), [])[:limit]:
                examine_thread(thread)

In [7]:
def _record_unparseable_timestamps(samples, label, type_counts=None):
    """Validate 'created_at' in each sample against the Twitter date format.

    Args:
        samples: Iterable of sample dicts whose 'data' entry is parsed JSON.
        label: Prefix used in the recorded issue text.
        type_counts: Optional Counter; when given, the Python type name of
            every 'created_at' value found is tallied into it.
    """
    # Twitter format, e.g. "Wed Oct 10 20:19:24 +0000 2018".
    twitter_format = "%a %b %d %H:%M:%S %z %Y"

    for sample in samples:
        data = sample['data']
        if 'created_at' not in data:
            continue
        ts = data['created_at']
        if type_counts is not None:
            type_counts[type(ts).__name__] += 1
        # Only strings are parsed; other types are just tallied above.
        if not isinstance(ts, str):
            continue
        try:
            datetime.strptime(ts, twitter_format)
        except ValueError:
            # strptime raises ValueError on a format mismatch. Catching only
            # that keeps KeyboardInterrupt and real bugs visible.
            issues['timestamp_parsing'].append(f"{label}: {ts}")


def check_timestamps():
    """Check 'created_at' timestamps in sampled source tweets and reactions.

    Unparseable string timestamps are appended to
    ``issues['timestamp_parsing']``. ``stats['timestamp_formats']`` is set to
    a dict of type-name counts, tallied from source tweets only.
    """
    timestamp_formats = Counter()

    _record_unparseable_timestamps(
        sample_data['source-tweets'], "Source tweet", timestamp_formats
    )
    _record_unparseable_timestamps(sample_data['reactions'], "Reaction")

    stats['timestamp_formats'] = dict(timestamp_formats)


In [8]:
def check_user_ids():
    """Tally the Python type of ``user['id']`` across sampled source tweets.

    The result is stored as a plain dict in ``stats['user_id_types']``,
    mapping type name to count. Tweets without a 'user' entry, or whose
    user has no 'id', are skipped.
    """
    tweets = (entry['data'] for entry in sample_data['source-tweets'])
    id_type_names = Counter(
        type(tweet['user']['id']).__name__
        for tweet in tweets
        if 'user' in tweet and 'id' in tweet['user']
    )
    stats['user_id_types'] = dict(id_type_names)

In [9]:
def check_languages():
    """Count 'lang' values over sampled source tweets, then reactions.

    Stores the ten most common languages (language -> count) as a plain
    dict in ``stats['languages']``. Samples without a 'lang' key are ignored.
    """
    lang_counts = Counter()
    # Source tweets first, then reactions, so tie order in most_common()
    # follows first appearance across both groups.
    for source in ('source-tweets', 'reactions'):
        lang_counts.update(
            entry['data']['lang']
            for entry in sample_data[source]
            if 'lang' in entry['data']
        )

    stats['languages'] = dict(lang_counts.most_common(10))

In [10]:
def check_veracity_labels():
    """Count the values of the 'true' field in sampled annotation files.

    Each value is converted with ``str()`` before counting. The tally is
    stored as a plain dict in ``stats['veracity_labels']``; annotations
    without a 'true' key are skipped.
    """
    annotations = [entry['data'] for entry in sample_data['annotation.json']]
    label_counts = Counter(
        str(annotation['true'])
        for annotation in annotations
        if 'true' in annotation
    )
    stats['veracity_labels'] = dict(label_counts)

In [11]:
def check_quality():
    """Run every data-quality check, one after another."""
    quality_checks = (
        check_timestamps,       # timestamp formats
        check_user_ids,         # user ID consistency
        check_languages,        # language distribution
        check_veracity_labels,  # veracity labels
    )
    for run_check in quality_checks:
        run_check()

In [12]:
def generate_report(output_path=None) -> str:
    """Build the exploration report, print it, and optionally save it.

    Reads the module-level ``stats``, ``issues`` and ``sample_data``.

    Args:
        output_path: Optional path of a file to write the report to (UTF-8).
            Defaults to ``None``, in which case the report is only printed.

    Returns:
        The full report text that was printed.
    """
    banner = "=" * 80
    rule = "-" * 80

    report = [
        banner,
        "Dataset Exploration",
        banner,
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
    ]

    # 1. Overall statistics
    report.extend([
        "\nOverall stats",
        rule,
        f"Total threads: {stats['total']['threads']}",
        f"Rumours: {stats['categories']['rumours']}",
        f"Non-rumours: {stats['categories']['non-rumours']}",
    ])

    # 2. Event distribution, largest event first
    report.append("\n\n Event distribution")
    report.append(rule)
    for event, count in sorted(stats['events'].items(), key=lambda x: x[1], reverse=True):
        report.append(f"  {event}: {count} threads")

    # 3. File presence (counts cover examined threads only)
    report.extend([
        "\n\nFile stats",
        rule,
        f"annotation.json files: {stats['files'].get('annotation.json', 0)}",
        f"structure.json files: {stats['files'].get('structure.json', 0)}",
        f"Total reaction files: {stats['dir_counts'].get('reactions', 0)}",
        f"Total source-tweet files: {stats['dir_counts'].get('source-tweets', 0)}",
    ])

    # 4. Data quality issues: first five of each type, then a remainder count
    report.append("\n\n ⚠️  Data quality")
    report.append(rule)
    if not any(issues.values()):
        report.append("  No issues found!")
    else:
        for issue_type, issue_list in issues.items():
            if not issue_list:
                continue
            report.append(f"\n  {issue_type.upper()}: {len(issue_list)} issues")
            report.extend(f"    - {issue}" for issue in issue_list[:5])
            if len(issue_list) > 5:
                report.append(f"    ... and {len(issue_list) - 5} more")

    # 5. Language distribution (section omitted when empty)
    if stats['languages']:
        report.append("\n\n Language distribution")
        report.append(rule)
        for lang, count in stats['languages'].items():
            report.append(f"  {lang}: {count}")

    # 6. Veracity labels (section omitted when empty)
    if stats['veracity_labels']:
        report.append("\n\n Veracity labels")
        report.append(rule)
        for label, count in stats['veracity_labels'].items():
            report.append(f"  {label}: {count}")

    # 7. JSON schema samples: top-level keys of the first sample of each kind
    report.append("\n\nSample json schema")
    report.append(rule)

    if sample_data['annotation.json']:
        report.append("\n  annotation.json structure:")
        sample = sample_data['annotation.json'][0]['data']
        report.append(f"    Keys: {list(sample.keys())}")

    if sample_data['source-tweets']:
        report.append("\n  source-tweets/*.json structure:")
        sample = sample_data['source-tweets'][0]['data']
        report.append(f"    Keys: {list(sample.keys())}")
        if 'user' in sample:
            report.append(f"    User keys: {list(sample['user'].keys())}")

    if sample_data['reactions']:
        report.append("\n  reactions/*.json structure:")
        sample = sample_data['reactions'][0]['data']
        report.append(f"    Keys: {list(sample.keys())}")

    # 8. Details for the first three sampled threads
    report.append("\n\nSample thread")
    report.append(rule)
    for thread in sample_data['threads'][:3]:
        report.append(f"\n  Thread: {thread['thread_id']}")
        report.append(f"    Event: {thread['event']}")
        report.append(f"    Category: {thread['category']}")
        report.append("    Files:")
        for file, status in thread['files'].items():
            report.append(f"      - {file}: {status}")

    report.extend(["\n" + banner, "End of report", banner])

    report_text = "\n".join(report)
    print(report_text)

    # Saving is opt-in so existing callers keep the print-only behaviour.
    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report_text)

    return report_text

In [13]:
def explore(sample_size: int = 5):
    """Run the full exploration: map, sample, check quality, report.

    Args:
        sample_size: Maximum number of threads to examine per
            event/category combination.

    Prints progress and the final report. Returns ``None``; if ``root_path``
    does not exist an error is printed and nothing else is done.
    """
    print(" Starting PHEME Dataset Exploration...")
    print(f" Root directory: {root_path}\n")

    # Verify root exists
    if not root_path.exists():
        print(f"ERROR: Path {root_path} does not exist!")
        return

    # The steps below accumulate into module-level containers. Clear them in
    # place so re-running this cell does not double counts or repeat issues.
    for container in (stats, issues, sample_data, threads_info):
        container.clear()

    # Step 1: Map directory structure
    print("\nStep 1: Mapping directory structure...")
    map_structure()

    # Step 2: Sample and examine JSONs
    print(f"\nStep 2: Sampling {sample_size} threads per category...")
    sample_threads(sample_size)

    # Step 3: Identify data quality issues
    print("\nStep 3: Checking data quality...")
    check_quality()

    # Step 4: Generate report
    print("\n Step 4: Generating report...")
    generate_report()

    print("\n Exploration complete!")




In [14]:
explore(sample_size=5)

 Starting PHEME Dataset Exploration...
 Root directory: all-rnr-annotated-threads


Step 1: Mapping directory structure...


   Found 9 events
   Total threads: 6425
   Rumours: 2402
   Non-rumours: 4023

Step 2: Sampling 5 threads per category...

Step 3: Checking data quality...

 Step 4: Generating report...
Dataset Exploration
Generated: 2025-12-11 17:06:29


Overall stats
--------------------------------------------------------------------------------
Total threads: 6425
Rumours: 2402
Non-rumours: 4023


 Event distribution
--------------------------------------------------------------------------------
  charliehebdo-all-rnr-threads: 2079 threads
  sydneysiege-all-rnr-threads: 1221 threads
  ferguson-all-rnr-threads: 1143 threads
  ottawashooting-all-rnr-threads: 890 threads
  germanwings-crash-all-rnr-threads: 469 threads
  putinmissing-all-rnr-threads: 238 threads
  prince-toronto-all-rnr-threads: 233 threads
  gurlitt-all-rnr-threads: 138 threads
  ebola-essien-all-rnr-threads: 14 threads


File stats
--------------------------------------------------------------------------------
annotation.json fi