In [7]:
import sys
from pathlib import Path

# Make the hansard package importable by putting <project_root>/src on sys.path.
# The project root is the nearest directory, starting at the working directory and
# moving upward, that contains both 'src/hansard' and 'notebooks'. When no such
# directory is found, the working directory itself is used as the root.
current = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (current, *current.parents)
        if (candidate / 'src' / 'hansard').exists() and (candidate / 'notebooks').exists()
    ),
    current,
)

# Prepend rather than append so this checkout wins over any installed copy;
# skip the insert when the entry is already present (cell re-runs).
src_path = project_root / 'src'
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

print(f"Project root: {project_root}")
print(f"Added to Python path: {src_path}")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob


Project root: /Users/omarkhursheed/workplace/hansard-nlp-explorer
Added to Python path: /Users/omarkhursheed/workplace/hansard-nlp-explorer/src


In [None]:
# Derived-data directories, resolved relative to the project root detected in the
# setup cell instead of one machine's home directory. They stay plain strings with
# a trailing '/' so they keep the same form as the previous hard-coded values.
derived_debate_data_path = f"{project_root / 'data-hansard' / 'derived_complete' / 'debates_complete'}/"
derived_speech_data_path = f"{project_root / 'data-hansard' / 'derived_complete' / 'speeches_complete'}/"

def _restrict_text_to_metadata(text_data, metadata):
    """Return the entries of text_data whose 'file_path' occurs in metadata['file_path']."""
    kept_paths = set(metadata['file_path'])
    return [entry for entry in text_data if entry['file_path'] in kept_paths]


def load_debate_data(
    year_range: tuple[int, int],
    stratified_sampling: bool = False,
    gender_matched_data_only: bool = False,
    sample_size: float = 1.0,
    chamber: str = 'Both',
    load_text: bool = True,
    use_cache: bool = True
) -> dict:
    """
    Load debate data for a given year range via UnifiedDataLoader.

    Args:
        year_range: tuple of two integers, the start and end year of the range to load
        stratified_sampling: if True, the sample count is handed to the loader, which
            performs the sampling itself (the scheme is defined by UnifiedDataLoader).
            If False, everything is loaded and a uniform random sample
            (random_state=42) is drawn here.
        gender_matched_data_only: if True, read the 'gender_enhanced' source;
            otherwise read 'processed_fixed'
        sample_size: how much data to keep.
            - exactly 1.0 (or 1): keep everything
            - below 1.0: fraction of the available debates, always at least one debate
            - above 1.0: absolute number of debates (truncated to an integer)
        chamber: 'Lords' or 'Commons' or 'Both'; any value other than 'Both' keeps
            only rows whose 'chamber' column equals it
        load_text: boolean, whether to load full text content
        use_cache: boolean, forwarded to the loader

    Returns:
        The dict produced by the loader, guaranteed to contain the keys
        'metadata' (DataFrame), 'text_data' (list) and 'statistics' (dict).

    Raises:
        ValueError: if sample_size is negative.

    Notes:
        - Without stratified sampling, a fractional sample_size is applied to the
          debates that remain AFTER chamber filtering, so e.g. 0.1 with
          chamber='Lords' yields about 10% of the Lords debates.
        - With stratified sampling the loader samples before the chamber filter
          runs here, so a chamber-restricted result can hold fewer debates than
          requested.
    """
    import pandas as pd

    # Imported lazily: the package only becomes importable once the notebook's
    # setup cell has put <project_root>/src on sys.path.
    from hansard.utils.unified_data_loader import UnifiedDataLoader

    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    loader = UnifiedDataLoader()
    source = 'gender_enhanced' if gender_matched_data_only else 'processed_fixed'

    keep_everything = sample_size == 1.0
    is_fraction = sample_size < 1.0

    # Stratified sampling is done by the loader, so it needs an absolute count
    # before the main load. A fraction is converted by counting the metadata.
    loader_sample_count = None
    if stratified_sampling and not keep_everything:
        if is_fraction:
            probe = loader.load_debates(
                source=source,
                year_range=year_range,
                load_text=False,
                use_cache=use_cache
            )
            total_debates = len(probe['metadata'])
            if total_debates == 0:
                print(f"Warning: No debates found for year range {year_range}")
                return {
                    'metadata': pd.DataFrame(),
                    'text_data': [],
                    'statistics': {}
                }
            loader_sample_count = max(1, int(total_debates * sample_size))
        else:
            loader_sample_count = int(sample_size)

    data = loader.load_debates(
        source=source,
        year_range=year_range,
        sample_size=loader_sample_count,
        load_text=load_text,
        use_cache=use_cache
    )

    # Guarantee the three documented keys even if the loader omitted one.
    data.setdefault('metadata', pd.DataFrame())
    data.setdefault('text_data', [])
    data.setdefault('statistics', {})

    if chamber != 'Both':
        if 'chamber' in data['metadata'].columns:
            data['metadata'] = data['metadata'][data['metadata']['chamber'] == chamber]
            if load_text:
                data['text_data'] = _restrict_text_to_metadata(data['text_data'], data['metadata'])
            data['statistics'] = loader._calculate_statistics(data['metadata'])
        else:
            # Say so instead of silently returning unfiltered data.
            print(f"Warning: metadata has no 'chamber' column; chamber filter '{chamber}' not applied")

    # Non-stratified sampling happens after filtering so that both fractions and
    # counts refer to the debates that can actually be returned.
    if not stratified_sampling and not keep_everything:
        available_count = len(data['metadata'])
        if available_count == 0:
            print(f"Warning: No debates found for year range {year_range}")
        else:
            if is_fraction:
                sample_count = max(1, int(available_count * sample_size))
            else:
                sample_count = int(sample_size)

            if sample_count < available_count:
                data['metadata'] = data['metadata'].sample(n=sample_count, random_state=42)
                if load_text:
                    data['text_data'] = _restrict_text_to_metadata(data['text_data'], data['metadata'])
                data['statistics'] = loader._calculate_statistics(data['metadata'])
            elif sample_count > available_count:
                # Requested more than available, just use what we have
                print(f"Warning: Requested {sample_count} debates but only {available_count} available")

    print(f"\nLoaded {len(data['metadata'])} debates from {year_range[0]}-{year_range[1]}")
    print(f"Source: {source}, Chamber: {chamber}")
    if data['statistics']:
        print(f"Statistics: {data['statistics']}")

    return data

In [9]:
# Smoke tests for load_debate_data, one scenario per cell

print("=" * 60, "Testing load_debate_data function", "=" * 60, sep="\n")

# Test 1: absolute sample size (10 debates), metadata only
print("\nTest 1: Basic loading (1990-1992, small sample)")
try:
    data1 = load_debate_data((1990, 1992), sample_size=10, load_text=False, use_cache=True)
    print(f"✓ Success: Loaded {len(data1['metadata'])} debates")
    print(f"  Keys in data: {list(data1.keys())}")
    if len(data1['metadata'].columns) == 0:
        print("  No columns found in metadata")
    else:
        # Only the first ten column names are shown to keep the output short
        print(f"  Metadata columns: {list(data1['metadata'].columns)[:10]}...")
except Exception as exc:
    import traceback
    print(f"✗ Error: {exc}")
    traceback.print_exc()

Testing load_debate_data function

Test 1: Basic loading (1990-1992, small sample)
Loading debates: source=processed_fixed, years=(1990, 1992), sample=None, text=False

Loaded 10 debates from 1990-1992
Source: processed_fixed, Chamber: Both
Statistics: {'total_debates': 10, 'year_range': (1990, 1992), 'total_words': 24886, 'avg_words_per_debate': 2488.6, 'chamber_distribution': {'Commons': 7, 'Lords': 3}, 'debates_with_speakers': 6, 'avg_speakers_per_debate': 4.8}
✓ Success: Loaded 10 debates
  Keys in data: ['metadata', 'statistics', 'text_data']
  Metadata columns: ['file_path', 'file_name', 'file_size', 'file_modified', 'content_hash', 'extraction_timestamp', 'title', 'success', 'error', 'meta_tags']...


In [None]:
# Test 2: a sample_size below 1.0 is passed, which load_debate_data treats as a fraction
print("\nTest 2: Fractional sampling (0.1 = 10%)")
try:
    data2 = load_debate_data((1990, 1992), sample_size=0.1, load_text=False, use_cache=True)
    print(f"✓ Success: Loaded {len(data2['metadata'])} debates")
except Exception as exc:
    import traceback
    print(f"✗ Error: {exc}")
    traceback.print_exc()


In [None]:
# Test 3: Stratified sampling (the sample count is handed to the loader)
print("\nTest 3: Stratified sampling")
try:
    data3 = load_debate_data(
        year_range=(1990, 1992),
        sample_size=20,
        stratified_sampling=True,
        load_text=False,
        use_cache=True
    )
    print(f"✓ Success: Loaded {len(data3['metadata'])} debates")
    if len(data3['metadata']) > 0:
        # Guard the column lookup so a missing 'year' column is reported as such
        # instead of surfacing as a KeyError after the load already succeeded.
        if 'year' in data3['metadata'].columns:
            year_counts = data3['metadata']['year'].value_counts().sort_index()
            print(f"  Year distribution: {year_counts.to_dict()}")
        else:
            print("  No 'year' column in metadata; year distribution unavailable")
except Exception as e:
    print(f"✗ Error: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Test 4: restrict the result to a single chamber
print("\nTest 4: Chamber filtering (Commons only)")
try:
    data4 = load_debate_data(
        (1990, 1992),
        chamber='Commons',
        sample_size=10,
        load_text=False,
        use_cache=True,
    )
    print(f"✓ Success: Loaded {len(data4['metadata'])} debates")
    # The distribution is only reported when there are rows and a 'chamber' column
    if len(data4['metadata']) > 0:
        if 'chamber' in data4['metadata'].columns:
            chamber_counts = data4['metadata']['chamber'].value_counts()
            print(f"  Chamber distribution: {chamber_counts.to_dict()}")
except Exception as exc:
    import traceback
    print(f"✗ Error: {exc}")
    traceback.print_exc()


In [None]:
# Test 5: load with gender_matched_data_only=True
print("\nTest 5: Gender-matched data loading")
try:
    data5 = load_debate_data(
        (1990, 1992),
        gender_matched_data_only=True,
        sample_size=10,
        load_text=False,
        use_cache=True,
    )
    print(f"✓ Success: Loaded {len(data5['metadata'])} debates")
    if len(data5['metadata']) > 0:
        # List the columns whose lower-cased name mentions gender, male or female
        print(
            "  Columns in gender data:",
            [
                c for c in data5['metadata'].columns
                if any(tag in c.lower() for tag in ('gender', 'male', 'female'))
            ],
        )
except Exception as exc:
    import traceback
    print(f"✗ Error: {exc}")
    traceback.print_exc()


In [None]:
# Test 6: load_text=True, so the text payload is inspected as well
print("\nTest 6: Loading with text content (small sample)")
try:
    data6 = load_debate_data((1990, 1992), sample_size=5, load_text=True, use_cache=True)
    print(f"✓ Success: Loaded {len(data6['metadata'])} debates")
    print(f"  Text data entries: {len(data6.get('text_data', []))}")
    # Peek at the first entry only; skipped when 'text_data' is missing or empty
    if data6.get('text_data'):
        first_text = data6['text_data'][0]
        print(f"  First text keys: {list(first_text.keys())}")
        if 'full_text' in first_text:
            text_length = len(first_text['full_text'])
            print(f"  First text length: {text_length} characters")
except Exception as exc:
    import traceback
    print(f"✗ Error: {exc}")
    traceback.print_exc()


In [None]:
# Test 7: edge case - sample_size of exactly 1.0, which load_debate_data treats as "load all"
print("\nTest 7: Loading all data (sample_size = 1.0)")
try:
    data7 = load_debate_data((1990, 1992), sample_size=1.0, load_text=False, use_cache=True)
    print(f"✓ Success: Loaded {len(data7['metadata'])} debates (all available)")
except Exception as exc:
    import traceback
    print(f"✗ Error: {exc}")
    traceback.print_exc()


In [None]:
# Closing banner: lists the scenarios exercised by the cells above.
# The check marks are static text; they do not reflect whether each test passed.
print("\n" + "=" * 60, "Test Summary", "=" * 60, sep="\n")
print("All tests completed. Review output above for any errors.")
print("\nFunction features tested:")
print(
    "  ✓ Basic loading with absolute sample size",
    "  ✓ Fractional sampling (0.0 to 1.0)",
    "  ✓ Stratified sampling (maintains year distribution)",
    "  ✓ Chamber filtering (Commons/Lords/Both)",
    "  ✓ Gender-matched data loading",
    "  ✓ Text content loading",
    "  ✓ Loading all data (sample_size = 1.0)",
    sep="\n",
)
