In [31]:
from typing import Optional, Tuple

import pandas as pd

In [33]:
# Seed NumPy's global random number generator so repeated runs draw the same
# random numbers. Only NumPy is seeded here; other generators (for example the
# standard-library `random` module) are left untouched.
import numpy as np

SEED = 42
np.random.seed(SEED)

In [34]:

def sample_derived_debates(
    year_range: Tuple[int, int],
    sample_size_per_year: Optional[int] = None,
    data_dir: str = "../data-hansard/derived_complete/debates_complete",
) -> pd.DataFrame:
    """
    Load the derived debates for a range of years, optionally sampling each year.

    One parquet file named ``debates_<year>.parquet`` is read from ``data_dir``
    for every year in the range, and the per-year frames are concatenated.

    Args:
        year_range: ``(start_year, end_year)``; both ends are inclusive.
        sample_size_per_year: Number of debates to draw from EACH year (not in
            total). If None, every debate of every year is returned. If a year
            holds fewer debates than requested, all of that year's debates are
            returned instead of raising.
        data_dir: Directory containing the per-year parquet files. Defaults to
            the location used by this notebook.

    Returns:
        DataFrame with the (sampled) debates of all years. The original
        per-file index values are kept, so index labels may repeat across years.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year``.

    Note:
        Sampling is done without an explicit ``random_state``, so results are
        only repeatable if NumPy's global RNG was seeded before the call.
    """
    start_year, end_year = year_range[0], year_range[1]
    if start_year > end_year:
        # Fail early with a clear message instead of pd.concat's
        # "No objects to concatenate" on an empty list.
        raise ValueError(
            f"year_range start ({start_year}) must not exceed end ({end_year})"
        )

    yearly_frames = []
    for year in range(start_year, end_year + 1):
        debates = pd.read_parquet(f"{data_dir}/debates_{year}.parquet")
        if sample_size_per_year is not None:
            # Cap at the number of available rows: DataFrame.sample raises when
            # asked for more rows than exist (sampling is without replacement).
            debates = debates.sample(min(sample_size_per_year, len(debates)))
        yearly_frames.append(debates)

    return pd.concat(yearly_frames)

# Exploration sample: 10 debates from each year, 1900 through 1910 inclusive.
df = sample_derived_debates(year_range=(1900, 1910), sample_size_per_year=10)




In [35]:
# Preview the first rows of the sampled debates to sanity-check the load.
df.head()

Unnamed: 0,debate_id,file_path,content_hash,year,decade,month,date,chamber,title,topic,...,speech_count,total_speakers,speakers,speaker_genders,confirmed_mps,female_mps,male_mps,has_female,has_male,gender_ratio
9129,ee878a4475e8c6fe,data-hansard/hansard/1900/mar/8_119_greenwich-...,ee878a4475e8c6fe4f2fd1b87a3000cde2a32c583e1bc8...,1900,1900,mar,08 March 1900,Lords,GREENWICH HOSPITAL AND TRAVERS' FOUNDATION(CAP...,GREENWICH HOSPITAL AND TRAVERS' FOUNDATION(CAP...,...,0,0,[],"{'""MR. M'GHEE': None, '""MR. MACEGAN': None, '""...",0,0,0,False,False,
8805,37a4c92bebf8c8bd,data-hansard/hansard/1900/mar/2_30_navy-period...,37a4c92bebf8c8bd285c1eb6f1fba72d8afa48ea55d2d7...,1900,1900,mar,02 March 1900,Commons,"NAVY—PERIODS OF SERVICE. (Hansard, 2 March 1900)",NAVY—PERIODS OF SERVICE.,...,0,3,"[SIR JOHN COLOMB, MR. GOSCHEN, SIR JOHN COLOMB...","{'""MR. M'GHEE': None, '""MR. MACEGAN': None, '""...",0,0,0,False,False,
10639,440e119f0ac6e563,data-hansard/hansard/1900/jun/19_37_local-gove...,440e119f0ac6e563ad3f0cd8bfe1f3dda28411119765a5...,1900,1900,jun,19 June 1900,Commons,LOCAL GOVERNMENT (SCOTLAND) ACT (1894) AMENDME...,LOCAL GOVERNMENT (SCOTLAND) ACT (1894) AMENDME...,...,0,0,[],"{'""MR. M'GHEE': None, '""MR. MACEGAN': None, '""...",0,0,0,False,False,
10734,e8566b92f9b13654,data-hansard/hansard/1900/jun/26_64_dual-naval...,e8566b92f9b13654fb308276180256b88b1dbf2f1a33d8...,1900,1900,jun,26 June 1900,Commons,DUAL (NAVAL AND MILITARY) ORGANISATION OF NAVA...,DUAL (NAVAL AND MILITARY) ORGANISATION OF NAVA...,...,2,2,"[MR. A. J. BALFOUR, *SIR J. COLOMB]","{'""MR. M'GHEE': None, '""MR. MACEGAN': None, '""...",0,0,0,False,False,
9731,b80b41ccd1397dcb,data-hansard/hansard/1900/mar/19_11_petitions....,b80b41ccd1397dcb21808fafe8d3296a828edeef929f98...,1900,1900,mar,19 March 1900,Commons,"PETITIONS. (Hansard, 19 March 1900)",PETITIONS.,...,0,0,[],"{'""MR. M'GHEE': None, '""MR. MACEGAN': None, '""...",0,0,0,False,False,


In [26]:
# List the column labels available on the sampled frame.
df.columns

Index(['debate_id', 'file_path', 'content_hash', 'year', 'decade', 'month',
       'date', 'chamber', 'title', 'topic', 'hansard_reference',
       'reference_volume', 'reference_columns', 'full_text', 'word_count',
       'speech_count', 'total_speakers', 'speakers', 'speaker_genders',
       'confirmed_mps', 'female_mps', 'male_mps', 'has_female', 'has_male',
       'gender_ratio'],
      dtype='object')

In [27]:
# Show the full text of the first sampled debate. Only the last expression of
# a cell is displayed, so a single expression is all this cell needs.
df['full_text'].iloc[0]


'HC Deb 10 May 1900 vol 82 c1229 1229 § Reported, without Amendment [Provisional Orders confirmed]. Report to lie upon the Table. § Bill to be read the third time Tomorrow.'

In [28]:
# Show the speakers recorded for the first sampled debate. Only the last
# expression of a cell is displayed, so a single expression is all this cell needs.
df['speakers'].iloc[0]


array([], dtype=object)

In [19]:
# Inspect the speaker_genders value stored on the first sampled debate.
# NOTE(review): in the df.head() preview every row appears to start with the
# same mapping, including debates whose `speakers` list is empty — this looks
# like a per-year mapping rather than a per-debate one; confirm upstream.
df['speaker_genders'].iloc[0]

{'"MR. M\'GHEE': None,
 '"MR. MACEGAN': None,
 '"Mr. LABOUCHERE': None,
 '"THE CHAIRMAN': None,
 '"The CHAIRMAN': None,
 "'MR. WYNDHAM": None,
 '* MR. RUTHERFORD': None,
 "*'THE: SECRETARY OF STATE FOR THE HOME DEPARTMENT (Sir M. WHITE RIDLEY,)": None,
 '*ADMIRAL FIELD': None,
 '*ADMIRAL FIELD ()': None,
 '*CAPTAIN JESSEL': None,
 '*CAPTAIN NORTON': None,
 '*CAPTAIN PBETYMAN': None,
 '*CAPTAIN PHILLPOTTS': None,
 '*CAPTAIN PHILLPOTTS (Devonshire,)': None,
 '*CAPTAIN PRETYMAN': None,
 '*COLNEL MILWARD': None,
 '*COLONEL BLUNDELL': None,
 '*COLONEL COTTON-JODRELL': None,
 '*COLONEL E. T. D. COTTON-JODRELL': None,
 '*COLONEL HUGHES': None,
 '*COLONEL HUGHES ()': None,
 '*COLONEL LOCKWOOD': None,
 '*COLONEL LONG ()': None,
 '*COLONEL MILWARD': None,
 '*COLONEL PILKINGTON': None,
 '*COLONEL SANDYS': None,
 '*COLONEL WELBY': None,
 "*COMMANDER BETHELL (Yorkshire' E.R.,)": None,
 '*DR. FARQUHARSON': None,
 '*EARL PERCY': None,
 '*EARL SPENCER': None,
 '*EARL STANHOPE': None,
 '*GENERAL LAURIE

In [13]:
# Dump the first row field by field: column name, then its value, then a
# 100-character rule, so long values (e.g. full_text) are shown untruncated.
for col in df.columns:
    print(col, df[col].iloc[0], '-' * 100, sep='\n')


debate_id
70c6c94e23f2b878
----------------------------------------------------------------------------------------------------
file_path
data-hansard/hansard/1900/apr/26_42_merchant-service-certificates-of-masters.html.gz
----------------------------------------------------------------------------------------------------
content_hash
70c6c94e23f2b878b37b8ae2fcfd94994f77764a738bbbfde6b4b414db966840
----------------------------------------------------------------------------------------------------
year
1900
----------------------------------------------------------------------------------------------------
decade
1900
----------------------------------------------------------------------------------------------------
month
apr
----------------------------------------------------------------------------------------------------
date
26 April 1900
----------------------------------------------------------------------------------------------------
chamber
Commons
--------------------------