# Data Loading and Preprocessing for Parliamentary Speeches

This notebook provides a reusable function for processing parliamentary speech data from ParlaMint corpora. 

## Simplified Approach:
- **Single output**: Returns one clean dataframe with raw text preserved
- **Why?** Modern NLP models (embeddings, LIWC) work better with natural language
- **Preprocessing**: Only applied where specifically needed (e.g., BERTopic stopwords)

In [7]:
import os
import pandas as pd
import numpy as np

In [8]:
def process_parliament_data_parlamint5(parent_folder):
    """
    Process ParlaMint 5.0 parliamentary speech data using meta.tsv and .txt only (ignore ana-meta.tsv).
    Returns a single dataframe with all speeches and raw text.
    """
    print(f"Processing parliamentary data from: {parent_folder}")
    df_list = []
    year_folders = sorted([f for f in os.listdir(parent_folder) if os.path.isdir(os.path.join(parent_folder, f))])
    print(f"Found {len(year_folders)} year folders: {year_folders}")

    for year_folder in year_folders:
        folder_path = os.path.join(parent_folder, year_folder)
        meta_files = sorted([f for f in os.listdir(folder_path) if f.endswith('-meta.tsv') and not f.endswith('-ana-meta.tsv')])

        print(f"  Processing {year_folder}: {len(meta_files)} meta.tsv files")

        for meta_file in meta_files:
            base = meta_file.replace('-meta.tsv', '')
            txt_file = base + '.txt'
            meta_path = os.path.join(folder_path, meta_file)
            txt_path = os.path.join(folder_path, txt_file)

            try:
                df_meta = pd.read_csv(meta_path, sep='\t', encoding='utf-8', index_col=False)

                # Load speech texts from .txt file (ID<TAB>Text)
                text_map = {}
                if os.path.exists(txt_path):
                    with open(txt_path, encoding='utf-8') as f:
                        for line in f:
                            parts = line.strip().split('\t', 1)
                            if len(parts) == 2:
                                text_map[parts[0]] = parts[1]

                # Attach text to meta rows by ID
                df_meta['Text'] = df_meta['ID'].map(text_map)

                # Remove rows with missing text (None or NaN or empty string)
                df_meta = df_meta[df_meta['Text'].notnull() & (df_meta['Text'].str.strip() != '')]

                df_list.append(df_meta)

            except Exception as e:
                print(f"    Error reading files {meta_file}, {txt_file}: {e}")
                continue

    if not df_list:
        raise ValueError("No data was successfully loaded from the specified folder structure")

    print("\nCombining all data...")
    df_all = pd.concat(df_list, ignore_index=True)
    print(f"Combined dataframe shape: {df_all.shape}")

    # Calculate word counts for ALL speeches
    df_all['Word_Count'] = df_all['Text'].apply(lambda x: len(str(x).split()))

    print(f"\nProcessing complete!")
    print(f"Final dataframe shape: {df_all.shape}")
    print(f"Total speeches: {len(df_all)}")

    return df_all

Process AT english translated

In [9]:
# Process Austrian data with ParlaMint 5.0 logic code (meta.tsv + .txt only)
parent_folder = r"data folder\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt"
AT_all = process_parliament_data_parlamint5(parent_folder)

# Save the full dataframe (no filtering)
AT_all.to_pickle(r'data folder\data\AT_en.pkl')
print(f"Saved full dataframe to 'AT_en.pkl'")

# Show a quick summary
pd.set_option('display.max_columns', None)
print('Total speeches:', len(AT_all))

Processing parliamentary data from: data folder\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt
Found 27 year folders: ['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
  Processing 1996: 54 meta.tsv files
  Processing 1997: 51 meta.tsv files
  Processing 1998: 50 meta.tsv files
  Processing 1999: 32 meta.tsv files
  Processing 2000: 48 meta.tsv files
  Processing 2001: 37 meta.tsv files
  Processing 2002: 29 meta.tsv files
  Processing 2003: 40 meta.tsv files
  Processing 2004: 50 meta.tsv files
  Processing 2005: 42 meta.tsv files
  Processing 2006: 37 meta.tsv files
  Processing 2007: 35 meta.tsv files
  Processing 2008: 41 meta.tsv files
  Processing 2009: 43 meta.tsv files
  Processing 2010: 40 meta.tsv files
  Processing 2011: 47 meta.tsv files
  Processing 2012: 47 meta.tsv files
  Processing 2013: 41 meta.tsv files
  P

Process AT in german

In [10]:
parent_folder = r"data folder\ParlaMint5.0-ATmultilingual\ParlaMint-AT.txt"
AT_german = process_parliament_data_parlamint5(parent_folder)


# Save the full dataframe (no filtering)
AT_german.to_pickle(r'data folder\data\AT_german.pkl')
print(f"Saved full dataframe to 'AT_german.pkl'")

# Show a quick summary
pd.set_option('display.max_columns', None)
print('Total speeches:', len(AT_german))

Processing parliamentary data from: data folder\ParlaMint5.0-ATmultilingual\ParlaMint-AT.txt
Found 27 year folders: ['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
  Processing 1996: 54 meta.tsv files
  Processing 1997: 51 meta.tsv files
  Processing 1998: 50 meta.tsv files
  Processing 1999: 32 meta.tsv files
  Processing 2000: 48 meta.tsv files
  Processing 2001: 37 meta.tsv files
  Processing 2002: 29 meta.tsv files
  Processing 2003: 40 meta.tsv files
  Processing 2004: 50 meta.tsv files
  Processing 2005: 42 meta.tsv files
  Processing 2006: 37 meta.tsv files
  Processing 2007: 35 meta.tsv files
  Processing 2008: 41 meta.tsv files
  Processing 2009: 43 meta.tsv files
  Processing 2010: 40 meta.tsv files
  Processing 2011: 47 meta.tsv files
  Processing 2012: 47 meta.tsv files
  Processing 2013: 41 meta.tsv files
 

In [11]:
AT_german.head(3)
print(AT_german['Speaker_role'].value_counts())

print(AT_all['Speaker_role'].value_counts())

Speaker_role
PräsidentIn              125042
Reguläre/r SprecherIn    106277
Gast                        440
Name: count, dtype: int64
Speaker_role
Chairperson    125042
Regular        106277
Guest             440
Name: count, dtype: int64


Process GB

In [None]:
parent_folder = r"data folder\ParlaMint5.0-GB\ParlaMint-GB.txt"
GB = process_parliament_data_parlamint5(parent_folder)


# Save the full dataframe (no filtering)
GB.to_pickle(r'data folder\data\GB.pkl')
print(f"Saved full dataframe to 'GB.pkl'")

# Show a quick summary
pd.set_option('display.max_columns', None)
print('Total speeches:', len(GB))

Processing parliamentary data from: data folder\ParlaMint5.0-GB\ParlaMint-GB.txt
Found 8 year folders: ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
  Processing 2015: 277 meta.tsv files
  Processing 2016: 301 meta.tsv files
  Processing 2017: 259 meta.tsv files
  Processing 2018: 309 meta.tsv files
  Processing 2019: 281 meta.tsv files
  Processing 2020: 313 meta.tsv files
  Processing 2021: 267 meta.tsv files
  Processing 2022: 202 meta.tsv files

Combining all data...
Combined dataframe shape: (670912, 25)

Processing complete!
Final dataframe shape: (670912, 26)
Total speeches: 670912
Saved full dataframe to 'GB.pkl'
Total speeches: 670912


Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Lang,Speaker_role,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic,Text,Word_Count
0,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u1,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,LD,Liberal Democrat,Coalition,Centre to centre-left,JenniferWillott,"Willott, Jennifer Nancy",F,-,Immigration,1. What progress her Department has made on im...,13
1,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u2,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,CON,Conservative,Coalition,Centre-right,TheresaMay,"May, Theresa Mary",F,-,Transportation,The Government are on track to deliver their c...,25
2,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u3,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,LD,Liberal Democrat,Coalition,Centre to centre-left,JenniferWillott,"Willott, Jennifer Nancy",F,-,Immigration,"It is clear that exit checks, which were scrap...",49
3,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u4,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,CON,Conservative,Coalition,Centre-right,TheresaMay,"May, Theresa Mary",F,-,Immigration,"As I indicated in my original answer, we are o...",86
4,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u5,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,LAB,Labour,Opposition,Centre-left,AlexanderCunningham,"Cunningham, Alexander",M,-,Immigration,19. Given the situation at our border in Calai...,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670907,ParlaMint-GB_2022-07-21-lords,ParlaMint-GB_2022-07-21-lords.u209,"Minutes of the House of Lords, Daily Session 2...",2022-07-21,Upper house,58,-,2022-07-21,2022-07-21,-,"COVID,War",English,Regular,notMP,notMinister,CON,Conservative,-,Centre-right,JoannaPenn,"Penn, Joanna",F,-,Domestic Commerce,I will have to check that point for the noble ...,522
670908,ParlaMint-GB_2022-07-21-lords,ParlaMint-GB_2022-07-21-lords.u210,"Minutes of the House of Lords, Daily Session 2...",2022-07-21,Upper house,58,-,2022-07-21,2022-07-21,-,"COVID,War",English,Regular,notMP,notMinister,LAB,Labour,Opposition,Centre-left,PremSikka,"Sikka, Prem",M,-,Domestic Commerce,"My Lords, the Minister has referred a number o...",140
670909,ParlaMint-GB_2022-07-21-lords,ParlaMint-GB_2022-07-21-lords.u211,"Minutes of the House of Lords, Daily Session 2...",2022-07-21,Upper house,58,-,2022-07-21,2022-07-21,-,"COVID,War",English,Regular,notMP,notMinister,CON,Conservative,-,Centre-right,JoannaPenn,"Penn, Joanna",F,-,Domestic Commerce,The noble Lord is right that different aspects...,1070
670910,ParlaMint-GB_2022-07-21-lords,ParlaMint-GB_2022-07-21-lords.u212,"Minutes of the House of Lords, Daily Session 2...",2022-07-21,Upper house,58,-,2022-07-21,2022-07-21,-,"COVID,War",English,Regular,notMP,notMinister,LAB,Labour,Opposition,Centre-left,PremSikka,"Sikka, Prem",M,-,Domestic Commerce,I thank noble Lords for a very interesting deb...,413


In [14]:
print(GB['Speaker_role'].value_counts())

GB.head(3)

Speaker_role
Regular        654567
Chairperson     16345
Name: count, dtype: int64


Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Lang,Speaker_role,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic,Text,Word_Count
0,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u1,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,LD,Liberal Democrat,Coalition,Centre to centre-left,JenniferWillott,"Willott, Jennifer Nancy",F,-,Immigration,1. What progress her Department has made on im...,13
1,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u2,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,CON,Conservative,Coalition,Centre-right,TheresaMay,"May, Theresa Mary",F,-,Transportation,The Government are on track to deliver their c...,25
2,ParlaMint-GB_2015-01-05-commons,ParlaMint-GB_2015-01-05-commons.u3,"Minutes of the House of Commons, Daily Session...",2015-01-05,Lower house,55,-,2015-01-05,2015-01-05,-,Reference,English,Regular,MP,notMinister,LD,Liberal Democrat,Coalition,Centre to centre-left,JenniferWillott,"Willott, Jennifer Nancy",F,-,Immigration,"It is clear that exit checks, which were scrap...",49


Process Croatia in english

In [15]:
parent_folder = r"data folder\ParlaMint5.0-HR\ParlaMint-HR.txt"
CRO_en = process_parliament_data_parlamint5(parent_folder)


# Save the full dataframe (no filtering)
CRO_en.to_pickle(r'data folder\data\CRO_en.pkl')
print(f"Saved full dataframe to 'CRO_en.pkl'")

# Show a quick summary
print('Total speeches:', len(parent_folder))


Processing parliamentary data from: data folder\ParlaMint5.0-HR\ParlaMint-HR.txt
Found 20 year folders: ['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
  Processing 2003: 3 meta.tsv files
  Processing 2004: 68 meta.tsv files
  Processing 2005: 78 meta.tsv files
  Processing 2006: 81 meta.tsv files
  Processing 2007: 58 meta.tsv files
  Processing 2008: 83 meta.tsv files
  Processing 2009: 82 meta.tsv files
  Processing 2010: 88 meta.tsv files
  Processing 2011: 71 meta.tsv files
  Processing 2012: 94 meta.tsv files
  Processing 2013: 106 meta.tsv files
  Processing 2014: 101 meta.tsv files
  Processing 2015: 82 meta.tsv files
  Processing 2016: 87 meta.tsv files
  Processing 2017: 107 meta.tsv files
  Processing 2018: 113 meta.tsv files
  Processing 2019: 102 meta.tsv files
  Processing 2020: 107 meta.tsv files
  Processing 2021: 121 meta.tsv files
  Processing 2022: 76 meta

In [16]:
print(CRO_en['Speaker_role'].value_counts())

CRO_en.head(3)

Speaker_role
Redovni            257753
Predsjedavajući    246585
Name: count, dtype: int64


Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Lang,Speaker_role,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic,Text,Word_Count
0,ParlaMint-HR_2003-12-22-0,ParlaMint-HR_2003-12-22-0.u1,"Zapisnici sjednica Hrvatskog sabora, mandat 5,...",2003-12-22,Jednodoman,5. mandat,-,Zasedanje 1,2003-12-22,-,Referenca,hrvatski,Predsjedavajući,MP,notMinister,HSS,Hrvatska seljačka stranka,Opposition,Centar do desni centar,TomčićZlatko,"Tomčić, Zlatko",M,1945,Ostalo,Cijenjene gospođe i gospodo. Pripala mi je čas...,1203
1,ParlaMint-HR_2003-12-22-0,ParlaMint-HR_2003-12-22-0.u2,"Zapisnici sjednica Hrvatskog sabora, mandat 5,...",2003-12-22,Jednodoman,5. mandat,-,Zasedanje 1,2003-12-22,-,Referenca,hrvatski,Redovni,MP,notMinister,HDZ,Hrvatska demokratska zajednica,Opposition,Desni centar,ŠeksVladimir,"Šeks, Vladimir",M,1943,Aktivnosti vlade,"Gospodine predsjedavajući, uvažene gospođe i g...",1547
2,ParlaMint-HR_2003-12-22-0,ParlaMint-HR_2003-12-22-0.u5,"Zapisnici sjednica Hrvatskog sabora, mandat 5,...",2003-12-22,Jednodoman,5. mandat,-,Zasedanje 1,2003-12-22,-,Referenca,hrvatski,Redovni,MP,notMinister,HDZ,Hrvatska demokratska zajednica,Opposition,Desni centar,SesvečanDamir,"Sesvečan, Damir",M,1967,Aktivnosti vlade,"Kolegice i kolege zastupnici, dozvolite mi da ...",1588


Process Croatia in croatian

In [17]:
parent_folder = r"data folder\ParlaMint5.0-HR-en.ana\ParlaMint-HR-en.txt"
CRO_cro = process_parliament_data_parlamint5(parent_folder)


# Save the full dataframe (no filtering)
CRO_cro.to_pickle(r'data folder\data\CRO_cro.pkl')
print(f"Saved full dataframe to 'CRO_cro.pkl'")

# Show a quick summary
print('Total speeches:', len(CRO_cro))

Processing parliamentary data from: data folder\ParlaMint5.0-HR-en.ana\ParlaMint-HR-en.txt
Found 20 year folders: ['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
  Processing 2003: 3 meta.tsv files
  Processing 2004: 68 meta.tsv files
  Processing 2005: 78 meta.tsv files
  Processing 2006: 81 meta.tsv files
  Processing 2007: 58 meta.tsv files
  Processing 2008: 83 meta.tsv files
  Processing 2009: 82 meta.tsv files
  Processing 2010: 88 meta.tsv files
  Processing 2011: 71 meta.tsv files
  Processing 2012: 94 meta.tsv files
  Processing 2013: 106 meta.tsv files
  Processing 2014: 101 meta.tsv files
  Processing 2015: 82 meta.tsv files
  Processing 2016: 87 meta.tsv files
  Processing 2017: 107 meta.tsv files
  Processing 2018: 113 meta.tsv files
  Processing 2019: 102 meta.tsv files
  Processing 2020: 107 meta.tsv files
  Processing 2021: 121 meta.tsv files
  Processing 202

In [18]:
print(CRO_cro['Speaker_role'].value_counts())

CRO_cro.head(3)

Speaker_role
Regular        257753
Chairperson    246585
Name: count, dtype: int64


Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Lang,Speaker_role,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic,Text,Word_Count
0,ParlaMint-HR-en_2003-12-22-0,ParlaMint-HR_2003-12-22-0.u1,Minutes of the National Assembly of the Republ...,2003-12-22,Unicameralism,5. mandat,-,Zasedanje 1,2003-12-22,-,Reference,English,Chairperson,MP,notMinister,HSS,Hrvatska seljačka stranka,Opposition,Centre to centre-right,TomčićZlatko,"Tomčić, Zlatko",M,1945,Other,Honored ladies and gentlemen. It has been my h...,1610
1,ParlaMint-HR-en_2003-12-22-0,ParlaMint-HR_2003-12-22-0.u2,Minutes of the National Assembly of the Republ...,2003-12-22,Unicameralism,5. mandat,-,Zasedanje 1,2003-12-22,-,Reference,English,Regular,MP,notMinister,HDZ,Hrvatska demokratska zajednica,Opposition,Centre-right,ŠeksVladimir,"Šeks, Vladimir",M,1943,Government Operations,"Mr. Chairman, distinguished ladies and gentlem...",1958
2,ParlaMint-HR-en_2003-12-22-0,ParlaMint-HR_2003-12-22-0.u5,Minutes of the National Assembly of the Republ...,2003-12-22,Unicameralism,5. mandat,-,Zasedanje 1,2003-12-22,-,Reference,English,Regular,MP,notMinister,HDZ,Hrvatska demokratska zajednica,Opposition,Centre-right,SesvečanDamir,"Sesvečan, Damir",M,1967,Government Operations,"My colleagues and colleagues, allow me to subm...",1878
