# Data Loading and Preprocessing for Parliamentary Speeches

This notebook provides a reusable function for processing parliamentary speech data from ParlaMint corpora. 

## Simplified Approach:
- **Single output**: Returns one clean dataframe with the raw speech text preserved
- **Why?** Downstream tools (embedding models, LIWC) work best on unmodified natural-language text
- **Preprocessing**: Applied later, only in the steps that specifically need it (e.g., stopword handling for BERTopic)

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
def _read_speech_texts(txt_path):
    """Read a ParlaMint ``.txt`` file (one ``ID<TAB>Text`` per line) into an ``{ID: text}`` dict."""
    text_map = {}
    with open(txt_path, encoding='utf-8') as f:
        for line in f:
            # Split on the first tab only: the speech text may itself contain tabs.
            parts = line.strip().split('\t', 1)
            # Lines without both an ID and a non-empty text are ignored.
            if len(parts) == 2:
                text_map[parts[0]] = parts[1]
    return text_map


def process_parliament_data_parlamint5(parent_folder):
    """
    Process ParlaMint 5.0 parliamentary speech data using meta.tsv and .txt only (ignore ana-meta.tsv).
    Returns a single dataframe with all speeches and raw text.

    Every sub-directory of ``parent_folder`` is scanned for ``*-meta.tsv`` files
    (``*-ana-meta.tsv`` files are excluded). Each meta file is paired with the
    ``.txt`` file of the same base name, whose lines are ``ID<TAB>Text``. The text
    is attached to the metadata rows through the ``ID`` column, and rows without
    text are dropped.

    Parameters
    ----------
    parent_folder : str
        Directory whose sub-directories (one per year) hold the meta/text files.

    Returns
    -------
    pandas.DataFrame
        All metadata rows that have a speech text, concatenated across files,
        with two added columns: ``Text`` (raw speech text) and ``Word_Count``
        (number of whitespace-separated tokens in ``Text``).

    Raises
    ------
    ValueError
        If no file pair could be loaded at all.

    Notes
    -----
    Problems with individual files are reported on stdout and the file is
    skipped, so that one bad file does not abort the whole run.
    """
    meta_suffix = '-meta.tsv'
    ana_suffix = '-ana-meta.tsv'

    print(f"Processing parliamentary data from: {parent_folder}")
    df_list = []
    year_folders = sorted(
        f for f in os.listdir(parent_folder)
        if os.path.isdir(os.path.join(parent_folder, f))
    )
    print(f"Found {len(year_folders)} year folders: {year_folders}")

    for year_folder in year_folders:
        folder_path = os.path.join(parent_folder, year_folder)
        meta_files = sorted(
            f for f in os.listdir(folder_path)
            if f.endswith(meta_suffix) and not f.endswith(ana_suffix)
        )

        print(f"  Processing {year_folder}: {len(meta_files)} meta.tsv files")

        for meta_file in meta_files:
            # Strip only the trailing suffix; str.replace would also remove
            # any earlier occurrence inside the file name.
            base = meta_file[:-len(meta_suffix)]
            txt_file = base + '.txt'
            meta_path = os.path.join(folder_path, meta_file)
            txt_path = os.path.join(folder_path, txt_file)

            # Without a text file there is nothing to attach. Report it explicitly
            # instead of letting an all-NaN column fail later in the .str accessor.
            if not os.path.exists(txt_path):
                print(f"    Warning: text file {txt_file} not found for {meta_file}; skipping")
                continue

            try:
                df_meta = pd.read_csv(meta_path, sep='\t', encoding='utf-8', index_col=False)

                # Load speech texts from .txt file (ID<TAB>Text)
                text_map = _read_speech_texts(txt_path)
                if not text_map:
                    print(f"    Warning: no speech texts found in {txt_file}; skipping")
                    continue

                # Attach text to meta rows by ID (IDs without text become NaN)
                text = df_meta['ID'].map(text_map)

                # Keep only rows that have a non-empty text
                has_text = text.notnull() & (text.str.strip() != '')
                df_list.append(df_meta.assign(Text=text)[has_text])

            except Exception as e:
                # Best effort: report the problem and carry on with the next file.
                print(f"    Error reading files {meta_file}, {txt_file}: {e}")
                continue

    if not df_list:
        raise ValueError("No data was successfully loaded from the specified folder structure")

    print("\nCombining all data...")
    df_all = pd.concat(df_list, ignore_index=True)
    print(f"Combined dataframe shape: {df_all.shape}")

    # Calculate word counts for ALL speeches
    df_all['Word_Count'] = df_all['Text'].apply(lambda x: len(str(x).split()))

    print("\nProcessing complete!")
    print(f"Final dataframe shape: {df_all.shape}")
    print(f"Total speeches: {len(df_all)}")

    return df_all

In [4]:
# Process Austrian data with ParlaMint 5.0 logic code (meta.tsv + .txt only)
# Paths are assembled with os.path.join so the cell does not depend on
# Windows-style backslash separators.
parent_folder = os.path.join("data folder", "ParlaMint5.0-AT-en.ana", "ParlaMint-AT-en.txt")
AT_all = process_parliament_data_parlamint5(parent_folder)

# Save the full dataframe (no filtering)
output_path = os.path.join("data folder", "data", "AT_en.pkl")
# Make sure the target directory exists so the save cannot fail after the long load.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
AT_all.to_pickle(output_path)
print(f"Saved full dataframe to '{output_path}'")

# Show a quick summary
# NOTE: this changes the pandas display option for the rest of the session.
pd.set_option('display.max_columns', None)
print('Total speeches:', len(AT_all))

Processing parliamentary data from: data folder\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt
Found 27 year folders: ['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
  Processing 1996: 54 meta.tsv files
  Processing 1997: 51 meta.tsv files
  Processing 1997: 51 meta.tsv files
  Processing 1998: 50 meta.tsv files
  Processing 1998: 50 meta.tsv files
  Processing 1999: 32 meta.tsv files
  Processing 1999: 32 meta.tsv files
  Processing 2000: 48 meta.tsv files
  Processing 2000: 48 meta.tsv files
  Processing 2001: 37 meta.tsv files
  Processing 2001: 37 meta.tsv files
  Processing 2002: 29 meta.tsv files
  Processing 2002: 29 meta.tsv files
  Processing 2003: 40 meta.tsv files
  Processing 2003: 40 meta.tsv files
  Processing 2004: 50 meta.tsv files
  Processing 2004: 50 meta.tsv files
  Processing 2005: 42 meta.tsv files
  P

In [5]:
# Peek at the first two rows of the combined dataframe.
AT_all.head(2)

# Count how many rows fall under each value of the Subcorpus column.
AT_all["Subcorpus"].value_counts()

Subcorpus
Reference    207141
COVID         19056
COVID,War      5562
Name: count, dtype: int64

In [7]:
# Inspect the last two rows of the combined dataframe.
AT_all.tail(2)

Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Lang,Speaker_role,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic,Text,Word_Count
231757,ParlaMint-AT-en_2022-10-12-027-XXVII-NRSITZ-00178,ParlaMint-AT_2022-10-12-027-XXVII-NRSITZ-00178...,"Sitting Number 178, Legislative period XXVII, ...",2022-10-12,Lower house,Legislative period XXVII,-,-,Sitting 178,-,"COVID,War",English,Chairperson,MP,notMinister,ÖVP,parliamentary group of the Austrian People's P...,Coalition,Centre-right to right,PAD_88386,"Sobotka, Wolfgang",M,1956,Other,We come to the moved votes. Can we vote? SPÖ? ...,147
231758,ParlaMint-AT-en_2022-10-12-027-XXVII-NRSITZ-00178,ParlaMint-AT_2022-10-12-027-XXVII-NRSITZ-00178...,"Sitting Number 178, Legislative period XXVII, ...",2022-10-12,Lower house,Legislative period XXVII,-,-,Sitting 178,-,"COVID,War",English,Chairperson,MP,notMinister,ÖVP,parliamentary group of the Austrian People's P...,Coalition,Centre-right to right,PAD_88386,"Sobotka, Wolfgang",M,1956,Other,"I am entitled to announce that, at today's mee...",48


In [10]:
# Identifier granularity check: number of distinct ID values on the first line,
# number of distinct Text_ID values on the second.
print(AT_all['ID'].nunique(), AT_all['Text_ID'].nunique(), sep='\n')

231759
1221
