# Data Loading and Preprocessing for Parliamentary Speeches

This notebook provides a reusable function for processing parliamentary speech data from ParlaMint corpora. 

## Simplified Approach:
- **Single output**: Returns one clean dataframe with raw text preserved
- **Why?** Modern NLP tools (embedding models, LIWC) work better with natural, unprocessed text
- **Preprocessing**: Only applied where specifically needed (e.g., BERTopic stopwords)

In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)

def _load_speech_texts(txt_path):
    """Read an ``ID<TAB>Text`` file into a dict mapping speech ID to text.

    Each line is stripped and split on the first tab only, so tabs inside
    the speech text are kept. Lines that do not yield both an ID and a text
    part are skipped.
    """
    text_map = {}
    with open(txt_path, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                text_map[parts[0]] = parts[1]
    return text_map


def process_parliament_data_parlamint5(parent_folder):
    """
    Process ParlaMint 5.0 parliamentary speech data using meta.tsv and .txt only (ignore ana-meta.tsv).

    Every sub-directory of ``parent_folder`` is treated as a year folder. For each
    ``<base>-meta.tsv`` in a year folder (``-ana-meta.tsv`` files are excluded), the
    metadata is joined with the texts of ``<base>.txt`` through the ``ID`` column.

    Parameters
    ----------
    parent_folder : str
        Directory that contains one sub-directory per year.

    Returns
    -------
    pandas.DataFrame
        All metadata rows that have a non-empty text, with the raw text in a
        ``Text`` column and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``parent_folder`` has no sub-directories, or if no file pair
        produced any row.

    Notes
    -----
    Files that cannot be read are reported on stdout and skipped (best effort),
    so a single broken file does not abort the whole load.
    """
    print(f"Processing parliamentary data from: {parent_folder}")
    year_folders = sorted(
        f for f in os.listdir(parent_folder)
        if os.path.isdir(os.path.join(parent_folder, f))
    )
    # Guard before indexing: year_folders[0] would raise an opaque IndexError.
    if not year_folders:
        raise ValueError(f"No year folders found in: {parent_folder}")
    print(f"Found {len(year_folders)} year folders, from {year_folders[0]} to {year_folders[-1]}")

    meta_suffix = '-meta.tsv'
    df_list = []
    for year_folder in year_folders:
        folder_path = os.path.join(parent_folder, year_folder)
        meta_files = sorted(
            f for f in os.listdir(folder_path)
            if f.endswith(meta_suffix) and not f.endswith('-ana-meta.tsv')
        )

        for meta_file in meta_files:
            # Strip only the trailing suffix; str.replace would also remove
            # an occurrence in the middle of the file name.
            base = meta_file[:-len(meta_suffix)]
            txt_file = base + '.txt'
            meta_path = os.path.join(folder_path, meta_file)
            txt_path = os.path.join(folder_path, txt_file)

            # Without a text file no row can get a text, so say so explicitly
            # instead of failing later with an unrelated pandas error.
            if not os.path.exists(txt_path):
                print(f"    Skipping {meta_file}: text file {txt_file} not found")
                continue

            try:
                df_meta = pd.read_csv(meta_path, sep='\t', encoding='utf-8', index_col=False)

                # Attach text to meta rows by ID
                text_map = _load_speech_texts(txt_path)
                df_meta['Text'] = df_meta['ID'].map(text_map)

                # Keep rows with a real text. astype(str) keeps the .str accessor
                # usable even when the mapped column is all-NaN (float dtype).
                has_text = df_meta['Text'].notna() & (df_meta['Text'].astype(str).str.strip() != '')
                df_meta = df_meta[has_text].copy()

            except Exception as e:
                print(f"    Error reading files {meta_file}, {txt_file}: {e}")
                continue

            if df_meta.empty:
                print(f"    No speeches with text found for {meta_file}")
                continue

            df_list.append(df_meta)

    if not df_list:
        raise ValueError("No data was successfully loaded from the specified folder structure")

    return pd.concat(df_list, ignore_index=True)

Process Austria (AT) - Combine English and German versions

In [6]:
# Build paths with os.path.join so the cell also runs on non-Windows systems
# (a backslash inside a string literal is not a path separator there).
AT_DIR = os.path.join("data folder", "AT")

# Load English version
parent_folder_en = os.path.join(AT_DIR, "ParlaMint5.0-AT-en.ana", "ParlaMint-AT-en.txt")
AT_en = process_parliament_data_parlamint5(parent_folder_en)

# Load German version
parent_folder_de = os.path.join(AT_DIR, "ParlaMint5.0-AT", "ParlaMint-AT.txt")
AT_german = process_parliament_data_parlamint5(parent_folder_de)

# Merge on ID to combine both text versions.
# Inner join: only speeches present in BOTH versions are kept. The English
# text stays in 'Text'; the German text arrives as 'Text_native' and is renamed.
AT_combined = AT_en.merge(AT_german[['ID', 'Text']], on='ID', how='inner', suffixes=('', '_native'))
AT_combined = AT_combined.rename(columns={'Text_native': 'Text_native_language'})

# Save the combined dataframe
AT_combined.to_pickle(os.path.join(AT_DIR, 'AT_combined.pkl'))
print("Saved combined dataframe to 'AT_combined.pkl'")

# Show a quick summary
print("#####################################")
print('Total speeches:', len(AT_combined))
print('English text sample:', AT_combined['Text'].iloc[0][:100])
print('German text sample:', AT_combined['Text_native_language'].iloc[0][:100])
print(AT_combined['Speaker_role'].value_counts())

Processing parliamentary data from: data folder\AT\ParlaMint5.0-AT-en.ana\ParlaMint-AT-en.txt
Found 27 year folders, from 1996 to 2022
Processing parliamentary data from: data folder\AT\ParlaMint5.0-AT\ParlaMint-AT.txt
Found 27 year folders, from 1996 to 2022
Saved combined dataframe to 'AT_combined.pkl'
#####################################
Total speeches: 231759
English text sample: Please take a seat. – I also ask the photographers to stop their activity. I may attend the first an
German text sample: Bitte Platz zu nehmen. – Ich bitte auch die Photographen, ihre Tätigkeit einzustellen. Ich darf die 
Speaker_role
Chairperson    125042
Regular        106277
Guest             440
Name: count, dtype: int64


Process Croatia (HR) - Combine English and Croatian versions

In [7]:
# Build paths with os.path.join so the cell also runs on non-Windows systems
# (a backslash inside a string literal is not a path separator there).
HR_DIR = os.path.join("data folder", "HR")

# Load English version
parent_folder_en = os.path.join(HR_DIR, "ParlaMint5.0-HR-en.ana", "ParlaMint-HR-en.txt")
HR_en = process_parliament_data_parlamint5(parent_folder_en)

# Load Croatian version
parent_folder_hr = os.path.join(HR_DIR, "ParlaMint5.0-HR", "ParlaMint-HR.txt")
HR_hr = process_parliament_data_parlamint5(parent_folder_hr)

# Merge on ID to combine both text versions.
# Inner join: only speeches present in BOTH versions are kept. The English
# text stays in 'Text'; the Croatian text arrives as 'Text_native' and is renamed.
HR_combined = HR_en.merge(HR_hr[['ID', 'Text']], on='ID', how='inner', suffixes=('', '_native'))
HR_combined = HR_combined.rename(columns={'Text_native': 'Text_native_language'})

# Save the combined dataframe
HR_combined.to_pickle(os.path.join(HR_DIR, 'HR_combined.pkl'))
print("Saved combined dataframe to 'HR_combined.pkl'")

# Show a quick summary
print("#####################################")
print('Total speeches:', len(HR_combined))
print('English text sample:', HR_combined['Text'].iloc[0][:100])
print('Croatian text sample:', HR_combined['Text_native_language'].iloc[0][:100])
print(HR_combined['Speaker_role'].value_counts())

Processing parliamentary data from: data folder\HR\ParlaMint5.0-HR-en.ana\ParlaMint-HR-en.txt
Found 20 year folders, from 2003 to 2022
Processing parliamentary data from: data folder\HR\ParlaMint5.0-HR\ParlaMint-HR.txt
Found 20 year folders, from 2003 to 2022
Saved combined dataframe to 'HR_combined.pkl'
#####################################
Total speeches: 504338
English text sample: Honored ladies and gentlemen. It has been my honor to open the first Constitutional session of the 5
Croatian text sample: Cijenjene gospođe i gospodo. Pripala mi je čast da sukladno Poslovniku Hrvatskog sabora otvorim prvu
Speaker_role
Regular        257753
Chairperson    246585
Name: count, dtype: int64


Process United Kingdom (GB) - English only

In [8]:
# Build paths with os.path.join so the cell also runs on non-Windows systems
# (a backslash inside a string literal is not a path separator there).
GB_DIR = os.path.join("data folder", "GB")

# The GB corpus is loaded as-is; no second-language version is merged in.
parent_folder = os.path.join(GB_DIR, "ParlaMint5.0-GB", "ParlaMint-GB.txt")
GB = process_parliament_data_parlamint5(parent_folder)

# Save the dataframe
GB.to_pickle(os.path.join(GB_DIR, 'GB.pkl'))
print("Saved dataframe to 'GB.pkl'")

# Show a quick summary
print("#####################################")
print('Total speeches:', len(GB))
print(GB['Speaker_role'].value_counts())

Processing parliamentary data from: data folder\GB\ParlaMint5.0-GB\ParlaMint-GB.txt
Found 8 year folders, from 2015 to 2022
Saved dataframe to 'GB.pkl'
#####################################
Total speeches: 670912
Speaker_role
Regular        654567
Chairperson     16345
Name: count, dtype: int64
