# Data Loading and Preprocessing for Parliamentary Speeches

This notebook provides a reusable function for processing parliamentary speech data from ParlaMint corpora. 

## Simplified Approach:
- **Simple output**: Returns the complete dataframe plus a filtered copy for topic modeling, both with raw text preserved
- **Why?** Modern NLP models (embeddings, LIWC) work better with natural language
- **Preprocessing**: Only applied where specifically needed (e.g., BERTopic stopwords)

In [7]:
import os
import pandas as pd
import numpy as np

# Note: Removed NLTK preprocessing since we're keeping raw text

In [None]:
def _count_words(text):
    """Count whitespace-separated tokens in one speech; missing text counts as 0."""
    # str(NaN) is "nan", which would otherwise be counted as a one-word speech.
    if pd.isna(text):
        return 0
    return len(str(text).split())


def _pair_speech_files(txt_files, tsv_files):
    """
    Match each '<stem>.txt' file to its '<stem>-meta.tsv' file by name.

    Args:
        txt_files (list[str]): Names of the text files in one folder
        tsv_files (list[str]): Names of the metadata files in the same folder

    Returns:
        tuple: (pairs, unmatched)
            - pairs: (txt_file, tsv_file) tuples, in the order of txt_files
            - unmatched: file names for which no partner file exists
    """
    available_meta = set(tsv_files)
    pairs = []
    unmatched = []
    for txt_file in txt_files:
        meta_file = txt_file[:-len(".txt")] + "-meta.tsv"
        if meta_file in available_meta:
            pairs.append((txt_file, meta_file))
            available_meta.discard(meta_file)
        else:
            unmatched.append(txt_file)
    # Whatever is left are metadata files without a text file.
    unmatched.extend(sorted(available_meta))
    return pairs, unmatched


def _load_year_folder(parent_folder, year_folder):
    """
    Read and merge every text/metadata file pair found in one year folder.

    Args:
        parent_folder (str): Path to the folder containing year subdirectories
        year_folder (str): Name of the year subdirectory to read

    Returns:
        list: One merged dataframe per file pair that could be read
    """
    import csv  # local import: only the quoting constant is needed

    folder_path = os.path.join(parent_folder, year_folder)
    file_names = os.listdir(folder_path)
    txt_files = sorted(f for f in file_names if f.endswith(".txt"))
    tsv_files = sorted(f for f in file_names if f.endswith("-meta.tsv"))

    print(f"  Processing {year_folder}: {len(txt_files)} txt files, {len(tsv_files)} tsv files")

    if len(txt_files) != len(tsv_files):
        print(f"  Warning: Mismatch in TXT and TSV file counts for {year_folder}!")

    # Pair by file name rather than by sorted position: positional pairing
    # silently merges the wrong files as soon as a single file is missing.
    pairs, unmatched = _pair_speech_files(txt_files, tsv_files)
    for file_name in unmatched:
        print(f"    Warning: No matching partner file for {file_name}, skipping it")

    frames = []
    for txt_file, tsv_file in pairs:
        txt_path = os.path.join(folder_path, txt_file)
        tsv_path = os.path.join(folder_path, tsv_file)

        try:
            # QUOTE_NONE keeps quotation marks as literal text. With the default
            # quoting, a speech that opens with an unbalanced '"' swallows the
            # following rows or makes the whole file unreadable.
            df_txt = pd.read_csv(
                txt_path,
                sep="\t",
                header=None,
                names=["ID", "Text"],
                encoding="utf-8",
                quoting=csv.QUOTE_NONE,
            )
            df_meta = pd.read_csv(tsv_path, sep="\t", encoding="utf-8", index_col=False)

            # Keep only speeches present in both files
            merged_df = pd.merge(df_meta, df_txt, on="ID", how="inner")
        except Exception as e:
            # Deliberate best effort: one unreadable file must not abort the run.
            print(f"    Error reading files {txt_file}, {tsv_file}: {e}")
            continue

        if len(df_txt) != len(df_meta):
            print(f"    Warning: Row count mismatch in {txt_file} ({len(df_txt)}) and {tsv_file} ({len(df_meta)})")

        frames.append(merged_df)

    return frames


def process_parliament_data(parent_folder, min_word_count=10):
    """
    Process parliamentary speech data, preserving raw text for modern NLP pipelines.

    Every digit-named subdirectory of parent_folder is treated as a year folder.
    Inside it, each '<stem>.txt' file (tab-separated ID and text, no header) is
    merged with its '<stem>-meta.tsv' file on the 'ID' column. All metadata
    columns are kept. 'Text_ID' is renamed to 'Sitting_ID' and 'ID' to
    'Speech_ID' (the latter only if no 'Speech_ID' column exists yet).

    Args:
        parent_folder (str): Path to the folder containing year subdirectories
        min_word_count (int): Minimum word count for speeches (default: 10)

    Returns:
        tuple: (original_df, processed_df)
            - original_df: Complete unfiltered data, with the added columns
              'Word_Count', 'Is_Too_Short' and 'Is_Filtered'
            - processed_df: Speeches with at least min_word_count words, with
              the added column 'Used_For_Topic_Modeling'; sorted by
              'Sitting_ID' and 'Speech_ID' when both columns exist

    Raises:
        ValueError: If no file pair could be loaded from the folder structure
    """
    print(f"Processing parliamentary data from: {parent_folder}")

    # Year folders are digit-named directories; a digit-named file is ignored
    # instead of crashing os.listdir further down.
    year_folders = sorted(
        f for f in os.listdir(parent_folder)
        if f.isdigit() and os.path.isdir(os.path.join(parent_folder, f))
    )
    print(f"Found {len(year_folders)} year folders: {year_folders}")

    df_list = []
    for year in year_folders:
        df_list.extend(_load_year_folder(parent_folder, year))

    if not df_list:
        raise ValueError("No data was successfully loaded from the specified folder structure")

    # Combine all data into one dataframe
    print("\nCombining all data...")
    original_df = pd.concat(df_list, ignore_index=True)
    print(f"Original dataframe shape: {original_df.shape}")

    # Standardize column names for consistency
    column_renames = {}
    if 'Text_ID' in original_df.columns:
        column_renames['Text_ID'] = 'Sitting_ID'
    if 'ID' in original_df.columns and 'Speech_ID' not in original_df.columns:
        column_renames['ID'] = 'Speech_ID'
    original_df = original_df.rename(columns=column_renames)

    # Calculate word counts for ALL speeches
    original_df['Word_Count'] = original_df['Text'].apply(_count_words)

    # Only filter very short speeches (keep long speeches for topic modeling)
    original_df['Is_Too_Short'] = original_df['Word_Count'] < min_word_count
    original_df['Is_Filtered'] = original_df['Is_Too_Short']

    print("\nFiltering analysis:")
    print(f"  Too short (<{min_word_count} words): {original_df['Is_Too_Short'].sum()}")
    print(f"  Remaining for topic modeling: {(~original_df['Is_Filtered']).sum()}")

    # Filtered copy for topic modeling; speeches are not merged, so the
    # word counts computed above stay valid.
    processed_df = original_df[~original_df['Is_Filtered']].copy()

    # Sort for a reproducible row order.
    # NOTE: string IDs sort lexicographically, which is not necessarily the
    # order in which the speeches were held.
    if 'Sitting_ID' in processed_df.columns and 'Speech_ID' in processed_df.columns:
        processed_df = processed_df.sort_values(['Sitting_ID', 'Speech_ID']).reset_index(drop=True)
        print("Sorted speeches by Sitting_ID and Speech_ID for consistency")

    # Add identifier to track which speeches went into topic modeling
    processed_df['Used_For_Topic_Modeling'] = True

    print("\nProcessing complete!")
    print(f"Original dataframe shape: {original_df.shape}")
    print(f"Processed dataframe shape: {processed_df.shape}")
    print(f"Filtered out {len(original_df) - len(processed_df)} short speeches")
    print("Each speech treated as independent unit for topic modeling")

    return original_df, processed_df

# Process Austrian data with simplified logic.
# Paths are built with os.path.join so the cell also runs on systems where
# the backslash is not a path separator.
parent_folder = os.path.join("data folder", "ParlaMint-AT-en.ana", "ParlaMint-AT-en.txt")
AT_original, AT_processed = process_parliament_data(parent_folder)

# Save both versions; create the output folder first so to_pickle does not
# fail on a missing directory.
output_folder = os.path.join("data folder", "data")
os.makedirs(output_folder, exist_ok=True)
AT_original.to_pickle(os.path.join(output_folder, 'AT_original_complete.pkl'))
AT_processed.to_pickle(os.path.join(output_folder, 'AT_for_topic_modeling.pkl'))

print("Saved original complete dataframe to 'AT_original_complete.pkl'")
print("Saved processed dataframe to 'AT_for_topic_modeling.pkl'")

Processing parliamentary data from: data folder\ParlaMint-AT-en.ana\ParlaMint-AT-en.txt
Found 27 year folders: ['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
  Processing 1996: 54 txt files, 54 tsv files
  Processing 1997: 51 txt files, 51 tsv files
  Processing 1998: 50 txt files, 50 tsv files
  Processing 1999: 32 txt files, 32 tsv files
  Processing 2000: 48 txt files, 48 tsv files
  Processing 2001: 37 txt files, 37 tsv files
  Processing 2002: 29 txt files, 29 tsv files
  Processing 2003: 40 txt files, 40 tsv files
  Processing 2004: 50 txt files, 50 tsv files
  Processing 2005: 42 txt files, 42 tsv files
  Processing 2006: 37 txt files, 37 tsv files
  Processing 2007: 35 txt files, 35 tsv files
  Processing 2008: 41 txt files, 41 tsv files
  Processing 2009: 43 txt files, 43 tsv files
  Processing 2010: 40 txt fi