In [5]:
import sys
sys.path.insert(0, '..')

# Clear any cached imports
if 'src.data.load_data' in sys.modules:
    del sys.modules['src.data.load_data']

from src.data.load_data import load_sample

# Load the sample data
df = load_sample()

print(f"Sample data shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nColumn names: {df.columns}")
print(f"\nData types:")
print(df.schema)

Sample data shape: (35381, 12)

First few rows:
shape: (5, 12)
┌─────────┬─────────┬────────────┬───────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ id      ┆ session ┆ electoralT ┆ firstName ┆ … ┆ documentUr ┆ positionSh ┆ positionL ┆ date      │
│ ---     ┆ ---     ┆ erm        ┆ ---       ┆   ┆ l          ┆ ort        ┆ ong       ┆ ---       │
│ i64     ┆ i64     ┆ ---        ┆ str       ┆   ┆ ---        ┆ ---        ┆ ---       ┆ str       │
│         ┆         ┆ i64        ┆           ┆   ┆ str        ┆ str        ┆ str       ┆           │
╞═════════╪═════════╪════════════╪═══════════╪═══╪════════════╪════════════╪═══════════╪═══════════╡
│ 1000550 ┆ 4       ┆ 19         ┆ Florian   ┆ … ┆ https://di ┆ Member of  ┆ NA        ┆ 2017-12-1 │
│         ┆         ┆            ┆           ┆   ┆ p21.bundes ┆ Parliament ┆           ┆ 2         │
│         ┆         ┆            ┆           ┆   ┆ tag.de/dip ┆            ┆           ┆           │
│         ┆         ┆       

In [6]:
# Data cleaning and type conversion
import polars as pl

# 1. Convert date to datetime (only if it's a string)
if df.schema['date'] == pl.String:
    df = df.with_columns(
        pl.col('date').str.to_date().cast(pl.Datetime).alias('date')
    )
else:
    print(f"Date column is already {df.schema['date']}, skipping conversion")

# 2. Drop rows with missing speechContent
df = df.drop_nulls(subset=['speechContent'])

# 3. Fill missing values in firstName and lastName with empty strings
df = df.with_columns([
    pl.col('firstName').fill_null(''),
    pl.col('lastName').fill_null('')
])

# 4. Replace invalid values (-1) with null before casting to categorical
df = df.with_columns([
    pl.col('electoralTerm').replace(-1, None),
    pl.col('factionId').replace(-1, None),
    pl.col('positionShort').replace('', None)
])

# 5. Cast categorical variables to categorical type
# Convert to string first, then to categorical (handles numeric values)
df = df.with_columns([
    pl.col('electoralTerm').cast(pl.String).cast(pl.Categorical),
    pl.col('factionId').cast(pl.String).cast(pl.Categorical),
    pl.col('positionShort').cast(pl.String).cast(pl.Categorical)
])

print("Data cleaning completed!")
print(f"\nCleaned data shape: {df.shape}")
print(f"\nData types after cleaning:")
print(df.schema)
print(f"\nNull values:")
print(df.null_count())

Data cleaning completed!

Cleaned data shape: (35377, 12)

Data types after cleaning:
Schema([('id', Int64), ('session', Int64), ('electoralTerm', Categorical), ('firstName', String), ('lastName', String), ('politicianId', Int64), ('speechContent', String), ('factionId', Categorical), ('documentUrl', String), ('positionShort', Categorical), ('positionLong', String), ('date', Datetime(time_unit='us', time_zone=None))])

Null values:
shape: (1, 12)
┌─────┬─────────┬───────────────┬───────────┬───┬─────────────┬──────────────┬──────────────┬──────┐
│ id  ┆ session ┆ electoralTerm ┆ firstName ┆ … ┆ documentUrl ┆ positionShor ┆ positionLong ┆ date │
│ --- ┆ ---     ┆ ---           ┆ ---       ┆   ┆ ---         ┆ t            ┆ ---          ┆ ---  │
│ u32 ┆ u32     ┆ u32           ┆ u32       ┆   ┆ u32         ┆ ---          ┆ u32          ┆ u32  │
│     ┆         ┆               ┆           ┆   ┆             ┆ u32          ┆              ┆      │
╞═════╪═════════╪═══════════════╪═══════════

In [7]:
# Filter speeches by minimum text length
import polars as pl

min_length = 200

# Add speech length column
df = df.with_columns(
    pl.col('speechContent').str.len_chars().alias('speech_length')
)

print(f"Speech length statistics (before filtering):")
print(f"  Min: {df['speech_length'].min()}")
print(f"  Max: {df['speech_length'].max()}")
print(f"  Mean: {df['speech_length'].mean():.0f}")
print(f"  Median: {df['speech_length'].median():.0f}")

# Filter speeches with sufficient length
df = df.filter(pl.col('speech_length') > min_length)

print(f"\nAfter filtering (length > {min_length} chars):")
print(f"  Rows kept: {df.shape[0]} (removed {100 - df.shape[0]} rows)")
print(f"  New shape: {df.shape}")
print(f"\nNew speech length statistics:")
print(f"  Min: {df['speech_length'].min()}")
print(f"  Max: {df['speech_length'].max()}")
print(f"  Mean: {df['speech_length'].mean():.0f}")

Speech length statistics (before filtering):
  Min: 4
  Max: 41315
  Mean: 3722
  Median: 3679

After filtering (length > 200 chars):
  Rows kept: 30688 (removed -30588 rows)
  New shape: (30688, 13)

New speech length statistics:
  Min: 201
  Max: 41315
  Mean: 4281


In [8]:
# Save cleaned data to processed folder
from pathlib import Path

# Define output path
processed_dir = Path('../data/processed')

output_file = processed_dir / 'df_sample_cleaned.csv'

# Save as CSV
df.write_csv(output_file)

print(f"Cleaned data saved to: {output_file}")
print(f"Final dataset shape: {df.shape}")
print(f"Final dataset size: {output_file.stat().st_size / 1024:.2f} KB")

Cleaned data saved to: ..\data\processed\df_sample_cleaned.csv
Final dataset shape: (30688, 13)
Final dataset size: 134702.24 KB
