In [2]:
import sys

# Make the parent directory importable. The guard keeps re-runs of this cell
# from stacking duplicate '..' entries at the front of sys.path.
if '..' not in sys.path:
    sys.path.insert(0, '..')

# Clear any cached copy of the loader module BEFORE importing it, so the
# import below always executes the current source file. (Purging after the
# import only took effect on the next re-run and left sys.modules without an
# entry for a module that was still in use.)
sys.modules.pop('src.data.load_data', None)

from src.data.load_data import load_cleaned

# Load the cleaned dataset
df = load_cleaned()

print("Cleaned dataset loaded!")
print(f"Shape: {df.shape}")
print(f"\nColumn names: {df.columns}")
print("\nData types:")
print(df.schema)
print("\nFirst few rows:")
print(df.head())

Cleaned dataset loaded!
Shape: (563, 13)

Column names: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length']

Data types:
Schema([('id', Int64), ('session', Int64), ('electoralTerm', Int64), ('firstName', String), ('lastName', String), ('politicianId', Int64), ('speechContent', String), ('factionId', Int64), ('documentUrl', String), ('positionShort', String), ('positionLong', String), ('date', String), ('speech_length', Int64)])

First few rows:
shape: (5, 13)
┌─────────┬─────────┬────────────┬───────────┬───┬────────────┬────────────┬───────────┬───────────┐
│ id      ┆ session ┆ electoralT ┆ firstName ┆ … ┆ positionSh ┆ positionLo ┆ date      ┆ speech_le │
│ ---     ┆ ---     ┆ erm        ┆ ---       ┆   ┆ ort        ┆ ng         ┆ ---       ┆ ngth      │
│ i64     ┆ i64     ┆ ---        ┆ str       ┆   ┆ ---        ┆ ---        ┆ str       ┆ ---       │
│    

In [3]:
# Text Segmentation: Split speeches into paragraphs
import polars as pl
import re


def _clean_paragraphs(pieces):
    """Strip every piece and drop the ones that are empty after stripping."""
    return [p.strip() for p in pieces if p.strip()]


def _print_length_stats(title, paragraphs):
    """Print the paragraph count and min/max/avg character length.

    Args:
        title: Heading line describing the splitting method.
        paragraphs: List of paragraph strings produced by that method.

    An empty list reports 0 for every length statistic instead of raising.
    """
    lengths = [len(p) for p in paragraphs]
    print(f"\n{title}")
    print(f"  Number of paragraphs: {len(paragraphs)}")
    print(f"  Min length: {min(lengths) if lengths else 0}")
    print(f"  Max length: {max(lengths) if lengths else 0}")
    print(f"  Avg length: {sum(lengths) / len(lengths) if lengths else 0:.0f}")


# Test different splitting methods on the first speech
test_speech = df['speechContent'][0]

print("=" * 80)
print("COMPARING PARAGRAPH SPLITTING METHODS")
print("=" * 80)

# Method 1: Split by double newline (\n\n)
paragraphs_double_newline = _clean_paragraphs(test_speech.split('\n\n'))
_print_length_stats("Method 1: Split by \\n\\n", paragraphs_double_newline)

# Method 2: Split by single newline (\n)
paragraphs_single_newline = _clean_paragraphs(test_speech.split('\n'))
_print_length_stats("Method 2: Split by \\n", paragraphs_single_newline)

# Method 3: Split by regex: a newline, optional whitespace, then one or more
# newlines. Unlike Method 1 this also treats whitespace-only lines as breaks.
paragraphs_regex = _clean_paragraphs(re.split(r'\n\s*\n+', test_speech))
_print_length_stats("Method 3: Split by regex (\\n\\s*\\n+)", paragraphs_regex)

# Sample paragraphs
print("\n" + "=" * 80)
print("SAMPLE PARAGRAPHS (Method 1: \\n\\n)")
print("=" * 80)
for i, para in enumerate(paragraphs_double_newline[:3]):
    print(f"\nParagraph {i+1} ({len(para)} chars):")
    # Parenthesised for readability: long paragraphs are truncated to 200 chars
    print((para[:200] + "...") if len(para) > 200 else para)

COMPARING PARAGRAPH SPLITTING METHODS

Method 1: Split by \n\n
  Number of paragraphs: 29
  Min length: 5
  Max length: 520
  Avg length: 121

Method 2: Split by \n
  Number of paragraphs: 65
  Min length: 5
  Max length: 208
  Avg length: 53

Method 3: Split by regex (\n\s*\n+)
  Number of paragraphs: 29
  Min length: 5
  Max length: 520
  Avg length: 121

SAMPLE PARAGRAPHS (Method 1: \n\n)

Paragraph 1 (51 chars):
Herr Präsident! Liebe Kolleginnen und Kollegen! Vor

Paragraph 2 (520 chars):
total leeren Tribünen - mit Ausnahme der Eltern des
Kollegen Kurth, die dankenswerterweise da sind - diskutieren wir jetzt als letzten Tagesordnungspunkt über
eines der großen Themen der Erinnerungsku...

Paragraph 3 (5 chars):
({0})


In [4]:
# Apply Method 1 (split by \n\n) to all speeches
import polars as pl

def split_speech_into_paragraphs(speech_content: str) -> list:
    """Split a speech into paragraphs on blank lines (double newline).

    Args:
        speech_content: Full text of one speech. ``None`` (a null cell) is
            tolerated and treated as a speech without paragraphs.

    Returns:
        List of paragraph strings with surrounding whitespace stripped;
        pieces that are empty after stripping are dropped. Returns ``[]``
        for ``None`` or an empty/whitespace-only speech.
    """
    # A null speechContent would otherwise raise AttributeError on .split
    if speech_content is None:
        return []
    stripped_pieces = (piece.strip() for piece in speech_content.split('\n\n'))
    return [piece for piece in stripped_pieces if piece]

# Create a new dataframe with paragraphs as separate rows
rows_list = []

for row in df.iter_rows(named=True):
    paragraphs = split_speech_into_paragraphs(row['speechContent'])

    # Create a row for each paragraph
    for para_num, paragraph in enumerate(paragraphs, 1):
        rows_list.append({
            **row,  # Copy all metadata from original speech
            'speechContent': paragraph,  # Replace with paragraph
            'paragraph_number': para_num,  # 1-based position within the speech
            'paragraph_length': len(paragraph),  # Length in characters
        })

# Create new dataframe with paragraphs.
# infer_schema_length=None scans every row when inferring column types; the
# default only looks at the first 100 dicts, so a column that is null in all of
# those rows could be mis-typed or make construction fail.
df_paragraphs = pl.DataFrame(rows_list, infer_schema_length=None)

print("Paragraph Segmentation Complete!")
print(f"\nOriginal dataset: {df.shape[0]} speeches")
print(f"New dataset: {df_paragraphs.shape[0]} paragraphs")
print(f"Average paragraphs per speech: {df_paragraphs.shape[0] / df.shape[0]:.1f}")

print(f"\nNew columns: {df_paragraphs.columns}")
print("\nParagraph length statistics:")
print(f"  Min: {df_paragraphs['paragraph_length'].min()}")
print(f"  Max: {df_paragraphs['paragraph_length'].max()}")
print(f"  Mean: {df_paragraphs['paragraph_length'].mean():.0f}")
print(f"  Median: {df_paragraphs['paragraph_length'].median():.0f}")

print("\nFirst few rows:")
print(df_paragraphs.head())

Paragraph Segmentation Complete!

Original dataset: 563 speeches
New dataset: 3142 paragraphs
Average paragraphs per speech: 5.6

New columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length']

Paragraph length statistics:
  Min: 5
  Max: 25375
  Mean: 529
  Median: 197

First few rows:
shape: (5, 15)
┌────────┬─────────┬────────────┬───────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ id     ┆ session ┆ electoralT ┆ firstName ┆ … ┆ date       ┆ speech_len ┆ paragraph_ ┆ paragraph │
│ ---    ┆ ---     ┆ erm        ┆ ---       ┆   ┆ ---        ┆ gth        ┆ number     ┆ _length   │
│ i64    ┆ i64     ┆ ---        ┆ str       ┆   ┆ str        ┆ ---        ┆ ---        ┆ ---       │
│        ┆         ┆ i64        ┆           ┆   ┆            ┆ i64        ┆ i64        ┆ i64       │
╞════════╪═════════╪═══

In [5]:
# Save segmented paragraphs to interim folder
from pathlib import Path

# Define output path
interim_dir = Path('../data/interim')
# parents=True also creates ../data when it is missing; without it mkdir
# raises FileNotFoundError on a fresh checkout that has no data directory.
interim_dir.mkdir(parents=True, exist_ok=True)

output_file = interim_dir / 'df_sample_split.csv'

# Save as CSV
df_paragraphs.write_csv(output_file)

print("Segmented paragraphs saved!")
print(f"Output file: {output_file}")
print(f"File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"Total rows (paragraphs): {df_paragraphs.shape[0]}")
print(f"Total columns: {df_paragraphs.shape[1]}")

Segmented paragraphs saved!
Output file: ..\data\interim\df_sample_split.csv
File size: 2144.53 KB
Total rows (paragraphs): 3142
Total columns: 15


In [6]:
# Lowercase the text of every paragraph
import polars as pl

# Build the expression first, then materialise it into a separate frame so the
# original-case df_paragraphs stays untouched.
# NOTE(review): the tokenization cell below reads df_paragraphs['speechContent'],
# not this frame, so the lowercased text is not consumed by the visible
# pipeline -- confirm whether that is intended.
lowercase_expr = pl.col('speechContent').str.to_lowercase().alias('speechContent')
df_paragraphs_lowercase = df_paragraphs.with_columns(lowercase_expr)

print("Lowercase conversion complete!")
print(f"\nAll {df_paragraphs_lowercase.shape[0]} paragraphs converted to lowercase")
print(f"\nSample (first paragraph, first 200 chars):")
print(df_paragraphs_lowercase['speechContent'][0][:200])

Lowercase conversion complete!

All 3142 paragraphs converted to lowercase

Sample (first paragraph, first 200 chars):
herr präsident! liebe kolleginnen und kollegen! vor


In [7]:
# Tokenize all paragraphs using spaCy
import spacy
import polars as pl

# Load German language model
try:
    nlp = spacy.load('de_core_news_sm')
    print("spaCy model 'de_core_news_sm' loaded successfully!")
except OSError:
    print("Model not found. Installing de_core_news_sm...")
    import subprocess
    import sys
    # sys.executable is the interpreter running this kernel; a bare 'python'
    # may resolve to a different environment, so the model would be installed
    # where this kernel cannot see it. check=True surfaces a failed download
    # here instead of as a second, less clear OSError on the load below.
    subprocess.run(
        [sys.executable, '-m', 'spacy', 'download', 'de_core_news_sm'],
        check=True,
    )
    nlp = spacy.load('de_core_news_sm')

# Tokenize each paragraph
def tokenize_text(text: str) -> list:
    """Tokenize text using spaCy and return the list of token strings.

    Only the tokenizer of the module-level ``nlp`` pipeline is run
    (``nlp.make_doc``). The remaining pipeline components annotate tokens but
    their output is not read here, so running them would only cost time.

    Args:
        text: The text to tokenize.

    Returns:
        List with the ``.text`` of every token, in document order.
    """
    return [token.text for token in nlp.make_doc(text)]

# Tokenize every paragraph (one token list per row, in row order)
tokens_list = [tokenize_text(text) for text in df_paragraphs['speechContent']]

# Attach the token lists, then derive the per-paragraph token count from them
df_paragraphs = (
    df_paragraphs
    .with_columns(pl.Series('tokens', tokens_list))
    .with_columns(pl.col('tokens').list.len().alias('token_count'))
)

print("Tokenization complete!")
print(f"\nDataframe shape: {df_paragraphs.shape}")
print(f"New columns: {df_paragraphs.columns}")

print(f"\nToken count statistics:")
print(f"  Min: {df_paragraphs['token_count'].min()}")
print(f"  Max: {df_paragraphs['token_count'].max()}")
print(f"  Mean: {df_paragraphs['token_count'].mean():.1f}")
print(f"  Median: {df_paragraphs['token_count'].median():.0f}")

print(f"\nSample (first paragraph tokens):")
print(f"  Tokens: {df_paragraphs['tokens'][0]}")
print(f"  Token count: {df_paragraphs['token_count'][0]}")

spaCy model 'de_core_news_sm' loaded successfully!
Tokenization complete!

Dataframe shape: (3142, 17)
New columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count']

Token count statistics:
  Min: 2
  Max: 4174
  Mean: 91.6
  Median: 34

Sample (first paragraph tokens):
  Tokens: shape: (9,)
Series: '' [str]
[
	"Herr"
	"Präsident"
	"!"
	"Liebe"
	"Kolleginnen"
	"und"
	"Kollegen"
	"!"
	"Vor"
]
  Token count: 9


In [8]:
# Remove stopwords from tokens
import polars as pl
import spacy

# Reloads the German model in this cell (the tokenization cell above also
# creates an `nlp` instance); only its default stopword list is used here.
nlp = spacy.load('de_core_news_sm')

# Get German stopwords from spaCy
german_stopwords = nlp.Defaults.stop_words

print("Stopword Removal")
print(f"Number of German stopwords: {len(german_stopwords)}")
# Sort before sampling: iteration order of a set of strings changes between
# interpreter runs, so an unsorted slice prints a different sample each time.
print(f"Sample stopwords: {sorted(german_stopwords)[:20]}")

# Function to remove stopwords
def remove_stopwords(tokens: list) -> list:
    """Return the tokens that are not German stopwords, in their original order.

    Each token is lowercased before it is looked up in the module-level
    ``german_stopwords`` collection; surviving tokens keep their original
    casing.
    """
    kept = []
    for word in tokens:
        if word.lower() in german_stopwords:
            continue
        kept.append(word)
    return kept

# Filter stopwords out of every token list, then count what is left per row
df_paragraphs = (
    df_paragraphs
    .with_columns(
        pl.col('tokens')
        .map_elements(remove_stopwords, return_dtype=pl.List(pl.Utf8))
        .alias('tokens_no_stopwords')
    )
    .with_columns(
        pl.col('tokens_no_stopwords').list.len().alias('token_count_no_stopwords')
    )
)

print("\nStopword removal complete!")
print(f"\nDataframe shape: {df_paragraphs.shape}")
print(f"New columns: {df_paragraphs.columns}")

# Mean token counts before/after, reused for the relative reduction below
mean_tokens_before = df_paragraphs['token_count'].mean()
mean_tokens_after = df_paragraphs['token_count_no_stopwords'].mean()

print(f"\nToken count comparison:")
print(f"  Original tokens - Mean: {mean_tokens_before:.1f}")
print(f"  After stopword removal - Mean: {mean_tokens_after:.1f}")
print(f"  Reduction: {(1 - mean_tokens_after / mean_tokens_before) * 100:.1f}%")

print(f"\nSample comparison (first paragraph):")
print(f"  Original tokens ({df_paragraphs['token_count'][0]}): {df_paragraphs['tokens'][0]}")
print(f"  After stopword removal ({df_paragraphs['token_count_no_stopwords'][0]}): {df_paragraphs['tokens_no_stopwords'][0]}")

Stopword Removal
Number of German stopwords: 543
Sample stopwords: ['ging', 'seines', 'kurz', 'mich', 'wollten', 'du', 'für', 'zur', 'dasselbe', 'mögen', 'sei', 'dürfen', 'allein', 'worden', 'eines', 'je', 'gute', 'geschweige', 'sondern', 'einiges']

Stopword removal complete!

Dataframe shape: (3142, 19)
New columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count', 'tokens_no_stopwords', 'token_count_no_stopwords']

Token count comparison:
  Original tokens - Mean: 91.6
  After stopword removal - Mean: 50.1
  Reduction: 45.3%

Sample comparison (first paragraph):
  Original tokens (9): shape: (9,)
Series: '' [str]
[
	"Herr"
	"Präsident"
	"!"
	"Liebe"
	"Kolleginnen"
	"und"
	"Kollegen"
	"!"
	"Vor"
]
  After stopword removal (7): shape: (7,)
Series: '' [str]
[
	"Herr"
	"Präsident"
	"!"
	"Liebe"
	

In [9]:
# Remove punctuation and numbers from tokens
import polars as pl
import string
import re

# No spaCy model is loaded here: this cell only does string/regex work, so the
# former `nlp = spacy.load(...)` was an unused reload (and `spacy` is not
# imported in this cell).

print("Punctuation and Number Removal")
print(f"Punctuation characters: {string.punctuation}")

# Function to remove punctuation and numbers
def remove_punct_and_numbers(tokens: list) -> list:
    """Remove punctuation and numbers from a token list, keeping only letters.

    Every character that is not a letter (punctuation, symbols, whitespace,
    digits and the underscore) is deleted from each token. Tokens that become
    empty -- i.e. tokens made up purely of such characters -- are dropped.

    Args:
        tokens: Iterable of token strings; ``None`` is treated as empty.

    Returns:
        List of the remaining letter-only tokens, in their original order.
    """
    if tokens is None:
        return []

    cleaned = []
    for token in tokens:
        # \W covers punctuation/symbols and \d digits, but '_' counts as a
        # word character and must be listed explicitly; otherwise tokens such
        # as "a_b" keep their underscore.
        letters_only = re.sub(r'[\W\d_]', '', token)
        if letters_only:  # Only add non-empty tokens
            cleaned.append(letters_only)
    return cleaned

# Clean the stopword-free token lists, then count the tokens that remain
df_paragraphs = (
    df_paragraphs
    .with_columns(
        pl.col('tokens_no_stopwords')
        .map_elements(remove_punct_and_numbers, return_dtype=pl.List(pl.Utf8))
        .alias('tokens_clean')
    )
    .with_columns(pl.col('tokens_clean').list.len().alias('token_count_clean'))
)

print("\nPunctuation and number removal complete!")
print(f"\nDataframe shape: {df_paragraphs.shape}")
print(f"Columns: {df_paragraphs.columns}")

# One line per pipeline stage: mean token count of the matching count column
print(f"\nToken count progression:")
for stage_label, count_column in (
    ("Original tokens", 'token_count'),
    ("After stopword removal", 'token_count_no_stopwords'),
    ("After punct/number removal", 'token_count_clean'),
):
    print(f"  {stage_label} - Mean: {df_paragraphs[count_column].mean():.1f}")
print(f"  Total reduction: {(1 - df_paragraphs['token_count_clean'].mean() / df_paragraphs['token_count'].mean()) * 100:.1f}%")

print(f"\nSample comparison (first paragraph):")
print(f"  Original tokens ({df_paragraphs['token_count'][0]}): {df_paragraphs['tokens'][0]}")
print(f"  After stopwords ({df_paragraphs['token_count_no_stopwords'][0]}): {df_paragraphs['tokens_no_stopwords'][0]}")
print(f"  After punct/numbers ({df_paragraphs['token_count_clean'][0]}): {df_paragraphs['tokens_clean'][0]}")

Punctuation and Number Removal
Punctuation characters: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

Punctuation and number removal complete!

Dataframe shape: (3142, 21)
Columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count', 'tokens_no_stopwords', 'token_count_no_stopwords', 'tokens_clean', 'token_count_clean']

Token count progression:
  Original tokens - Mean: 91.6
  After stopword removal - Mean: 50.1
  After punct/number removal - Mean: 30.9
  Total reduction: 66.3%

Sample comparison (first paragraph):
  Original tokens (9): shape: (9,)
Series: '' [str]
[
	"Herr"
	"Präsident"
	"!"
	"Liebe"
	"Kolleginnen"
	"und"
	"Kollegen"
	"!"
	"Vor"
]
  After stopwords (7): shape: (7,)
Series: '' [str]
[
	"Herr"
	"Präsident"
	"!"
	"Liebe"
	"Kolleginnen"
	"Kollegen"
	"!"
]
  After punct/numbers (5): shape: (5,)
S

In [10]:
# Lemmatization
import polars as pl
import spacy

# Load the German model again for this cell (earlier cells also create `nlp`)
nlp = spacy.load('de_core_news_sm')

# Two banner lines, emitted with a single call
print(
    "Lemmatization",
    "Converting tokens to their base/lemma form using spaCy German model...",
    sep="\n",
)

# Function to lemmatize a list of tokens
def lemmatize_tokens(tokens):
    """Lemmatize a list of tokens, each one processed as its own document.

    Every token string is run through the module-level ``nlp`` pipeline
    separately (so without sentence context) and the lemma of the first token
    of the resulting document is taken. The strings are streamed through
    ``nlp.pipe`` so spaCy can batch them instead of paying the per-call
    overhead of ``nlp(...)`` once per token.

    Args:
        tokens: Sequence of token strings, or ``None``.

    Returns:
        List of lemmas, one per input token and in the same order. ``[]`` when
        ``tokens`` is ``None`` or empty. A token whose document contains no
        tokens is kept unchanged.
    """
    if tokens is None or len(tokens) == 0:
        return []

    # Materialise once so the same strings feed both nlp.pipe and the fallback
    token_texts = list(tokens)

    lemmas = []
    # nlp.pipe yields documents in input order, so zip pairs each with its text
    for token_text, doc in zip(token_texts, nlp.pipe(token_texts)):
        if len(doc) > 0:
            lemmas.append(doc[0].lemma_)
        else:
            # If no lemma found, keep the original token
            lemmas.append(token_text)

    return lemmas

# Lemmatize the tokens_clean column row by row with a plain Python loop
def apply_lemmatization(df):
    """Return one lemma list per row of ``df['tokens_clean']``, in row order.

    Each row's token list is passed to ``lemmatize_tokens``; the results are
    collected into a plain Python list.
    """
    return [lemmatize_tokens(row_tokens) for row_tokens in df['tokens_clean']]

# Lemmatize all rows, attach the result, then count lemmas per paragraph
lemma_results = apply_lemmatization(df_paragraphs)
df_paragraphs = (
    df_paragraphs
    .with_columns(pl.Series('tokens_lemma', lemma_results))
    .with_columns(pl.col('tokens_lemma').list.len().alias('token_count_lemma'))
)

print("\nLemmatization complete!")
print(f"\nDataframe shape: {df_paragraphs.shape}")
print(f"Columns: {df_paragraphs.columns}")

# One line per pipeline stage: mean token count of the matching count column
print(f"\nToken count at each stage:")
for stage_label, count_column in (
    ("Original tokens", 'token_count'),
    ("After stopword removal", 'token_count_no_stopwords'),
    ("After punct/number removal", 'token_count_clean'),
    ("After lemmatization", 'token_count_lemma'),
):
    print(f"  {stage_label} - Mean: {df_paragraphs[count_column].mean():.1f}")

print(f"\nSample comparison (first paragraph):")
print(f"  Clean tokens ({df_paragraphs['token_count_clean'][0]}): {df_paragraphs['tokens_clean'][0]}")
print(f"  Lemmatized ({df_paragraphs['token_count_lemma'][0]}): {df_paragraphs['tokens_lemma'][0]}")

Lemmatization
Converting tokens to their base/lemma form using spaCy German model...

Lemmatization complete!

Dataframe shape: (3142, 23)
Columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count', 'tokens_no_stopwords', 'token_count_no_stopwords', 'tokens_clean', 'token_count_clean', 'tokens_lemma', 'token_count_lemma']

Token count at each stage:
  Original tokens - Mean: 91.6
  After stopword removal - Mean: 50.1
  After punct/number removal - Mean: 30.9
  After lemmatization - Mean: 30.9

Sample comparison (first paragraph):
  Clean tokens (5): shape: (5,)
Series: '' [str]
[
	"Herr"
	"Präsident"
	"Liebe"
	"Kolleginnen"
	"Kollegen"
]
  Lemmatized (5): shape: (5,)
Series: '' [str]
[
	"Herr"
	"Präsident"
	"Liebe"
	"Kollegin"
	"Kollege"
]


In [11]:
# Save preprocessed data to processed folder
from pathlib import Path

# Define output path
processed_dir = Path('../data/processed')
# parents=True also creates ../data when it is missing; without it mkdir
# raises FileNotFoundError on a fresh checkout that has no data directory.
processed_dir.mkdir(parents=True, exist_ok=True)

# Save as Parquet (supports nested data like lists)
output_file = processed_dir / 'df_sample_split_preprocessed.parquet'
df_paragraphs.write_parquet(output_file)

print("Preprocessed data saved!")
print(f"Output file: {output_file}")
print(f"File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"Total rows (paragraphs): {df_paragraphs.shape[0]}")
print(f"Total columns: {df_paragraphs.shape[1]}")
print("\nDataframe columns:")
for col in df_paragraphs.columns:
    print(f"  - {col}")

Preprocessed data saved!
Output file: ..\data\processed\df_sample_split_preprocessed.parquet
File size: 2547.12 KB
Total rows (paragraphs): 3142
Total columns: 23

Dataframe columns:
  - id
  - session
  - electoralTerm
  - firstName
  - lastName
  - politicianId
  - speechContent
  - factionId
  - documentUrl
  - positionShort
  - positionLong
  - date
  - speech_length
  - paragraph_number
  - paragraph_length
  - tokens
  - token_count
  - tokens_no_stopwords
  - token_count_no_stopwords
  - tokens_clean
  - token_count_clean
  - tokens_lemma
  - token_count_lemma
