In [2]:
import re
from collections import defaultdict
import os
import pandas as pd

def sanitize_filename(name, max_length=100):
    # Remove/replace invalid characters and truncate
    name = re.sub(r'[\\/:"*?<>|]', '_', name)
    return name[:max_length].strip()

# Match speaker and speech (includes 'De voorzitter')
pattern = re.compile(
    r'(?:'
    r'(?:De heer|Mevrouw)\s+(?P<name>[A-Z][a-zA-Zà-ÿ\'’\- ]{1,40})\s+\((?P<party>[^)]+)\):'
    r'|(?P<voorzitter>De voorzitter):'
    r'|Minister\s+(?P<minister>[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*):'
    r')\s*'
    r'(?P<speech>.*?)'
    r'(?=(?:'
    r'(?:De heer|Mevrouw)\s+[A-Z][a-zA-Zà-ÿ\'’\- ]{1,40}\s+\([^)]+\):'
    r'|De voorzitter:'
    r'|Minister\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*:'
    r'|$))',
    re.DOTALL
)

# Dictionary of party -> list of speeches
party_speeches = defaultdict(list)

# Read and process each file independently
txt_folder = "txt"
file_count = 0
for filename in os.listdir(txt_folder):
    
    if filename.endswith(".txt"):
        file_path = os.path.join(txt_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        print(f"Processing file {file_count + 1}: {filename}")
        file_count += 1

        # Extract speeches from the current file
        for match in pattern.finditer(text):
            if match.group("party"):
                party = match.group("party").strip()
                name = match.group("name").strip() if match.group("name") else ""
            elif match.group("voorzitter"):
                party = "VOORZITTER"
                name = ""
            elif match.group("minister"):
                party = ""
                name = match.group("minister").strip()
            else:
                continue  # Defensive: skip if no speaker found

            speech = match.group("speech").strip()

            if len(party) > 100 or '\n' in party or party.lower().startswith("de heer") or len(party.split()) > 10:
                print(f"⚠️ Skipping invalid party label: {party[:60]}...")
                continue

            # FIX: Append the speech to the party_speeches dictionary
            party_speeches[party].append({
                "Filename": filename,
                "Party": party,
                "Speaker": name,
                "Speech": speech
            })

# Prepare data for CSV
data = []
for party, speeches in party_speeches.items():
    for speech_info in speeches:
        data.append(speech_info)

# Save to a CSV file
output_csv = "party_speeches.csv"
df = pd.DataFrame(data)
print(df.head())  # Print the first few rows for verification
df.to_csv(output_csv, index=False, encoding="utf-8")

print(f"✅ Party speeches have been saved to the CSV file: {output_csv}")


Processing file 1: 0000698e-a0d5-4007-bd38-38285db4f804.pdf.txt
Processing file 2: 00026177-5fb6-41c7-8058-edb4274858d2.pdf.txt
Processing file 3: 00026fc1-d112-4e28-b1d6-52a23efe367a.pdf.txt
Processing file 4: 000299a9-e3b9-4997-9a25-4e2b00e61dca.pdf.txt
Processing file 5: 00035a28-9a80-448e-86c3-9886aa73a9b8.pdf.txt
Processing file 6: 000886c8-c66e-49c1-95fc-cf9cc487480a.pdf.txt
Processing file 7: 000cc27b-14a3-4ae6-afb0-6e926c10dc49.pdf.txt
Processing file 8: 00105eac-ffd9-40c7-be76-08a86c7ddb9c.pdf.txt
Processing file 9: 0011166f-e274-42d1-b39d-7a910c3152a7.pdf.txt
Processing file 10: 00121815-a318-4ef3-8ad6-25150c1711f4.pdf.txt
Processing file 11: 0016b926-2a8d-4bf1-bde3-c754311e117c.pdf.txt
Processing file 12: 001b269e-d256-4bba-9766-224e8d97cf93.pdf.txt
Processing file 13: 001b7d55-d7af-4576-bd29-0abf847fbbff.pdf.txt
Processing file 14: 001c41ef-c433-425a-b489-ae74399aa651.pdf.txt
Processing file 15: 001d9c42-a396-49a8-9809-cba247a568f1.pdf.txt
Processing file 16: 00220f75-ab84-

In [2]:
import pandas as pd
import torch
import nltk
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2024-1-1")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

# Path to speeches CSV
input_csv = "party_speeches.csv"
output_csv = "party_speeches_classification.csv"

# Read speeches from CSV
df_speeches = pd.read_csv(input_csv)
results = []

for index, row in df_speeches.iterrows():
    party_name = row["Party"]
    text = row["Speech"]

    if not text or pd.isna(text):
        print(f"⚠️ Skipping empty speech for party: {party_name}")
        continue

    print(f"\n{party_name}: Speech has {len(text)} characters")

    # Split into sentences and optionally filter
    sentences = sent_tokenize(text)
    print(f"{party_name}: Extracted {len(sentences)} total sentences")

    max_samples = 50  # Limit number of samples for speed
    sentences = [s for s in sentences if len(s.split()) > 5][:max_samples]
    print(f"{party_name}: Using {len(sentences)} filtered sentences for classification")

    if not sentences:
        print(f"⚠️ No valid sentences found for party: {party_name}")
        continue

    topic_scores = torch.zeros(len(model.config.id2label))

    for i, sent in enumerate(sentences):
        inputs = tokenizer(sent,
                           return_tensors="pt",
                           max_length=200,
                           padding="max_length",
                           truncation=True)

        print(f"Sentence {i+1}: {len(inputs['input_ids'][0])} tokens")

        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=1).squeeze()

        topic_scores += probs

    # Average topic scores
    topic_scores /= len(sentences)

    # Format results
    probabilities = {
        model.config.id2label[i]: round(score.item() * 100, 2)
        for i, score in enumerate(topic_scores)
    }
    probabilities = dict(sorted(probabilities.items(), key=lambda x: x[1], reverse=True))
    predicted_class = max(probabilities, key=probabilities.get)

    result = {
        "party": party_name,
        "predicted_class": predicted_class,
        "top_3": list(probabilities.items())[:3]
    }
    results.append(result)

    print(f"{party_name}: {predicted_class}")
    print(f"Top 3 topics: {result['top_3']}\n")

# Prepare data for CSV
classification_data = []
for result in results:
    classification_data.append({
        "Party": result["party"],
        "Predicted Class": result["predicted_class"],
        "Top 1 Topic": result["top_3"][0][0],
        "Top 1 Probability (%)": result["top_3"][0][1],
        "Top 2 Topic": result["top_3"][1][0] if len(result["top_3"]) > 1 else None,
        "Top 2 Probability (%)": result["top_3"][1][1] if len(result["top_3"]) > 1 else None,
        "Top 3 Topic": result["top_3"][2][0] if len(result["top_3"]) > 2 else None,
        "Top 3 Probability (%)": result["top_3"][2][1] if len(result["top_3"]) > 2 else None,
    })

# Save results to a CSV file
df_classification = pd.DataFrame(classification_data)
df_classification.to_csv(output_csv, index=False, encoding="utf-8")

print(f"✅ Classification results saved to: {output_csv}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jacco\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



VOORZITTER: Speech has 242 characters
VOORZITTER: Extracted 2 total sentences
VOORZITTER: Using 2 filtered sentences for classification
Sentence 1: 200 tokens
Sentence 2: 200 tokens
VOORZITTER: 413 - Nationalisation
Top 3 topics: [('413 - Nationalisation', 2.9), ('412 - Controlled Economy', 2.73), ('604 - Traditional Morality: Negative', 2.52)]


VOORZITTER: Speech has 2165 characters
VOORZITTER: Extracted 21 total sentences
VOORZITTER: Using 19 filtered sentences for classification
Sentence 1: 200 tokens
Sentence 2: 200 tokens
Sentence 3: 200 tokens
Sentence 4: 200 tokens
Sentence 5: 200 tokens
Sentence 6: 200 tokens
Sentence 7: 200 tokens
Sentence 8: 200 tokens
Sentence 9: 200 tokens
Sentence 10: 200 tokens
Sentence 11: 200 tokens
Sentence 12: 200 tokens
Sentence 13: 200 tokens
Sentence 14: 200 tokens
Sentence 15: 200 tokens
Sentence 16: 200 tokens
Sentence 17: 200 tokens
Sentence 18: 200 tokens
Sentence 19: 200 tokens
VOORZITTER: 606 - Civic Mindedness: Positive
Top 3 topics: [('60