In [9]:
import pandas as pd
import numpy as np
# Add these imports for stopwords
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from pathlib import Path

# Flag read by the stopword summary printed in the next cell. It is set
# unconditionally here. NOTE(review): it does not reflect whether the NLTK
# download above actually succeeded.
nltk_available = True

# Show every column when a frame is displayed (the frames are ~30 columns wide).
pd.options.display.max_columns = None

# Root folder of the pickled datasets. Built with pathlib so the notebook
# resolves the same files on Windows, macOS and Linux; the previous raw strings
# hard-coded "\" separators, which are only path separators on Windows.
DATA_DIR = Path("data folder")

# Load the data with embeddings (already segmented).
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project's own pipeline.
# The generic 'Segment_ID' column is dropped for AT and HR; drop() returns a new
# frame, which keeps this cell free of in-place mutation.
AT_combined = pd.read_pickle(DATA_DIR / "AT" / "AT_final.pkl").drop(columns=['Segment_ID'])
HR_combined = pd.read_pickle(DATA_DIR / "HR" / "HR_final.pkl").drop(columns=['Segment_ID'])

# GB keeps its 'Segment_ID' column.
GB = pd.read_pickle(DATA_DIR / "GB" / "GB_final.pkl")

print(f"✅ Loaded data: {AT_combined.shape}")
print(f"✅ Loaded data: {HR_combined.shape}")
print(f"✅ Loaded data: {GB.shape}")

✅ Loaded data: (231759, 32)
✅ Loaded data: (504338, 32)
✅ Loaded data: (670912, 29)


In [10]:
# === BERTOPIC SETUP WITH GUIDED TOPICS ===
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
import openai
import os
from dotenv import load_dotenv
import time
import gc  # For garbage collection
from tqdm import tqdm  # Add tqdm for progress tracking

# Load environment variables
load_dotenv()

# The 22 target categories every discovered topic is classified into.
# Kept as a list because downstream code iterates it in this order.
label_list = [
    "Education",
    "Technology",
    "Health",
    "Environment",
    "Housing",
    "Labor",
    "Defense",
    "Government Operations",
    "Social Welfare",
    "Other",
    "Macroeconomics",
    "Domestic Commerce",
    "Civil Rights",
    "International Affairs",
    "Transportation",
    "Immigration",
    "Law and Crime",
    "Agriculture",
    "Foreign Trade",
    "Culture",
    "Public Lands",
    "Energy",
]

# Custom English stopwords: forms of address, parliamentary roles and procedure
# words, plus generic verbs and fillers. Written as one whitespace-separated
# block and split into a list, so adding a word is a one-token change.
english_custom_stopwords = """
    mr mrs ms dr madam honorable honourable member members vp sp fp
    minister speaker deputy president chairman chair schilling
    secretary lord lady question order point debate motion amendment
    congratulations congratulate thanks thank say one want know think
    believe see go come give take people federal government austria
    austrian committee call said already please request proceed reading
    course welcome council open written contain items item yes no
    following next speech year years state also would like may must
    upon indeed session meeting report commission behalf gentleman gentlemen
    ladies applause group colleague colleagues issue issues chancellor court
    ask answer reply regard regarding regards respect respectfully sign
    shall procedure declare hear minutes speaking close abg mag orf wait
""".split()

# Comprehensive German stopwords (NLTK + custom parliamentary terms).
# This is the NLTK half: the German word list from the `stopwords` corpus reader
# imported from nltk.corpus in the first cell.
german_nltk_stopwords = stopwords.words('german')

# Custom German stopwords, merged with the NLTK list further down.
# Every entry is a single word: stop words are compared with individual tokens,
# so the former phrase entries 'gern geschehen' and 'nichts zu danken' could
# never match anything. They are now listed word by word ('gern', 'geschehen',
# 'nichts', 'zu', 'danken'). The duplicate 'danke' entry was removed as well.
german_custom_stopwords = [
    # Basic German stopwords
    'der', 'die', 'das', 'und', 'in', 'zu', 'den', 'mit', 'von', 'für',
    'auf', 'ist', 'im', 'sich', 'eine', 'sie', 'dem', 'nicht', 'ein', 'als',
    'auch', 'es', 'an', 'werden', 'aus', 'er', 'hat', 'dass', 'wir', 'ich',
    'haben', 'sind', 'kann', 'sehr', 'meine', 'muss', 'doch', 'wenn', 'sein',
    'dann', 'weil', 'bei', 'nach', 'so', 'oder', 'aber', 'vor', 'über', 'noch',
    'nur', 'wie', 'war', 'waren', 'wird', 'wurde', 'wurden', 'ihr', 'ihre',
    'ihren', 'seiner', 'seine', 'seinem', 'seinen', 'dieser', 'diese', 'dieses',
    'durch', 'ohne', 'gegen', 'unter', 'zwischen', 'während', 'bis', 'seit',
    # Politeness formulas, one token per entry
    'danke', 'bitte', 'gern', 'geschehen', 'nichts', 'danken',
    # Austrian parliamentary terms
    'abgeordnete', 'abgeordneten', 'bundesregierung',
    'bundeskanzler', 'nationalrat', 'bundesrat', 'parlament', 'fraktion',
    'ausschuss', 'sitzung', 'präsident', 'vizepräsident', 'minister',
    'staatssekretär', 'klubobmann', 'antrag', 'anfrage', 'interpellation',
    'dringliche', 'aktuelle', 'stunde', 'debatte', 'abstimmung', 'beschluss',
    'gesetz', 'novelle', 'verordnung', 'regierungsvorlage', 'initiativantrag',
    'dankeschön', 'geschätzte', 'kolleginnen', 'kollegen', 'hohes'
]

# General-purpose Croatian stopwords (list taken from a GitHub repository);
# the parliamentary terms are kept in a separate list.
# Stored as one whitespace-separated block and split into a list.
croatian_github_stopwords = """
    a ako ali bi bih bila bili bilo bio bismo
    biste biti bumo da do duž ga hoće hoćemo
    hoćete hoćeš hoću i iako ih ili iz ja je
    jedna jedne jedno jer jesam jesi jesmo jest
    jeste jesu jim joj još ju kada kako kao
    koja koje koji kojima koju kroz li me mene
    meni mi mimo moj moja moje mu na nad nakon
    nam nama nas naš naša naše našeg ne nego
    neka neki nekog neku nema netko neće nećemo
    nećete nećeš neću nešto ni nije nikoga nikoje
    nikoju nisam nisi nismo niste nisu njega njegov
    njegova njegovo njemu njezin njezina njezino njih
    njihov njihova njihovo njim njima njoj nju no
    o od odmah on ona oni ono ova pa pak
    po pod pored prije s sa sam samo se sebe
    sebi si smo ste su sve svi svog svoj svoja
    svoje svom ta tada taj tako te tebe tebi
    ti to toj tome tu tvoj tvoja tvoje u uz
    vam vama vas vaš vaša vaše već vi vrlo za
    zar će ćemo ćete ćeš ću što
""".split()

# Croatian forms of address, office titles and procedure words.
# Stored as one whitespace-separated block and split into a list.
croatian_parliamentary_stopwords = """
    zastupnik zastupnica zastupnici hvala sabor hrvatska
    vlada molim gospodin gospođa premijer predsjednik
    predsjednica ministar ministrica državni tajnik tajnica
    odbor sjednica rasprava prijedlog zakon odluka
    glasovanje amandman interpelacija pitanje odgovor
    klupski obnašatelj dužnosti potpredsjednik potpredsjednica
    kolegice kolege dame gospodo poštovani poštovana
""".split()

def _unique_words(first, second):
    """Concatenate two word collections and drop duplicates (order is arbitrary)."""
    return list(set(list(first) + list(second)))


# One de-duplicated stopword list per language: base list + custom additions.
all_german_stopwords = _unique_words(german_nltk_stopwords, german_custom_stopwords)
all_croatian_stopwords = _unique_words(croatian_github_stopwords, croatian_parliamentary_stopwords)
all_english_stopwords = _unique_words(ENGLISH_STOP_WORDS, english_custom_stopwords)

# Summary of what was loaded, so list sizes can be sanity-checked at a glance.
german_source = 'NLTK + custom' if nltk_available else 'custom only'
print("📚 Stopwords loaded:")
print(f"  • English: {len(all_english_stopwords)} words")
print(f"  • German: {len(all_german_stopwords)} words ({german_source})")
print(f"  • Croatian: {len(all_croatian_stopwords)} words (custom)")

print(f"🎯 Target categories: {len(label_list)} topics")

📚 Stopwords loaded:
  • English: 417 words
  • German: 274 words (NLTK + custom)
  • Croatian: 219 words (custom)
🎯 Target categories: 22 topics


In [11]:
# === DATA VALIDATION AND OPTIMIZATION ===
def validate_data_before_processing(df, dataset_name, text_column, segment_id_column, embedding_column):
    """Print a data-quality summary for `df` and report whether it is complete.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the printed header.
        text_column: Name of the column holding the text.
        segment_id_column: Name of the column holding the segment identifier.
        embedding_column: Name of the column holding the embeddings.

    Returns:
        Truthy when none of the three columns contains a missing value,
        falsy otherwise. Text-length and segment-size statistics are printed
        for information only and do not influence the result.
    """
    print(f"\n🔍 Validating {dataset_name} dataset...")

    # Count nulls in all three columns first, then report them together.
    inspected = (
        ("text", text_column),
        ("segment IDs", segment_id_column),
        ("embeddings", embedding_column),
    )
    null_counts = [df[column].isna().sum() for _, column in inspected]
    for (label, _), count in zip(inspected, null_counts):
        print(f"  • Missing {label}: {count}")

    # Character-length spread of the text column.
    char_counts = df[text_column].str.len()
    shortest, average, longest = char_counts.min(), char_counts.mean(), char_counts.max()
    print(f"  • Text length stats: min={shortest}, mean={average:.0f}, max={longest}")

    # Number of rows that share each segment id.
    rows_per_segment = df.groupby(segment_id_column).size()
    print(f"  • Segments: {len(rows_per_segment)} unique, avg {rows_per_segment.mean():.1f} rows per segment")

    # Segments consisting of a single row are flagged because they might cause issues.
    singleton_count = int((rows_per_segment < 2).sum())
    if singleton_count > 0:
        print(f"  ⚠️  Warning: {singleton_count} segments have only 1 row")

    text_complete, ids_complete, embeddings_complete = (count == 0 for count in null_counts)
    return text_complete and ids_complete and embeddings_complete

# Add cost estimation
def estimate_openai_costs(num_topics, cost_per_request=0.03):
    """Print and return a rough OpenAI cost estimate for labelling topics.

    Two API calls are assumed per topic: one to name it and one to classify it.

    Args:
        num_topics: Number of topics that will be sent to the API.
        cost_per_request: Assumed price of a single request in USD. The default
            of 0.03 is the original rough GPT-4 figure; pass a lower value when
            estimating for a cheaper model.

    Returns:
        The estimated total cost in USD.
    """
    total_requests = num_topics * 2  # Name + classification
    estimated_cost = total_requests * cost_per_request
    print(f"💰 Estimated OpenAI cost: ~${estimated_cost:.2f} for {num_topics} topics ({total_requests} API calls)")
    return estimated_cost

In [12]:
# === SEGMENT-LEVEL TOPIC MODELING FUNCTIONS ===
from openai import OpenAI

def generate_topic_name(topic_words, max_retries=3):
    """Generate a descriptive topic name using the OpenAI chat API.

    Args:
        topic_words: List of keywords describing the topic. Only the first
            eight are sent to the model.
        max_retries: Maximum number of API attempts.

    Returns:
        The model's answer with surrounding whitespace and wrapping quotes
        removed. "Miscellaneous" is returned when `topic_words` is not a
        non-empty list, when OPENAI_API_KEY is not set, or when no attempt
        produced a non-empty name.
    """
    fallback_name = "Miscellaneous"
    if not isinstance(topic_words, list) or not topic_words:
        return fallback_name

    # Checked once up front: a missing key cannot be fixed by retrying.
    if not os.getenv('OPENAI_API_KEY'):
        print("Error: OPENAI_API_KEY not set")
        return fallback_name

    keywords_str = ', '.join(topic_words[:8])

    prompt = f"""Generate a clear, concise name for a parliamentary debate topic based on these keywords: {keywords_str}

Requirements:
- Maximum 3 words
- Noun-based, no action words (avoid: improving, achieving, strategies, developing, etc.)
- Direct topic name (e.g., "NHS Hospitals", "Climate Emissions", "Brexit Referendum")
- Parliamentary/political context

RESPONSE: Topic name only."""

    client = None  # created lazily inside the try so construction errors are retried too
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            if client is None:
                client = OpenAI()
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You generate concise parliamentary topic names. Use nouns only, no action words. Respond with the name only."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=15
            )

            # The content may be missing; treat that like an empty answer.
            raw_answer = response.choices[0].message.content or ""
            # Remove only wrapping quotes so apostrophes inside a name survive.
            topic_name = raw_answer.strip().strip('"\'').strip()
            if topic_name:
                return topic_name

            print(f"Empty topic name returned (attempt {attempt + 1}/{max_retries})")
            wait_time = 1

        except Exception as e:
            if "rate_limit" in str(e).lower():
                # Exponential back-off: 2s, 3s, 5s, ...
                wait_time = 2 ** attempt + 1
                if is_last_attempt:
                    print(f"Rate limit hit (attempt {attempt + 1}/{max_retries}); giving up.")
                else:
                    print(f"Rate limit hit. Waiting {wait_time}s (attempt {attempt + 1}/{max_retries})...")
            else:
                print(f"Error generating topic name (attempt {attempt + 1}/{max_retries}): {e}")
                wait_time = 1

        # No point in sleeping when there is no further attempt.
        if not is_last_attempt:
            time.sleep(wait_time)

    return fallback_name

def _match_category_label(answer, categories):
    """Map a free-text model answer onto one entry of `categories`, or return None.

    Matching runs from strictest to loosest so that a loose rule can never
    pre-empt a strict one: exact, case-insensitive exact, category contained in
    the answer, answer contained in a category, then keyword fallbacks.
    """
    cleaned = (answer or "").strip().strip('"\'').strip().rstrip('.').strip()
    if not cleaned:
        # An empty string is a substring of every category; never match on it.
        return None

    if cleaned in categories:
        return cleaned

    lowered = cleaned.lower()
    for category in categories:
        if category.lower() == lowered:
            return category

    # Answer mentions a full category name; prefer the longest one so that
    # "agriculture ..." resolves to Agriculture rather than Culture.
    mentioned = [category for category in categories if category.lower() in lowered]
    if mentioned:
        return max(mentioned, key=len)

    # Answer is a fragment of a category name (e.g. "Rights"). Very short
    # fragments are ignored; the shortest containing category is the closest fit.
    if len(lowered) >= 3:
        containing = [category for category in categories if lowered in category.lower()]
        if containing:
            return min(containing, key=len)

    # Common variations mapping
    category_mapping = {
        'macro': 'Macroeconomics', 'economics': 'Macroeconomics', 'economic': 'Macroeconomics',
        'rights': 'Civil Rights', 'welfare': 'Social Welfare', 'social': 'Social Welfare',
        'crime': 'Law and Crime', 'legal': 'Law and Crime', 'justice': 'Law and Crime',
        'foreign': 'International Affairs', 'international': 'International Affairs',
        'trade': 'Foreign Trade', 'government': 'Government Operations',
        'administration': 'Government Operations'
    }
    for key, mapped_category in category_mapping.items():
        if key in lowered:
            return mapped_category

    return None


def classify_topic_to_22_categories(topic_words, topic_id=-1, max_retries=3):
    """Classify a topic into one of the predefined categories using the OpenAI chat API.

    Args:
        topic_words: List of keywords describing the topic. Only the first
            eight are sent to the model.
        topic_id: Identifier used in log messages only.
        max_retries: Maximum number of API attempts.

    Returns:
        An entry of `label_list`. "Other" is returned when `topic_words` is not
        a non-empty list, when OPENAI_API_KEY is not set, when the answer cannot
        be mapped onto a category, or when every attempt fails.
    """
    if not isinstance(topic_words, list) or not topic_words:
        return "Other"

    # Checked once up front: a missing key cannot be fixed by retrying.
    if not os.getenv('OPENAI_API_KEY'):
        print(f"Error: OPENAI_API_KEY not set for topic {topic_id}")
        return "Other"

    keywords_str = ', '.join(topic_words[:8])
    categories_short = '\n'.join([f"• {cat}" for cat in label_list])

    prompt = f"""Classify this parliamentary topic into ONE category from the list below.

TOPIC KEYWORDS: {keywords_str}

CATEGORIES:
{categories_short}

Rules:
- Choose the single best match
- Climate/environment/emissions → "Environment"
- NHS/hospitals/health → "Health"
- Parliamentary procedures → "Other or Government Operations"
- Economic topics → "Macroeconomics"
- Social issues → "Social Welfare"
- Legal/court topics → "Law and Crime"

RESPONSE: Category name only."""

    client = None  # created lazily inside the try so construction errors are retried too
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            if client is None:
                client = OpenAI()
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You classify parliamentary topics. Respond with exact category name only."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=20
            )

            matched = _match_category_label(response.choices[0].message.content, label_list)
            return matched if matched is not None else "Other"

        except Exception as e:
            if "rate_limit" in str(e).lower():
                # Exponential back-off: 2s, 3s, 5s, ...
                wait_time = 2 ** attempt + 1
                if is_last_attempt:
                    print(f"Rate limit hit for topic {topic_id} (attempt {attempt + 1}/{max_retries}); giving up.")
                else:
                    print(f"Rate limit hit for topic {topic_id}. Waiting {wait_time}s (attempt {attempt + 1}/{max_retries})...")
            else:
                print(f"Error classifying topic {topic_id} (attempt {attempt + 1}/{max_retries}): {e}")
                wait_time = 1

        # No point in sleeping when there is no further attempt.
        if not is_last_attempt:
            time.sleep(wait_time)

    return "Other"

def _normalize_stopwords(words):
    """Bring stop words into the form the vectorizer sees tokens in.

    The CountVectorizer configured below lowercases text and strips accents, so
    an accented entry such as 'für' could never equal the token 'fur'. Entries
    are lowercased, accent-stripped and split on whitespace (stop words are
    compared with single tokens), then de-duplicated and sorted.
    """
    import unicodedata

    normalized = set()
    for entry in words:
        decomposed = unicodedata.normalize('NFKD', str(entry).lower())
        without_accents = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
        normalized.update(without_accents.split())
    return sorted(normalized)


def _build_topic_model(stopwords_list, min_cluster_size):
    """Assemble the BERTopic pipeline used with pre-computed embeddings."""
    # Set up BERTopic components with memory-efficient settings
    vectorizer_model = CountVectorizer(
        stop_words=stopwords_list,
        ngram_range=(1, 1),  # Unigrams only, to reduce memory usage
        min_df=5,  # Raised from 2 to reduce vocabulary size
        max_df=0.95,  # Remove very common words
        max_features=2000,  # Reduced from 5000 to save memory
        lowercase=True,
        strip_accents='unicode'
    )

    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
        low_memory=True,
        n_jobs=1  # Single thread to reduce memory
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True
    )

    # NOTE(review): the probabilities returned by fit_transform are not used by
    # the caller below; calculate_probabilities is kept so the returned model
    # behaves as before.
    return BERTopic(
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        verbose=True,
        calculate_probabilities=True,
        embedding_model=None  # Explicitly None since embeddings are provided
    )


def _label_topics_with_openai(topic_model, topic_info):
    """Name and categorise every non-noise topic; return (names, categories) dicts keyed by topic id."""
    topic_names = {}
    topic_categories = {}

    for _, row in topic_info.iterrows():
        topic_id = row['Topic']
        if topic_id == -1:  # Skip noise topic
            continue

        # Drop empty placeholder words so they are not sent to the API.
        topic_words = [word for word, _ in topic_model.get_topic(topic_id) if word]
        print(f"Processing topic {topic_id}: {topic_words[:10]}...")

        topic_names[topic_id] = generate_topic_name(topic_words)
        topic_categories[topic_id] = classify_topic_to_22_categories(topic_words, topic_id)

        print(f"  → Name: {topic_names[topic_id]}")
        print(f"  → Category: {topic_categories[topic_id]}")

        time.sleep(0.3)  # Light pause between topics

    return topic_names, topic_categories


def run_bertopic_on_segments(df, dataset_name, language, text_column, segment_id_column, embedding_column, min_cluster_size=15):
    """Run BERTopic on segment-level data using pre-computed embeddings.

    Rows are grouped by `segment_id_column`: their text is joined with spaces
    and the first embedding of each group is used. Topics found by BERTopic are
    named and categorised through the OpenAI helpers and merged back onto `df`.

    Args:
        df: Input frame with one row per speech.
        dataset_name: Dataset label, used in log output and result column names.
        language: "english", "german" or "croatian"; selects the stop-word
            list. Any other value falls back to the English list.
        text_column: Column holding the text to model.
        segment_id_column: Column identifying the segment a row belongs to.
        embedding_column: Column holding the segment embedding of each row.
        min_cluster_size: Passed to HDBSCAN as the minimum cluster size.

    Returns:
        A tuple `(df_result, topic_model, topic_info)`. `df_result` is `df`
        plus the columns `Segment_Topic_<dataset>_<language>`,
        `Segment_Subtopic_<dataset>_<language>` and
        `Segment_Category_<dataset>_<language>`. On any failure (missing
        columns, failed validation, bad embeddings, modelling error) the
        original `df` is returned with `None, None`.
    """
    print(f"\n🔍 Running BERTopic for {dataset_name} ({language}) - Segment Level")
    print(f"   Text column: {text_column}")
    print(f"   Segment ID column: {segment_id_column}")
    print(f"   Embedding column: {embedding_column}")
    print(f"   Min cluster size: {min_cluster_size}")

    # Column check comes first: validation indexes these columns and would
    # raise KeyError on a missing one instead of reporting it.
    required_columns = [text_column, segment_id_column, embedding_column]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"❌ Missing columns: {missing_columns}")
        return df, None, None

    if not validate_data_before_processing(df, dataset_name, text_column, segment_id_column, embedding_column):
        print("❌ Data validation failed, skipping processing")
        return df, None, None

    # Group by segment and aggregate text
    print("📝 Grouping text by segments...")
    grouped_data = df.groupby(segment_id_column).agg({
        text_column: ' '.join,
        embedding_column: 'first'  # Use the first embedding (should be same for all in segment)
    }).reset_index()

    if grouped_data.empty:
        print("❌ No segments to process")
        return df, None, None

    documents = grouped_data[text_column].tolist()
    embeddings_raw = grouped_data[embedding_column].tolist()
    segment_ids = grouped_data[segment_id_column].tolist()

    # Debug: Check embeddings format
    print(f"🔍 Debug - First embedding type: {type(embeddings_raw[0])}")
    if hasattr(embeddings_raw[0], 'shape'):
        print(f"🔍 Debug - First embedding shape: {embeddings_raw[0].shape}")

    # Convert to numpy array and validate
    try:
        embeddings = np.array(embeddings_raw)
        print(f"📊 Prepared {len(documents)} segments for topic modeling")
        print(f"   Embedding shape: {embeddings.shape}")

        if embeddings.ndim != 2:
            print(f"❌ Invalid embedding dimensions: {embeddings.ndim}, expected 2")
            return df, None, None

        if np.any(np.isnan(embeddings)):
            print("❌ Embeddings contain NaN values")
            return df, None, None

    except Exception as e:
        print(f"❌ Error processing embeddings: {e}")
        return df, None, None

    # Language-specific stop words, normalised the same way the vectorizer
    # normalises tokens so that accented entries actually match.
    stopwords_by_language = {
        "english": all_english_stopwords,
        "german": all_german_stopwords,
        "croatian": all_croatian_stopwords,
    }
    if language not in stopwords_by_language:
        print(f"⚠️ Unknown language '{language}', falling back to English stopwords")
    stopwords_list = _normalize_stopwords(stopwords_by_language.get(language, all_english_stopwords))

    topic_model = _build_topic_model(stopwords_list, min_cluster_size)

    topic_column = f'Segment_Topic_{dataset_name}_{language}'
    subtopic_column = f'Segment_Subtopic_{dataset_name}_{language}'
    category_column = f'Segment_Category_{dataset_name}_{language}'

    print("🤖 Fitting BERTopic model...")
    try:
        # Ensure embeddings are float32 for compatibility
        embeddings = embeddings.astype(np.float32)

        topics, probabilities = topic_model.fit_transform(documents, embeddings)
        print(f"✅ Topic modeling completed!")
        print(f"   Found {len(set(topics))} topics (including noise: {-1 in topics})")

        topic_info = topic_model.get_topic_info()
        print(f"   Topic distribution: {topic_info['Count'].describe()}")

        print("\n🤖 Processing topics with OpenAI API...")
        topic_names, topic_categories = _label_topics_with_openai(topic_model, topic_info)

        # One row per segment; the noise topic (-1) gets the "Noise"/"Other" labels.
        segment_topics = pd.DataFrame({
            segment_id_column: segment_ids,
            topic_column: topics,
            subtopic_column: [topic_names.get(t, "Noise") for t in topics],
            category_column: [topic_categories.get(t, "Other") for t in topics]
        })

        # Drop result columns left over from an earlier run so the merge
        # replaces them instead of producing _x/_y duplicates.
        stale_columns = [col for col in (topic_column, subtopic_column, category_column) if col in df.columns]
        df_result = df.drop(columns=stale_columns).merge(segment_topics, on=segment_id_column, how='left')

        print(f"✅ Successfully processed {dataset_name} ({language}) segments")
        print(f"   Added columns: Segment_Topic, Segment_Subtopic, Segment_Category")

        return df_result, topic_model, topic_info

    except Exception as e:
        # Best effort: report the failure and hand back the untouched input.
        print(f"❌ Error in topic modeling: {e}")
        import traceback
        traceback.print_exc()
        return df, None, None

In [None]:
# === PROCESS GB DATASET (ENGLISH ONLY) ===
print("🇬🇧 Processing British Parliament (GB) - Segment Level...")

# Show the schema so the column names used below can be checked by eye
print("Available columns in GB dataset:")
print(GB.columns.tolist())

# Segment-level topic modeling; GB is the largest dataset, hence the larger
# minimum cluster size.
gb_run_options = dict(
    dataset_name="GB",
    language="english",
    text_column="Text",
    segment_id_column="Segment_ID",
    embedding_column="segment_embeddings_english",
    min_cluster_size=20,
)
GB_processed, gb_model, gb_topics = run_bertopic_on_segments(GB, **gb_run_options)

# Expose the results under the required names (my_topic and my_subtopic).
# The topic column is only present when the run succeeded.
gb_run_failed = 'Segment_Topic_GB_english' not in GB_processed.columns
if gb_run_failed:
    print("⚠️ Topic modeling failed for GB dataset")
else:
    GB_processed['my_subtopic'] = GB_processed['Segment_Subtopic_GB_english']
    GB_processed['my_topic'] = GB_processed['Segment_Category_GB_english']
    print("✅ Added my_topic and my_subtopic columns to GB dataset")

🇬🇧 Processing British Parliament (GB) - Segment Level...
Available columns in GB dataset:
['Text_ID', 'ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Topic', 'Text', 'Word_Count', 'Speech_Embeddings', 'Segment_ID', 'segment_embeddings_english']

🔍 Running BERTopic for GB (english) - Segment Level
   Text column: Text
   Segment ID column: Segment_ID
   Embedding column: segment_embeddings_english
   Min cluster size: 20

🔍 Validating GB dataset...
  • Missing text: 0
  • Missing segment IDs: 0
  • Missing embeddings: 0
  • Missing text: 0
  • Missing segment IDs: 0
  • Missing embeddings: 0
  • Text length stats: min=2, mean=1079, max=155937
  • Segments: 33381 unique, avg 20.1 rows per segment
📝 Grouping text by segments...
  • Text length st

2025-10-02 22:33:24,187 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-02 22:34:15,277 - BERTopic - Dimensionality - Completed ✓
2025-10-02 22:34:15,285 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 22:34:15,277 - BERTopic - Dimensionality - Completed ✓
2025-10-02 22:34:15,285 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 22:36:03,111 - BERTopic - Cluster - Completed ✓
2025-10-02 22:36:03,111 - BERTopic - Cluster - Completed ✓
2025-10-02 22:36:03,151 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 22:36:03,151 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 22:37:32,887 - BERTopic - Representation - Completed ✓
2025-10-02 22:37:32,887 - BERTopic - Representation - Completed ✓


✅ Topic modeling completed!
   Found 213 topics (including noise: True)
   Topic distribution: count      213.000000
mean       156.718310
std        768.631629
min         20.000000
25%         36.000000
50%         56.000000
75%         99.000000
max      11027.000000
Name: Count, dtype: float64

🤖 Processing topics with OpenAI API...
Processing topic 0: ['referendum', 'voted', 'withdrawal', 'backstop', 'customs', 'negotiations', 'negotiating', 'nodeal', 'extension', 'exit']...
  → Name: Withdrawal Negotiations
  → Category: Other
  → Name: Withdrawal Negotiations
  → Category: Other
Processing topic 1: ['backbench', 'tuesday', 'monday', 'adjournment', 'thursday', 'recess', 'wednesday', 'hall', 'constituent', 'christmas']...
Processing topic 1: ['backbench', 'tuesday', 'monday', 'adjournment', 'thursday', 'recess', 'wednesday', 'hall', 'constituent', 'christmas']...
  → Name: Parliamentary Schedule Recess
  → Category: Other
  → Name: Parliamentary Schedule Recess
  → Category: Other

In [None]:
# === PROCESS AT_COMBINED DATASET (ENGLISH AND GERMAN) ===
print("\n🇦🇹 Processing Austrian Parliament (AT) - Segment Level...")

# Show the schema so the column names used below can be checked by eye
print("Available columns in AT_combined dataset:")
print(AT_combined.columns.tolist())

# Pass 1: English text
print("\n📊 AT - English processing...")
AT_processed, at_en_model, at_en_topics = run_bertopic_on_segments(
    AT_combined,
    "AT",
    "english",
    text_column="Text",
    segment_id_column="Segment_ID_english",
    embedding_column="segment_embeddings_english",
    min_cluster_size=8,
)

# Pass 2: German text, run on the output of pass 1 so both sets of result
# columns end up in the same frame.
# NOTE(review): this pass also groups by Segment_ID_english although the frame
# has a Segment_ID_german column - confirm both languages share segment ids.
print("\n📊 AT - German processing...")
AT_processed, at_de_model, at_de_topics = run_bertopic_on_segments(
    AT_processed,
    "AT",
    "german",
    text_column="Text_native_language",
    segment_id_column="Segment_ID_english",
    embedding_column="segment_embeddings_native_language",
    min_cluster_size=8,
)

# Copy each language's result columns to the required names, or fill in
# defaults when that language's run did not produce any.
for at_language, at_suffix in (("english", "en"), ("german", "native_language")):
    at_subtopic_source = f"Segment_Subtopic_AT_{at_language}"
    if at_subtopic_source in AT_processed.columns:
        AT_processed[f"my_subtopic_{at_suffix}"] = AT_processed[at_subtopic_source]
        AT_processed[f"my_topic_{at_suffix}"] = AT_processed[f"Segment_Category_AT_{at_language}"]
        print(f"✅ Added my_topic_{at_suffix} and my_subtopic_{at_suffix} columns to AT dataset")
    else:
        AT_processed[f"my_subtopic_{at_suffix}"] = "Miscellaneous"
        AT_processed[f"my_topic_{at_suffix}"] = "Other"
        print(f"⚠️ {at_language.capitalize()} topic modeling failed, added default values")


🇦🇹 Processing Austrian Parliament (AT) - Segment Level...
Available columns in AT_combined dataset:
['Text_ID', 'ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Topic', 'Text', 'Text_native_language', 'Speech_Embeddings_english', 'Speech_Embeddings_native_language', 'Segment_ID_english', 'Segment_ID_german', 'segment_embeddings_english', 'segment_embeddings_native_language']

📊 AT - English processing...

🔍 Running BERTopic for AT (english) - Segment Level
   Text column: Text
   Segment ID column: Segment_ID_english
   Embedding column: segment_embeddings_english
   Min cluster size: 8

🔍 Validating AT dataset...
  • Missing text: 0
  • Missing segment IDs: 0
  • Missing embeddings: 0
  • Text length stats: min=3, mean=1699, max=238216
  • Mi

2025-10-02 22:50:59,370 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


📊 Prepared 12529 segments for topic modeling
   Embedding shape: (12529, 1024)
🤖 Fitting BERTopic model...


2025-10-02 22:51:16,122 - BERTopic - Dimensionality - Completed ✓
2025-10-02 22:51:16,134 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 22:51:16,134 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 22:51:38,952 - BERTopic - Cluster - Completed ✓
2025-10-02 22:51:38,973 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 22:51:38,952 - BERTopic - Cluster - Completed ✓
2025-10-02 22:51:38,973 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 22:52:39,190 - BERTopic - Representation - Completed ✓
2025-10-02 22:52:39,190 - BERTopic - Representation - Completed ✓


✅ Topic modeling completed!
   Found 155 topics (including noise: True)
   Topic distribution: count     155.000000
mean       80.832258
std       293.770005
min         8.000000
25%        13.000000
50%        25.000000
75%        63.500000
max      3508.000000
Name: Count, dtype: float64

🤖 Processing topics with OpenAI API...
Processing topic 0: ['students', 'universities', 'university', 'teachers', 'schools', 'educational', 'teaching', 'science', 'teacher', 'student']...
  → Name: Educational Institutions
  → Category: Education
  → Name: Educational Institutions
  → Category: Education
Processing topic 1: ['treaty', 'enlargement', 'turkey', 'presidency', 'peace', 'accession', 'united', 'war', 'referendum', 'nations']...
Processing topic 1: ['treaty', 'enlargement', 'turkey', 'presidency', 'peace', 'accession', 'united', 'war', 'referendum', 'nations']...
  → Name: Turkey Accession Treaty
  → Category: International Affairs
  → Name: Turkey Accession Treaty
  → Category: Internatio

2025-10-02 23:01:38,320 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


📊 Prepared 12529 segments for topic modeling
   Embedding shape: (12529, 1024)
🤖 Fitting BERTopic model...


2025-10-02 23:01:58,603 - BERTopic - Dimensionality - Completed ✓
2025-10-02 23:01:58,605 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 23:01:58,605 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 23:03:11,159 - BERTopic - Cluster - Completed ✓
2025-10-02 23:03:11,170 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 23:03:11,159 - BERTopic - Cluster - Completed ✓
2025-10-02 23:03:11,170 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 23:04:54,183 - BERTopic - Representation - Completed ✓
2025-10-02 23:04:54,183 - BERTopic - Representation - Completed ✓


✅ Topic modeling completed!
   Found 240 topics (including noise: True)
   Topic distribution: count     240.000000
mean       52.204167
std       212.181072
min         8.000000
25%        13.000000
50%        20.000000
75%        42.000000
max      3202.000000
Name: Count, dtype: float64

🤖 Processing topics with OpenAI API...
Processing topic 0: ['patienten', 'rzte', 'gesundheitssystem', 'rasinger', 'arzt', 'versorgung', 'gesundheit', 'pumberger', 'sozialversicherung', 'leistungen']...
  → Name: Gesundheitsversorgung Patienten
  → Category: Health
  → Name: Gesundheitsversorgung Patienten
  → Category: Health
Processing topic 1: ['schule', 'lehrer', 'schulen', 'schler', 'lehrerinnen', 'schlerinnen', 'bildung', 'eltern', 'bildungspolitik', 'unterricht']...
Processing topic 1: ['schule', 'lehrer', 'schulen', 'schler', 'lehrerinnen', 'schlerinnen', 'bildung', 'eltern', 'bildungspolitik', 'unterricht']...
  → Name: Bildungssystem Deutschland
  → Category: Education
  → Name: Bildungssys

In [None]:
# === PROCESS HR_COMBINED DATASET (ENGLISH AND CROATIAN) ===
# Runs the segment-level BERTopic pipeline twice on the Croatian parliament data
# (English text first, then the native-language text) and then exposes the
# per-run result columns under the shorter my_topic_* / my_subtopic_* names.
print("\n🇭🇷 Processing Croatian Parliament (HR) - Segment Level...")

# Check available columns
print("Available columns in HR_combined dataset:")
print(HR_combined.columns.tolist())

# First run: English
# run_bertopic_on_segments is unpacked into three values; judging by how they
# are used further down these are (frame, fitted model, topic info table).
print("\n📊 HR - English processing...")
HR_processed, hr_en_model, hr_en_topics = run_bertopic_on_segments(
    HR_combined,
    dataset_name="HR",
    language="english",
    text_column="Text",
    segment_id_column="Segment_ID_english",
    embedding_column="segment_embeddings_english",
    min_cluster_size=20,
)

# Second run: Croatian
# The English result is fed back in so that both runs' columns end up on the
# same frame.
# NOTE(review): the printed column list also contains 'Segment_ID_croatian';
# confirm that reusing the English segment ID for the Croatian run is intended.
print("\n📊 HR - Croatian processing...")
HR_processed, hr_hr_model, hr_hr_topics = run_bertopic_on_segments(
    HR_processed,  # Use the result from English processing
    dataset_name="HR",
    language="croatian",
    text_column="Text_native_language",
    segment_id_column="Segment_ID_english",  # Same segment ID for both languages
    embedding_column="segment_embeddings_native_language",
    min_cluster_size=20,
)

# Add separate columns for both languages.
# Each entry: (subtopic source, category source, subtopic target, topic target,
# language label used in the warning message).
hr_column_map = (
    (
        "Segment_Subtopic_HR_english",
        "Segment_Category_HR_english",
        "my_subtopic_en",
        "my_topic_en",
        "English",
    ),
    (
        "Segment_Subtopic_HR_croatian",
        "Segment_Category_HR_croatian",
        "my_subtopic_native_language",
        "my_topic_native_language",
        "Croatian",
    ),
)

for subtopic_src, category_src, subtopic_dst, topic_dst, language_label in hr_column_map:
    # Require BOTH source columns: checking only the subtopic column would
    # raise a KeyError on the category lookup if a run produced partial output.
    if {subtopic_src, category_src}.issubset(HR_processed.columns):
        HR_processed[subtopic_dst] = HR_processed[subtopic_src]
        HR_processed[topic_dst] = HR_processed[category_src]
        print(f"✅ Added {topic_dst} and {subtopic_dst} columns to HR dataset")
    else:
        # Fall back to neutral placeholders so downstream cells still find the columns.
        HR_processed[subtopic_dst] = "Miscellaneous"
        HR_processed[topic_dst] = "Other"
        print(f"⚠️ {language_label} topic modeling failed, added default values")


🇭🇷 Processing Croatian Parliament (HR) - Segment Level...
Available columns in HR_combined dataset:
['Text_ID', 'ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Topic', 'Text', 'Text_native_language', 'Speech_Embeddings_english', 'Speech_Embeddings_native_language', 'Segment_ID_english', 'Segment_ID_croatian', 'segment_embeddings_english', 'segment_embeddings_native_language']

📊 HR - English processing...

🔍 Running BERTopic for HR (english) - Segment Level
   Text column: Text
   Segment ID column: Segment_ID_english
   Embedding column: segment_embeddings_english
   Min cluster size: 20

🔍 Validating HR dataset...
  • Missing text: 0
  • Missing segment IDs: 0
  • Missing embeddings: 0
  • Missing text: 0
  • Missing segment IDs: 0
  • Miss

2025-10-02 23:22:46,796 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-02 23:23:31,202 - BERTopic - Dimensionality - Completed ✓
2025-10-02 23:23:31,208 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 23:23:31,202 - BERTopic - Dimensionality - Completed ✓
2025-10-02 23:23:31,208 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 23:25:45,471 - BERTopic - Cluster - Completed ✓
2025-10-02 23:25:45,495 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 23:25:45,471 - BERTopic - Cluster - Completed ✓
2025-10-02 23:25:45,495 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 23:26:41,770 - BERTopic - Representation - Completed ✓
2025-10-02 23:26:41,770 - BERTopic - Representation - Completed ✓


✅ Topic modeling completed!
   Found 224 topics (including noise: True)
   Topic distribution: count     224.000000
mean      112.120536
std       450.086797
min        20.000000
25%        33.000000
50%        56.000000
75%        91.000000
max      6667.000000
Name: Count, dtype: float64

🤖 Processing topics with OpenAI API...
Processing topic 0: ['documentation', 'municipality', 'roads', 'port', 'hall', 'cofinancing', 'schools', 'reconstruction', 'town', 'funding']...
  → Name: Municipal Infrastructure Funding
  → Category: Transportation
  → Name: Municipal Infrastructure Funding
  → Category: Transportation
Processing topic 1: ['plenkovic', 'andrej', 'reform', 'hs', 'bih', 'growth', 'croats', 'energy', 'corruption', 'reforms']...
Processing topic 1: ['plenkovic', 'andrej', 'reform', 'hs', 'bih', 'growth', 'croats', 'energy', 'corruption', 'reforms']...
  → Name: Croatian Energy Reform
  → Category: Energy
  → Name: Croatian Energy Reform
  → Category: Energy
Processing topic 2: ['

2025-10-02 23:38:41,110 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


📊 Prepared 25115 segments for topic modeling
   Embedding shape: (25115, 1024)
🤖 Fitting BERTopic model...


2025-10-02 23:39:03,968 - BERTopic - Dimensionality - Completed ✓
2025-10-02 23:39:03,970 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 23:39:03,970 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-02 23:40:14,301 - BERTopic - Cluster - Completed ✓
2025-10-02 23:40:14,312 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 23:40:14,301 - BERTopic - Cluster - Completed ✓
2025-10-02 23:40:14,312 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-02 23:41:22,120 - BERTopic - Representation - Completed ✓
2025-10-02 23:41:22,120 - BERTopic - Representation - Completed ✓


✅ Topic modeling completed!
   Found 234 topics (including noise: True)
   Topic distribution: count     234.000000
mean      107.329060
std       372.759848
min        21.000000
25%        33.000000
50%        51.000000
75%        97.000000
max      5589.000000
Name: Count, dtype: float64

🤖 Processing topics with OpenAI API...
Processing topic 0: ['amandmanu', 'potrebnu', 'glasanje', 'amandmana', 'amandmanom', 'cesta', 'iznos', 'ceste', 'projekt', 'osigurana']...
  → Name: Amandmani Cesta
  → Category: Transportation
  → Name: Amandmani Cesta
  → Category: Transportation
Processing topic 1: ['mirovine', 'mirovina', 'mirovinu', 'umirovljenika', 'mirovinski', 'mirovinskog', 'mirovinskom', 'stup', 'mirovinsko', 'umirovljenicima']...
Processing topic 1: ['mirovine', 'mirovina', 'mirovinu', 'umirovljenika', 'mirovinski', 'mirovinskog', 'mirovinskom', 'stup', 'mirovinsko', 'umirovljenicima']...
  → Name: Mirovinski Sustav
  → Category: Social Welfare
  → Name: Mirovinski Sustav
  → Categor

In [16]:
# === SUMMARY AND VERIFICATION ===
# Prints, for every processed dataset, its shape, whether the expected topic
# columns are present, and the three most frequent top-level topics.
print("\n📋 PROCESSING SUMMARY")
print("=" * 50)

datasets = {
    "GB (British Parliament)": GB_processed,
    "AT (Austrian Parliament)": AT_processed,
    "HR (Croatian Parliament)": HR_processed,
}

# Each layout is (columns whose presence is reported,
#                 [(topic column to summarise, qualifier inserted into the message), ...]).
single_language_layout = (
    ["my_topic", "my_subtopic"],
    [("my_topic", "")],
)
dual_language_layout = (
    [
        "my_topic_en",
        "my_subtopic_en",
        "my_topic_native_language",
        "my_subtopic_native_language",
    ],
    [
        ("my_topic_en", "English "),
        ("my_topic_native_language", "native language "),
    ],
)

for name, df in datasets.items():
    print(f"\n{name}:")
    print(f"  • Shape: {df.shape}")

    # GB is single-language; every other dataset carries English + native columns.
    if name == "GB (British Parliament)":
        expected_columns, summarised_columns = single_language_layout
    else:
        expected_columns, summarised_columns = dual_language_layout

    for column in expected_columns:
        print(f"  • Has {column}: {column in df.columns}")

    for column, qualifier in summarised_columns:
        if column not in df.columns:
            continue
        frequencies = df[column].value_counts()
        print(f"  • Unique {qualifier}topics: {len(frequencies)}")
        print(f"  • Top 3 {qualifier}topics: {frequencies.head(3).to_dict()}")

print("\n✅ All datasets processed and saved with segment-level topics!")
print("✅ GB dataset has 'my_topic' and 'my_subtopic' columns")
print("✅ AT/HR datasets have separate English and native language topic columns!")


📋 PROCESSING SUMMARY

GB (British Parliament):
  • Shape: (670912, 34)
  • Has my_topic: True
  • Has my_subtopic: True
  • Unique topics: 21
  • Top 3 topics: {'Other': 283051, 'Health': 44996, 'International Affairs': 41398}

AT (Austrian Parliament):
  • Shape: (231759, 42)
  • Has my_topic_en: True
  • Has my_subtopic_en: True
  • Has my_topic_native_language: True
  • Has my_subtopic_native_language: True
  • Unique English topics: 21
  • Top 3 English topics: {'Other': 73838, 'Macroeconomics': 20827, 'Education': 15476}
  • Unique native language topics: 21
  • Top 3 native language topics: {'Other': 75318, 'Social Welfare': 18390, 'Macroeconomics': 17085}

HR (Croatian Parliament):
  • Shape: (504338, 42)
  • Has my_topic_en: True
  • Has my_subtopic_en: True
  • Has my_topic_native_language: True
  • Has my_subtopic_native_language: True
  • Unique English topics: 20
  • Top 3 English topics: {'Other': 141151, 'Government Operations': 42505, 'Macroeconomics': 35346}
  • Unique

In [None]:
# Drop the intermediate per-run Segment_Topic_* / Segment_Subtopic_* /
# Segment_Category_* columns from each processed frame.
# (For HR these values were copied into the my_topic_* / my_subtopic_* columns
# above; presumably the AT and GB cells do the same - verify before relying on it.)
#
# errors="ignore" keeps this cell idempotent: the drop is in place, so without
# it a second execution (or a run where topic modeling fell back to defaults and
# never created these columns) would raise KeyError.
intermediate_topic_columns = [
    (
        HR_processed,
        [
            'Segment_Topic_HR_english', 'Segment_Subtopic_HR_english', 'Segment_Category_HR_english',
            'Segment_Topic_HR_croatian', 'Segment_Subtopic_HR_croatian', 'Segment_Category_HR_croatian',
        ],
    ),
    (
        AT_processed,
        [
            'Segment_Topic_AT_english', 'Segment_Subtopic_AT_english', 'Segment_Category_AT_english',
            'Segment_Topic_AT_german', 'Segment_Subtopic_AT_german', 'Segment_Category_AT_german',
        ],
    ),
    (
        GB_processed,
        [
            'Segment_Topic_GB_english', 'Segment_Subtopic_GB_english', 'Segment_Category_GB_english',
        ],
    ),
]

for frame, columns_to_drop in intermediate_topic_columns:
    # inplace=True is kept so the existing *_processed objects are mutated,
    # exactly as before.
    frame.drop(columns=columns_to_drop, inplace=True, errors="ignore")

In [29]:
# Write the processed HR frame to disk as a pickle file.
hr_path_final = r"data folder\HR\HR_with_topics.pkl"
HR_processed.to_pickle(hr_path_final)
print("✅ Saved HR dataset with topics to " + hr_path_final)

✅ Saved HR dataset with topics to data folder\HR\HR_with_topics.pkl


In [32]:
# Write the processed AT frame to disk as a pickle file.
at_path_final = r"data folder\AT\AT_with_topics.pkl"
AT_processed.to_pickle(at_path_final)
print("✅ Saved AT dataset with topics to " + at_path_final)

✅ Saved AT dataset with topics to data folder\AT\AT_with_topics.pkl


In [31]:
# Write the processed GB frame to disk as a pickle file.
gb_path_final = r"data folder\GB\GB_with_topics.pkl"
GB_processed.to_pickle(gb_path_final)
print("✅ Saved GB dataset with topics to " + gb_path_final)

✅ Saved GB dataset with topics to data folder\GB\GB_with_topics.pkl


In [None]:
# === SAVE TOPIC INFO DATAFRAMES ===
# Persists the topic-info object returned by each BERTopic run (one per
# dataset/language) as its own pickle file.
print("💾 Saving topic info dataframes...")

# Output locations, one per dataset/language run.
gb_topic_info_path = r"data folder\GB\GB_topic_info.pkl"
at_en_topic_info_path = r"data folder\AT\AT_topic_info_english.pkl"
at_de_topic_info_path = r"data folder\AT\AT_topic_info_german.pkl"
hr_en_topic_info_path = r"data folder\HR\HR_topic_info_english.pkl"
hr_hr_topic_info_path = r"data folder\HR\HR_topic_info_croatian.pkl"


def _save_topic_info(topic_info, path, label):
    """Pickle one topic-info object to ``path`` and print the outcome.

    Args:
        topic_info: Object to persist. ``None`` is treated as "this run
            produced no topic info" and is skipped with a warning. Any other
            value must expose ``.shape`` (it is printed after saving).
        path: Destination file path passed to ``pd.to_pickle``.
        label: Dataset/language label inserted into the printed messages,
            e.g. ``"AT German"``.

    Returns:
        bool: ``True`` if the file was written, ``False`` if it was skipped.
    """
    if topic_info is None:
        print(f"⚠️ {label} topic info not available")
        return False

    pd.to_pickle(topic_info, path)
    print(f"✅ Saved {label} topic info to {path}")
    print(f"   {label} topics shape: {topic_info.shape}")
    return True


# Calls are kept as separate statements (rather than a pre-built list) so each
# *_topics name is only looked up when its turn comes, matching the original
# save order.

# Save GB topic info
_save_topic_info(gb_topics, gb_topic_info_path, "GB")

# Save AT topic info (English and German)
_save_topic_info(at_en_topics, at_en_topic_info_path, "AT English")
_save_topic_info(at_de_topics, at_de_topic_info_path, "AT German")

# Save HR topic info (English and Croatian)
_save_topic_info(hr_en_topics, hr_en_topic_info_path, "HR English")
_save_topic_info(hr_hr_topics, hr_hr_topic_info_path, "HR Croatian")

print("\n📊 Topic info summary:")
print("   These files contain detailed topic information including:")
print("   - Topic IDs and counts")
print("   - Representative words for each topic")
print("   - Topic names and categories")