In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

pd.options.display.max_columns = None

# Load the pre-segmented data (embeddings included).
# AT and HR have their 'Segment_ID' column removed right after loading;
# GB is used exactly as stored.
AT_combined = pd.read_pickle(r"data folder\AT\AT_final.pkl").drop(columns=['Segment_ID'])
HR_combined = pd.read_pickle(r"data folder\HR\HR_final.pkl").drop(columns=['Segment_ID'])
GB = pd.read_pickle(r"data folder\GB\GB_final.pkl")

# Report each frame's (rows, columns) shape as a quick sanity check.
print("✅ Loaded data:")
print(
    f"   • AT: {AT_combined.shape}",
    f"   • HR: {HR_combined.shape}",
    f"   • GB: {GB.shape}",
    sep="\n",
)

✅ Loaded data:
   • AT: (231759, 32)
   • HR: (504338, 32)
   • GB: (670912, 29)


In [2]:
# === BERTOPIC SETUP AND CONFIGURATION ===
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from dotenv import load_dotenv
import time

# Load variables from a local .env file into the process environment.
# NOTE(review): OpenAI() is constructed later without an explicit key, so it
# presumably picks its API key up from the environment populated here — confirm.
load_dotenv()

# Target taxonomy: 21 policy categories + "Other" (non-policy content) + "Mix"
# (multi-domain / uncertain) = 23 entries. Keys are the exact labels the
# classifier must return; values are the descriptions pasted into the LLM prompt.
# Text fixes relative to the earlier version (all of this text reaches the prompt):
#   - "Housing" no longer repeats its clause list twice,
#   - "Agriculture" no longer starts with a stray space,
#   - "Macroeconomics" has the missing space after "economic policy,".
label_dict = {
    "Education": "Issues related to educational policies, primary and secondary schools, student loans and education finance, the regulation of colleges and universities, school reforms, teachers, vocational training, evening schools, safety in schools, efforts to improve educational standards, and issues related to libraries, dictionaries, teaching material, research in education",
    "Technology": "Issues related to science and technology transfer and international science cooperation, research policy, government space programs and space exploration, telephones and telecommunication regulation, broadcast media (television, radio, newspapers, films), weather forecasting, geological surveys, computer industry, cyber security.",
    "Health": "Issues related to health care, health care reforms, health insurance, drug industry, medical facilities, medical workers, disease prevention, treatment, and health promotion, drug and alcohol abuse, mental health, research in medicine, medical liability and unfair medical practices.",
    "Environment": "Issues related to environmental policy, drinking water safety, all kinds of pollution (air, noise, soil), waste disposal, recycling, climate change, outdoor environmental hazards (e.g., asbestos), species and forest protection, marine and freshwater environment, hunting, regulation of laboratory or performance animals, land and water resource conservation, research in environmental technology.",
    "Housing": "Issues related to housing, urban affairs and community development, housing market, property tax, spatial planning, rural development, location permits, construction inspection, illegal construction, industrial and commercial building issues, national housing policy, housing for low-income individuals, rental housing, housing for the elderly, e.g., nursing homes, housing for the homeless and efforts to reduce homelessness, research related to housing.",
    "Labor": "Issues related to labor, employment, employment programs, employee benefits, pensions and retirement accounts, minimum wage, labor law, job training, labor unions, worker safety and protection, youth employment and seasonal workers.",
    "Defense": "Issues related to defense policy, military intelligence, espionage, weapons, military personnel, reserve forces, military buildings, military courts, nuclear weapons, civil defense, including firefighters and mountain rescue services, homeland security, military aid or arms sales to other countries, prisoners of war and collateral damage to civilian populations, military nuclear and hazardous waste disposal and military environmental compliance, defense alliances and agreements, direct foreign military operations, claims against military, defense research.",
    "Government Operations": "Issues related to general government operations, the work of multiple departments, public employees, postal services, nominations and appointments, national mints, medals, and commemorative coins, management of government property, government procurement and contractors, public scandal and impeachment, claims against the government, the state inspectorate and audit, anti-corruption policies, regulation of political campaigns, political advertising and voter registration, census and statistics collection by government; issues related to local government, capital city and municipalities, including decentralization; issues related to national holidays.",
    "Social Welfare": "Issues related to social welfare policy, the Ministry of Social Affairs, social services, poverty assistance for low-income families and for the elderly, parental leave and child care, assistance for people with physical or mental disabilities, including early retirement pension, discounts on public services, volunteer associations (e.g., Red Cross), charities, and youth organizations.",
    "Macroeconomics": "Issues related to domestic macroeconomic policy, such as the state and prospect of the national economy, economic policy, inflation, interest rates, monetary policy, cost of living, unemployment rate, national budget, public debt, price control, tax enforcement, industrial revitalization and growth.",
    "Domestic Commerce": "Issues related to banking, finance and internal commerce, including stock exchange, investments, consumer finance, mortgages, credit cards, insurance availability and cost, accounting regulation, personal, commercial, and municipal bankruptcies, programs to promote small businesses, copyrights and patents, intellectual property, natural disaster preparedness and relief, consumer safety; regulation and promotion of tourism, sports, gambling, and personal fitness; domestic commerce research.",
    "Civil Rights": "Issues related to civil rights and minority rights, discrimination towards races, gender, sexual orientation, handicap, and other minorities, voting rights, freedom of speech, religious freedoms, privacy rights, protection of personal data, abortion rights, anti-government activity groups (e.g., local insurgency groups), religion and the Church.",
    "International Affairs": "Issues related to international affairs, foreign policy and relations to other countries, issues related to the Ministry of Foreign Affairs, foreign aid, international agreements (such as Kyoto agreement on the environment, the Schengen agreement), international organizations (including United Nations, UNESCO, International Olympic Committee, International Criminal Court), NGOs, issues related to diplomacy, embassies, citizens abroad; issues related to border control; issues related to international finance, including the World Bank and International Monetary Fund, the financial situation of the EU; issues related to a foreign country that do not impact the home country; issues related to human rights in other countries, international terrorism.",
    "Transportation": "Issues related to mass transportation construction and regulation, bus transport, regulation related to motor vehicles, road construction, maintenance and safety, parking facilities, traffic accidents statistics, air travel, rail travel, rail freight, maritime transportation, inland waterways and channels, transportation research and development.",
    "Immigration": "Issues related to immigration, refugees, and citizenship, integration issues, regulation of residence permits, asylum applications; criminal offences and diseases caused by immigration.",
    "Law and Crime": "Issues related to the control, prevention, and impact of crime; all law enforcement agencies, including border and customs, police, court system, prison system; terrorism, white collar crime, counterfeiting and fraud, cyber-crime, drug trafficking, domestic violence, child welfare, family law, juvenile crime.",
    "Agriculture": "Issues related to agriculture policy, fishing, agricultural foreign trade, food marketing, subsidies to farmers, food inspection and safety, animal and crop disease, pest control and pesticide regulation, welfare for animals in farms, pets, veterinary medicine, agricultural research.",
    "Foreign Trade": "Issues related to foreign trade, trade negotiations, free trade agreements, import regulation, export promotion and regulation, subsidies, private business investment and corporate development, competitiveness, exchange rates, the strength of national currency in comparison to other currencies, foreign investment and sales of companies abroad.",
    "Culture": "Issues related to cultural policies, Ministry of Culture, public spending on culture, cultural employees, issues related to support of theatres and artists; allocation of funds from the national lottery, issues related to cultural heritage.",
    "Public Lands": "Issues related to national parks, memorials, historic sites, and protected areas, including the management and staffing of cultural sites; museums; use of public lands and forests, establishment and management of harbors and marinas; issues related to flood control, forest fires, livestock grazing.",
    "Energy": "Issues related to energy policy, electricity, regulation of electrical utilities, nuclear energy and disposal of nuclear waste, natural gas and oil, drilling, oil spills, oil and gas prices, heat supply, shortages and gasoline regulation, coal production, alternative and renewable energy, energy conservation and energy efficiency, energy research.",
    "Other": "Other topics not mentioning policy agendas, including the procedures of parliamentary meetings, e.g., points of order, voting procedures, meeting logistics; interpersonal speech, e.g., greetings, personal stories, tributes, interjections, arguments between the members; rhetorical speech, e.g., jokes, literary references.",
    "Mix": "Use this category when the topic clearly spans multiple policy areas or when there is significant uncertainty about which single category best fits the topic. This is for topics that genuinely combine elements from 2-3 different categories in a meaningful way, making it difficult to assign to just one category with high confidence."
}

print(f"🎯 Target categories: {len(label_dict)} topics (including Mix)")

🎯 Target categories: 23 topics (including Mix)


In [3]:
# === STOPWORDS CONFIGURATION ===

# Custom stopword vocabularies, one per language. Each is written as a block of
# whitespace-separated words and split into a list, so order (and any repeated
# entry) is preserved exactly as typed.

# English: honorifics, parliamentary roles/procedure, filler verbs and a few
# Austria-specific tokens (e.g. 'schilling', 'abg', 'orf').
english_custom_stopwords = """
    mr mrs ms dr madam honorable honourable member members vp sp fp
    minister speaker deputy president chairman chair schilling
    secretary lord lady question order point debate motion amendment
    congratulations congratulate thanks thank say one want know think
    believe see go come give take people federal government austria
    austrian committee call said already please request proceed reading
    course welcome council open written contain items item yes no
    following next speech year years state also would like may must
    upon indeed session meeting report commission behalf gentleman gentlemen
    ladies applause group colleague colleagues issue issues chancellor court
    ask answer reply regard regarding regards respect respectfully sign
    shall procedure declare hear minutes speaking close abg mag orf wait
""".split()

# German: common function words plus parliamentary vocabulary.
german_custom_stopwords = """
    der die das und in zu den mit von für
    auf ist im sich eine sie dem nicht ein als
    auch es an werden aus er hat dass wir ich
    haben sind kann sehr meine muss doch wenn sein
    dann weil bei nach so oder aber vor über noch
    nur wie war waren wird wurde wurden ihr ihre
    ihren seiner seine seinem seinen dieser diese dieses
    durch ohne gegen unter zwischen während bis seit
    danke bitte gern abgeordnete abgeordneten bundesregierung
    bundeskanzler nationalrat bundesrat parlament fraktion
    ausschuss sitzung präsident vizepräsident minister
    staatssekretär klubobmann antrag anfrage interpellation
    dringliche aktuelle stunde debatte abstimmung beschluss
    gesetz novelle verordnung regierungsvorlage initiativantrag
    danke dankeschön geschätzte kolleginnen kollegen hohes
""".split()

# Croatian: common function words plus parliamentary vocabulary.
croatian_custom_stopwords = """
    a ako ali bi bih bila bili bilo bio bismo
    biste biti bumo da do duž ga hoće hoćemo
    hoćete hoćeš hoću i iako ih ili iz ja je
    jedna jedne jedno jer jesam jesi jesmo jest
    jeste jesu jim joj još ju kada kako kao
    koja koje koji kojima koju kroz li me mene
    meni mi mimo moj moja moje mu na nad nakon
    nam nama nas naš naša naše našeg ne nego
    neka neki nekog neku nema netko neće nećemo
    nećete nećeš neću nešto ni nije nikoga nikoje
    nikoju nisam nisi nismo niste nisu njega njegov
    njegova njegovo njemu njezin njezina njezino njih
    njihov njihova njihovo njim njima njoj nju no
    o od odmah on ona oni ono ova pa pak
    po pod pored prije s sa sam samo se sebe
    sebi si smo ste su sve svi svog svoj svoja
    svoje svom ta tada taj tako te tebe tebi
    ti to toj tome tu tvoj tvoja tvoje u uz
    vam vama vas vaš vaša vaše već vi vrlo za
    zar će ćemo ćete ćeš ću što zastupnik zastupnica
    zastupnici hvala sabor hrvatska vlada molim gospodin
    gospođa premijer predsjednik predsjednica ministar ministrica
    državni tajnik tajnica odbor sjednica rasprava prijedlog
    zakon odluka glasovanje amandman interpelacija pitanje
    odgovor klupski obnašatelj dužnosti potpredsjednik
    potpredsjednica kolegice kolege dame gospodo poštovani poštovana
""".split()

# Build the final, de-duplicated stopword list for each language:
#   German   = NLTK's German list + custom list
#   Croatian = custom list only
#   English  = scikit-learn's ENGLISH_STOP_WORDS + custom list
german_nltk_stopwords = stopwords.words('german')
all_german_stopwords = list({*german_nltk_stopwords, *german_custom_stopwords})
all_croatian_stopwords = list({*croatian_custom_stopwords})
all_english_stopwords = list({*ENGLISH_STOP_WORDS, *english_custom_stopwords})

# Report the number of unique stopwords per language.
print("📚 Stopwords configured:")
print(
    f"   • English: {len(all_english_stopwords)} words",
    f"   • German: {len(all_german_stopwords)} words",
    f"   • Croatian: {len(all_croatian_stopwords)} words",
    sep="\n",
)

📚 Stopwords configured:
   • English: 417 words
   • German: 272 words
   • Croatian: 219 words


In [6]:
# === OPTIMIZED TOPIC MODELING FUNCTIONS ===
from openai import OpenAI

_OPENAI_CLIENT = None  # shared client, created on first use (see _get_openai_client)


def _get_openai_client():
    """Return one shared OpenAI client instead of constructing a new one per topic."""
    global _OPENAI_CLIENT
    if _OPENAI_CLIENT is None:
        _OPENAI_CLIENT = OpenAI()
    return _OPENAI_CLIENT


def _parse_classification_response(response_text, topic_id=-1):
    """Turn the model's REASONING/CATEGORY/CONFIDENCE reply into (category, confidence)."""
    canonical_categories = {name.lower(): name for name in label_dict}
    raw_category = None
    confidence = "LOW"

    for raw_line in response_text.splitlines():
        # Tolerate indentation, list/markdown markers ("**CATEGORY:** Health")
        # and case differences in the field names.
        key, separator, value = raw_line.strip().lstrip('*#->• ').partition(':')
        if not separator:
            continue
        key = key.strip(' *').upper()
        value = value.strip().strip('*"\'`[]').strip().rstrip('.').strip()
        if key == 'CATEGORY':
            raw_category = value
        elif key == 'CONFIDENCE':
            # Keep only the level itself if the model appends an explanation.
            confidence = value.upper().split()[0] if value else "LOW"

    category = canonical_categories.get((raw_category or '').lower())
    if category is None:
        # Missing or unknown label: it cannot be trusted, so it is handled like a
        # low-confidence answer (which this function always reports as "Other").
        print(f"⚠️ Warning: OpenAI returned '{raw_category}' for topic {topic_id}. "
              f"Treating as low confidence → 'Other'")
        return "Other", "LOW"

    if confidence not in ("HIGH", "MEDIUM", "LOW"):
        confidence = "LOW"

    # Confidence filtering: low-confidence classifications are reported as "Other".
    if confidence == "LOW":
        return "Other", "LOW"

    return category, confidence


def classify_topic_enhanced(topic_words, topic_id=-1):
    """Classify a topic's keywords into one `label_dict` category via OpenAI.

    The model is asked to reason first and then answer with CATEGORY and
    CONFIDENCE lines; the reply is parsed leniently (case, whitespace and
    markdown emphasis are ignored).

    Args:
        topic_words: Sequence of topic keywords, most representative first.
            Only the first 20 are used; empty strings are skipped.
        topic_id: Topic identifier, used only in warning messages.

    Returns:
        Tuple ``(category, confidence)``. ``category`` is a key of
        ``label_dict``; ``confidence`` is "HIGH", "MEDIUM" or "LOW".
        A LOW-confidence, missing or unrecognised answer is returned as
        ``("Other", "LOW")``.

    Raises:
        Exceptions raised by the OpenAI client are not caught here and
        propagate to the caller.
    """
    keywords = [word for word in topic_words[:20] if word]  # Use top 20 words
    if not keywords:
        # Nothing to classify; skip the API call.
        return "Other", "LOW"

    keywords_str = ', '.join(keywords)
    categories_detailed = '\n'.join([f"• {cat}: {desc}" for cat, desc in label_dict.items()])

    prompt = f"""Analyze these parliamentary debate keywords and classify into ONE category.

KEYWORDS: {keywords_str}

CATEGORIES:
{categories_detailed}

Instructions:
1. Think step-by-step about what policy domain these keywords represent
2. Consider which government ministry would handle these issues
3. Look for domain-specific terminology (e.g., "medical" → Health, "school/university" → Education)
4. If keywords clearly span multiple domains or you're uncertain, use "Mix"
5. Use "Other" only for procedural/non-policy content

First reason through your decision, then provide your final classification.

Format:
REASONING: [your step-by-step analysis]
CATEGORY: [exact category name]
CONFIDENCE: [HIGH/MEDIUM/LOW]"""

    response = _get_openai_client().chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an expert parliamentary policy classifier. Always reason through your decision first, then provide category and confidence level."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.02,
        # The reply starts with free-form reasoning; the limit must leave room
        # for the CATEGORY/CONFIDENCE lines that follow it, otherwise a truncated
        # reply is silently classified as "Other".
        max_tokens=400
    )

    # `content` can be None (e.g. refused/empty completion); treat it as an empty reply.
    response_text = (response.choices[0].message.content or "").strip()

    return _parse_classification_response(response_text, topic_id)

def _fold_accents(words):
    """Lowercase `words` and strip combining accents (NFKD), returning a sorted unique list.

    The vectorizer below is configured with ``strip_accents='unicode'``, so the
    tokens it compares against the stop-word list have already lost their
    accents. Stop words must be folded the same way or entries such as 'für',
    'über' or 'što' would never match.
    """
    import unicodedata

    folded = set()
    for word in words:
        decomposed = unicodedata.normalize('NFKD', word.lower())
        folded.add(''.join(ch for ch in decomposed if not unicodedata.combining(ch)))
    return sorted(folded)


def run_bertopic_optimized(df, dataset_name, language, text_column, segment_id_column, embedding_column, min_cluster_size=6):
    """Cluster segments with BERTopic and label every topic with a policy category.

    Rows of `df` are grouped by `segment_id_column`; each group's texts are
    joined into one document and the group's first embedding is used as that
    document's precomputed embedding. Topics are then classified with
    `classify_topic_enhanced` and the result is merged back onto `df`.

    Args:
        df: Input DataFrame containing the three columns named below.
        dataset_name: Short dataset tag (e.g. "GB"); used in log messages and
            in the names of the added columns.
        language: "english", "german" or "croatian"; selects the stop-word
            list (any other value falls back to English) and is used in the
            names of the added columns.
        text_column: Column holding the text of each row.
        segment_id_column: Column identifying the segment a row belongs to.
        embedding_column: Column holding the segment embedding of each row.
        min_cluster_size: HDBSCAN ``min_cluster_size``.

    Returns:
        Tuple ``(df_result, topic_model, topic_info)``:
        `df_result` is `df` left-joined with three new columns
        ``Segment_Topic_*``, ``Segment_Category_*`` and
        ``Segment_Classification_Confidence_*`` (suffix
        ``{dataset_name}_{language}``); `topic_model` is the fitted BERTopic
        model; `topic_info` is BERTopic's topic table with ``Category`` and
        ``Classification_Confidence`` columns placed right after ``Topic``.

    Raises:
        KeyError: If any of the three required columns is missing from `df`.
    """
    print(f"\n🔍 Running Optimized BERTopic for {dataset_name} ({language})")
    print(f"   Min cluster size: {min_cluster_size}")

    # Fail early with a readable message instead of deep inside groupby/agg.
    missing_columns = [col for col in (text_column, segment_id_column, embedding_column) if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{dataset_name}: missing required column(s) {missing_columns}")

    # Group by segment and aggregate text
    print("📝 Grouping text by segments...")
    grouped_data = df.groupby(segment_id_column).agg({
        # Skip missing values so a single NaN text does not abort the whole run.
        text_column: lambda parts: ' '.join(str(part) for part in parts if pd.notna(part)),
        embedding_column: 'first'
    }).reset_index()

    documents = grouped_data[text_column].tolist()
    embeddings = np.array(grouped_data[embedding_column].tolist(), dtype=np.float32)
    segment_ids = grouped_data[segment_id_column].tolist()

    print(f"📊 Prepared {len(documents)} segments for topic modeling")

    # Language-specific stopwords, accent-folded to match strip_accents='unicode' below
    stopwords_list = _fold_accents({
        "english": all_english_stopwords,
        "german": all_german_stopwords,
        "croatian": all_croatian_stopwords
    }.get(language, all_english_stopwords))

    # Optimized components for better clustering
    vectorizer_model = CountVectorizer(
        stop_words=stopwords_list,
        ngram_range=(1, 3),      # Include phrases
        min_df=5,                # Higher threshold
        max_df=0.7,              # Remove very common terms
        max_features=15000,      # Larger vocabulary
        lowercase=True,
        strip_accents='unicode'
    )

    umap_model = UMAP(
        n_neighbors=20,          # Larger neighborhood
        n_components=15,         # More dimensions
        min_dist=0.1,
        metric='cosine',
        random_state=42,         # Fixed seed for reproducible reductions
        low_memory=True
    )

    # HDBSCAN is configured without prediction_data (left out, per the original
    # note, to avoid a conflict).
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=1,
        metric='euclidean',
        cluster_selection_method='eom',
        cluster_selection_epsilon=0.02,
        allow_single_cluster=False
    )

    # embedding_model=None because precomputed embeddings are passed to fit_transform.
    topic_model = BERTopic(
        top_n_words=20,          # Extract 20 words for classification
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        verbose=True,
        calculate_probabilities=False,  # Disabled to avoid conflict
        embedding_model=None
    )

    print("🤖 Fitting optimized BERTopic model...")
    topics, _ = topic_model.fit_transform(documents, embeddings)

    # Statistics (topic -1 is the outlier bucket and is not counted as a topic)
    topics_array = np.array(topics)
    outlier_count = (topics_array == -1).sum()
    outlier_percentage = (outlier_count / len(topics_array)) * 100
    topic_count = len(set(topics) - {-1})

    print(f"✅ Topic modeling completed!")
    print(f"   Found {topic_count} topics (outliers: {outlier_count}/{len(topics_array)} = {outlier_percentage:.1f}%)")

    # Topic processing - only classification, no naming
    topic_info = topic_model.get_topic_info()

    print("🤖 Classifying topics with enhanced OpenAI...")
    topic_categories = {}
    topic_confidences = {}

    for topic_id in topic_info['Topic']:
        topic_id = int(topic_id)  # plain int keys, matching the ints in `topics`

        if topic_id == -1:
            # Outliers are not sent to the classifier.
            topic_categories[topic_id] = "Other"
            topic_confidences[topic_id] = "HIGH"
            print(f"   Topic {topic_id}: [OUTLIERS] → Other")
            continue

        topic_words = [word for word, _ in topic_model.get_topic(topic_id)]

        # Enhanced classification with confidence filtering
        category, confidence = classify_topic_enhanced(topic_words, topic_id)
        topic_categories[topic_id] = category
        topic_confidences[topic_id] = confidence

        print(f"   Topic {topic_id}: → {category} ({confidence})")
        time.sleep(0.3)  # brief pause between API calls

    # Enhanced topic_info
    topic_info['Category'] = topic_info['Topic'].map(topic_categories)
    topic_info['Classification_Confidence'] = topic_info['Topic'].map(topic_confidences)

    # Reorder columns: Topic, Category, Classification_Confidence first
    leading_cols = ['Topic', 'Category', 'Classification_Confidence']
    topic_info = topic_info[leading_cols + [col for col in topic_info.columns if col not in leading_cols]]

    # Create segment mapping (without probabilities since we disabled them)
    segment_topics = pd.DataFrame({
        segment_id_column: segment_ids,
        f'Segment_Topic_{dataset_name}_{language}': topics,
        f'Segment_Category_{dataset_name}_{language}': [topic_categories.get(t, "Other") for t in topics],
        f'Segment_Classification_Confidence_{dataset_name}_{language}': [topic_confidences.get(t, "LOW") for t in topics]
    })

    # Merge back onto the row-level frame (every row of a segment gets its segment's labels)
    df_result = df.merge(segment_topics, on=segment_id_column, how='left')

    return df_result, topic_model, topic_info

print("🔧 Optimized topic modeling functions configured")

🔧 Optimized topic modeling functions configured


# Topic Modeling Execution

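The cells below run `run_bertopic_optimized` on each dataset in turn (GB, then AT, then HR), label the resulting topics with `classify_topic_enhanced`, and save each result to disk immediately after its run.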

In [None]:
# === PROCESS ALL DATASETS WITH OPTIMIZED APPROACH ===
print("🚀 Starting optimized topic modeling for all datasets...")

# === GB DATASET (English only) ===
print("\n🇬🇧 British Parliament")
GB_processed, gb_model, gb_topics = run_bertopic_optimized(
    df=GB,
    dataset_name="GB",
    language="english",
    text_column="Text",
    segment_id_column="Segment_ID",
    embedding_column="segment_embeddings_english",
    min_cluster_size=5,
)
# Expose the per-segment category under the shorter name 'my_topic'.
GB_processed = GB_processed.rename(columns={'Segment_Category_GB_english': 'my_topic'})

# Write the labelled frame and the topic table to disk right away.
print("💾 Saving GB dataset...")
GB_processed.to_pickle(r"data folder\GB\GB_with_topics.pkl")
gb_topics.to_pickle(r"data folder\GB\GB_topic_info.pkl")
print(f"✅ Saved GB: {GB_processed.shape} segments → data folder\\GB\\GB_with_topics.pkl")
print(f"✅ Saved GB topics: {gb_topics.shape} → data folder\\GB\\GB_topic_info.pkl")

🚀 Starting optimized topic modeling for all datasets...

🇬🇧 British Parliament

🔍 Running Optimized BERTopic for GB (english)
   Min cluster size: 5
📝 Grouping text by segments...


2025-10-07 13:27:39,743 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


📊 Prepared 33381 segments for topic modeling
🤖 Fitting optimized BERTopic model...


2025-10-07 13:29:10,607 - BERTopic - Dimensionality - Completed ✓
2025-10-07 13:29:10,611 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-07 13:29:10,611 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-07 13:29:13,682 - BERTopic - Cluster - Completed ✓
2025-10-07 13:29:13,717 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-07 13:29:13,682 - BERTopic - Cluster - Completed ✓
2025-10-07 13:29:13,717 - BERTopic - Representation - Fine-tuning topics using representation models.


In [None]:
# === AT DATASET (English + German) ===
# First pass: English text with English segment embeddings.
print("\n🇦🇹 Austrian Parliament - English")
AT_processed, at_en_model, at_en_topics = run_bertopic_optimized(
    df=AT_combined,
    dataset_name="AT",
    language="english",
    text_column="Text",
    segment_id_column="Segment_ID_english",
    embedding_column="segment_embeddings_english",
    min_cluster_size=4,
)

# Write the English topic table to disk before starting the second run.
print("💾 Saving AT English topics...")
at_en_topics.to_pickle(r"data folder\AT\AT_topic_info_english.pkl")
print(f"✅ Saved AT English topics: {at_en_topics.shape} → data folder\\AT\\AT_topic_info_english.pkl")

# Second pass: native-language text and embeddings, run on the frame that already
# carries the English topic columns and grouped by the same segment ids.
print("\n🇦🇹 Austrian Parliament - German")
AT_processed, at_de_model, at_de_topics = run_bertopic_optimized(
    df=AT_processed,
    dataset_name="AT",
    language="german",
    text_column="Text_native_language",
    segment_id_column="Segment_ID_english",
    embedding_column="segment_embeddings_native_language",
    min_cluster_size=4,
)

# Give both category columns their final names.
at_column_names = {
    'Segment_Category_AT_english': 'my_topic_en',
    'Segment_Category_AT_german': 'my_topic_native_language',
}
AT_processed = AT_processed.rename(columns=at_column_names)
del at_column_names

# Write the fully labelled frame and the German topic table to disk.
print("💾 Saving AT complete dataset...")
AT_processed.to_pickle(r"data folder\AT\AT_with_topics.pkl")
at_de_topics.to_pickle(r"data folder\AT\AT_topic_info_german.pkl")
print(f"✅ Saved AT complete: {AT_processed.shape} segments → data folder\\AT\\AT_with_topics.pkl")
print(f"✅ Saved AT German topics: {at_de_topics.shape} → data folder\\AT\\AT_topic_info_german.pkl")

In [None]:
# === HR DATASET (English + Croatian) ===
# First pass: English text with English segment embeddings.
print("\n🇭🇷 Croatian Parliament - English")
HR_processed, hr_en_model, hr_en_topics = run_bertopic_optimized(
    df=HR_combined,
    dataset_name="HR",
    language="english",
    text_column="Text",
    segment_id_column="Segment_ID_english",
    embedding_column="segment_embeddings_english",
    min_cluster_size=5,
)

# Write the English topic table to disk before starting the second run.
print("💾 Saving HR English topics...")
hr_en_topics.to_pickle(r"data folder\HR\HR_topic_info_english.pkl")
print(f"✅ Saved HR English topics: {hr_en_topics.shape} → data folder\\HR\\HR_topic_info_english.pkl")

# Second pass: native-language text and embeddings, run on the frame that already
# carries the English topic columns and grouped by the same segment ids.
print("\n🇭🇷 Croatian Parliament - Croatian")
HR_processed, hr_hr_model, hr_hr_topics = run_bertopic_optimized(
    df=HR_processed,
    dataset_name="HR",
    language="croatian",
    text_column="Text_native_language",
    segment_id_column="Segment_ID_english",
    embedding_column="segment_embeddings_native_language",
    min_cluster_size=5,
)

# Give both category columns their final names.
hr_column_names = {
    'Segment_Category_HR_english': 'my_topic_en',
    'Segment_Category_HR_croatian': 'my_topic_native_language',
}
HR_processed = HR_processed.rename(columns=hr_column_names)
del hr_column_names

# Write the fully labelled frame and the Croatian topic table to disk.
print("💾 Saving HR complete dataset...")
HR_processed.to_pickle(r"data folder\HR\HR_with_topics.pkl")
hr_hr_topics.to_pickle(r"data folder\HR\HR_topic_info_croatian.pkl")
print(f"✅ Saved HR complete: {HR_processed.shape} segments → data folder\\HR\\HR_with_topics.pkl")
print(f"✅ Saved HR Croatian topics: {hr_hr_topics.shape} → data folder\\HR\\HR_topic_info_croatian.pkl")

print("\n🎉 All optimized topic modeling completed!")

In [None]:
# === FINAL SUMMARY ===
# Row counts of the three labelled frames produced by the cells above.
print("📊 Processing Summary:")
print(
    f"   • GB: {len(GB_processed):,} segments with topics ✅ SAVED",
    f"   • AT: {len(AT_processed):,} segments with topics (EN + DE) ✅ SAVED",
    f"   • HR: {len(HR_processed):,} segments with topics (EN + HR) ✅ SAVED",
    "   • Total: 5 topic info files saved ✅",
    sep="\n",
)

# For each category column that exists: number of distinct categories and the
# three most frequent ones with their row counts.
print("\n📈 Topic Distribution:")

# GB
if 'my_topic' in GB_processed:
    gb_dist = GB_processed['my_topic'].value_counts()
    print(f"\n🇬🇧 GB: {gb_dist.size} categories, top: {gb_dist.iloc[:3].to_dict()}")

# AT (English, then German)
if 'my_topic_en' in AT_processed:
    at_en_dist = AT_processed['my_topic_en'].value_counts()
    print(f"🇦🇹 AT (EN): {at_en_dist.size} categories, top: {at_en_dist.iloc[:3].to_dict()}")

if 'my_topic_native_language' in AT_processed:
    at_de_dist = AT_processed['my_topic_native_language'].value_counts()
    print(f"🇦🇹 AT (DE): {at_de_dist.size} categories, top: {at_de_dist.iloc[:3].to_dict()}")

# HR (English, then Croatian)
if 'my_topic_en' in HR_processed:
    hr_en_dist = HR_processed['my_topic_en'].value_counts()
    print(f"🇭🇷 HR (EN): {hr_en_dist.size} categories, top: {hr_en_dist.iloc[:3].to_dict()}")

if 'my_topic_native_language' in HR_processed:
    hr_hr_dist = HR_processed['my_topic_native_language'].value_counts()
    print(f"🇭🇷 HR (HR): {hr_hr_dist.size} categories, top: {hr_hr_dist.iloc[:3].to_dict()}")

print("\n🎯 Next Steps:")
print(
    "1. Calculate F1 scores and compare with benchmark (0.75)",
    "2. Analyze cross-language consistency",
    "3. Fine-tune parameters if needed",
    "4. Use visualization.ipynb for detailed analysis",
    sep="\n",
)