In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

# Load the data with embeddings
embeddings = pd.read_pickle(r"data folder\data\AT_with_embeddings_final.pkl")

print(f"✅ Loaded data: {embeddings.shape}")
print(f"Columns: {list(embeddings.columns)}")

✅ Loaded data: (231752, 30)
Columns: ['Sitting_ID', 'Speech_ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Text', 'Word_Count', 'Is_Too_Short', 'Is_Filtered', 'Speech_Embeddings', 'Segment_ID', 'Segment_Embeddings']


In [2]:
# === DATA OVERVIEW ===
print("📊 Data Overview:")
print(f"  • Total speeches: {embeddings.shape[0]:,}")
print(f"  • Speech embedding shape: {embeddings['Speech_Embeddings'][0].shape}")
print(f"  • Segment embedding shape: {embeddings['Segment_Embeddings'][0].shape}")
print(f"  • Unique segments: {embeddings['Segment_ID'].nunique():,}")
print(f"  • Average speeches per segment: {embeddings.shape[0] / embeddings['Segment_ID'].nunique():.1f}")

# Check for missing values
print(f"\n🔍 Missing values:")
print(f"  • Segment_ID: {embeddings['Segment_ID'].isna().sum()}")
print(f"  • Speech_Embeddings: {embeddings['Speech_Embeddings'].isna().sum()}")
print(f"  • Segment_Embeddings: {embeddings['Segment_Embeddings'].isna().sum()}")

# Check sitting length distribution
sitting_lengths = embeddings.groupby('Sitting_ID').size()
print(f"\n📈 Sitting length distribution:")
print(f"  • Min speeches per sitting: {sitting_lengths.min()}")
print(f"  • Max speeches per sitting: {sitting_lengths.max()}")
print(f"  • Average speeches per sitting: {sitting_lengths.mean():.1f}")
print(f"  • Sittings with <50 speeches: {(sitting_lengths < 50).sum()}")
print(f"  • Sittings with >200 speeches: {(sitting_lengths > 200).sum()}")

📊 Data Overview:
  • Total speeches: 231,752
  • Speech embedding shape: (1024,)
  • Segment embedding shape: (1024,)
  • Unique segments: 5,728
  • Average speeches per segment: 40.5

🔍 Missing values:
  • Segment_ID: 0
  • Speech_Embeddings: 41119
  • Segment_Embeddings: 0

📈 Sitting length distribution:
  • Min speeches per sitting: 1
  • Max speeches per sitting: 1378
  • Average speeches per sitting: 189.8
  • Sittings with <50 speeches: 474
  • Sittings with >200 speeches: 542

📈 Sitting length distribution:
  • Min speeches per sitting: 1
  • Max speeches per sitting: 1378
  • Average speeches per sitting: 189.8
  • Sittings with <50 speeches: 474
  • Sittings with >200 speeches: 542


In [3]:
# === COMPREHENSIVE CHAIRPERSON AND AGENDA ANALYSIS ===

print("📊 Overall Speech Statistics:")
print(f"  • Total speeches in dataset: {len(embeddings):,}")

# Chairperson speeches
chairperson_total = embeddings[embeddings['Speaker_role'] == 'Chairperson']
print(f"  • Total chairperson speeches: {len(chairperson_total):,}")
print(f"  • Chairperson percentage: {len(chairperson_total)/len(embeddings)*100:.1f}%")

print("\n📋 Agenda-related Speech Analysis:")

# Various agenda patterns
agenda_patterns = {
    'agenda': embeddings['Text'].str.contains('agenda', case=False),
    'agenda item': embeddings['Text'].str.contains('agenda item', case=False),
    'next agenda': embeddings['Text'].str.contains('next agenda', case=False),
    'next agenda item': embeddings['Text'].str.contains('next agenda item', case=False)
}

# # Count for all speeches
# print("All speeches:")
# for pattern_name, pattern_mask in agenda_patterns.items():
#     count = pattern_mask.sum()
#     percentage = count / len(chairperson_total) * 100
#     print(f"  • Containing '{pattern_name}': {count:,} ({percentage:.2f}%)")

print("\nChairperson speeches only:")
# Count for chairperson speeches only
for pattern_name, pattern_mask in agenda_patterns.items():
    chairperson_with_pattern = embeddings[(embeddings['Speaker_role'] == 'Chairperson') & pattern_mask]
    count = len(chairperson_with_pattern)
    percentage_of_chairperson = count / len(chairperson_total) * 100 if len(chairperson_total) > 0 else 0
    percentage_of_total = count / len(embeddings) * 100
    print(f"  • Containing '{pattern_name}': {count:,} ({percentage_of_chairperson:.1f}% of chairperson speeches, {percentage_of_total:.2f}% of all speeches)")

# Sample of 100 chairperson speeches with 'agenda'
chairperson_with_agenda = embeddings[(embeddings['Speaker_role'] == 'Chairperson') & 
                                    (embeddings['Text'].str.contains('agenda', case=False))]

# Configure display settings
pd.set_option('display.max_colwidth', 200)  # Limit text to 200 chars for readability

sample_speeches = chairperson_with_agenda

for i, (idx, row) in enumerate(sample_speeches.iterrows(), 1):
    # Calculate rows between current and previous (if not first)
    if i > 1:
        prev_idx = prev_row_idx
        rows_between = idx - prev_idx - 1
        #print(f"\n[{rows_between} rows between previous and current]")
    
    # print(f"\n🎤 Speech #{i} (Row index: {idx})")
    # print(f"Sitting: {row['Sitting_ID']} | Speaker: {row['Speaker_role']}")
    # print("-" * 80)
    
    # Truncate very long speeches for readability
    # text = row['Text']
    # if len(text) > 300:
    #     text = text[:300] + "..."
    # print(text)
    # print("-" * 80)
    
    prev_row_idx = idx

print(f"\n📈 Summary of speeches:")
if len(sample_speeches) > 1:
    first_idx = sample_speeches.index[0]
    last_idx = sample_speeches.index[-1]
    total_span = last_idx - first_idx
    avg_gap = total_span / (len(sample_speeches) - 1) if len(sample_speeches) > 1 else 0
    print(f"  • First speech at row: {first_idx:,}")
    print(f"  • Last speech at row: {last_idx:,}")
    print(f"  • Total row span: {total_span:,}")
    print(f"  • Average gap between agenda speeches: {avg_gap:.1f} rows")

# Reset display options
pd.reset_option('display.max_colwidth')

📊 Overall Speech Statistics:
  • Total speeches in dataset: 231,752
  • Total chairperson speeches: 125,038
  • Chairperson percentage: 54.0%

📋 Agenda-related Speech Analysis:

Chairperson speeches only:
  • Containing 'agenda': 11,613 (9.3% of chairperson speeches, 5.01% of all speeches)
  • Containing 'agenda item': 5,039 (4.0% of chairperson speeches, 2.17% of all speeches)
  • Containing 'next agenda': 0 (0.0% of chairperson speeches, 0.00% of all speeches)
  • Containing 'next agenda item': 0 (0.0% of chairperson speeches, 0.00% of all speeches)

Chairperson speeches only:
  • Containing 'agenda': 11,613 (9.3% of chairperson speeches, 5.01% of all speeches)
  • Containing 'agenda item': 5,039 (4.0% of chairperson speeches, 2.17% of all speeches)
  • Containing 'next agenda': 0 (0.0% of chairperson speeches, 0.00% of all speeches)
  • Containing 'next agenda item': 0 (0.0% of chairperson speeches, 0.00% of all speeches)

📈 Summary of speeches:
  • First speech at row: 2
  • Last s

## Parliamentary Segmentation

Improved segmentation considering that Austrian parliament sittings can vary from 1-10 agendas (average 5-6).
We'll implement adaptive segmentation that adjusts to sitting length and uses multiple similarity signals.

In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

def parliamentary_segment_speeches(df, window_size=5, min_segment_size=3):
    """
    Parliamentary segmentation with multi-scale analysis and chairperson agenda detection
    """
    segment_ids = []
    segmentation_metrics = []
    
    # Get unique sittings for progress tracking
    unique_sittings = df['Sitting_ID'].unique()
    print(f"🔄 Processing {len(unique_sittings)} sittings...")
    
    for sitting_id in tqdm(unique_sittings, desc="Segmenting sittings", unit="sitting"):
        group = df[df['Sitting_ID'] == sitting_id]
        sitting_length = len(group)
        
        if sitting_length < min_segment_size:
            # Very small sitting - one segment
            sitting_segments = [f"{sitting_id}_seg_0"] * len(group)
            segment_ids.extend(sitting_segments)
            segmentation_metrics.append({
                'sitting_id': sitting_id,
                'sitting_length': sitting_length,
                'num_segments': 1,
                'avg_segment_size': sitting_length,
                'boundaries_found': 0,
                'agenda_boundaries': 0
            })
            continue
        
        embeddings = np.array(group['Speech_Embeddings'].tolist())
        
        # --- NEW: Flexible target_segments formula ---
        target_segments = max(2, int(np.ceil(sitting_length / 25)))
        threshold_percentile = 40
        
        # === CHAIRPERSON AGENDA DETECTION ===
        agenda_boundaries = set()
        agenda_signals = []
        
        for i, (idx, row) in enumerate(group.iterrows()):
            agenda_score = 0
            
            # Strong signal for chairperson with agenda mentions
            if row['Speaker_role'] == 'Chairperson':
                text = str(row['Text']).lower()
                
                if 'agenda item' in text:
                    agenda_score = 1.0  # Strongest signal
                elif 'agenda' in text:
                    agenda_score = 0.7  # Strong signal
                elif i == 0:  # First speech by chairperson (session start)
                    agenda_score = 0.3  # Mild signal
            
            agenda_signals.append(agenda_score)
            
            # Add strong agenda boundaries
            if agenda_score >= 0.7 and i >= min_segment_size and (sitting_length - i) >= min_segment_size:
                agenda_boundaries.add(i)
        
        # === MULTI-SCALE SIMILARITY ANALYSIS ===
        similarity_signals = {}
        
        # 1. Primary windowed similarity
        similarities = []
        for i in range(len(embeddings) - window_size):
            window1 = np.mean(embeddings[i:i + window_size], axis=0)
            window2 = np.mean(embeddings[i + window_size:i + 2*window_size], axis=0)
            
            sim = cosine_similarity(
                window1.reshape(1, -1),
                window2.reshape(1, -1)
            )[0][0]
            similarities.append(sim)
        
        similarity_signals['primary'] = np.array(similarities)
        
        # 2. Point-to-point similarity for fine-grained detection
        if len(embeddings) > 6:
            point_sims = []
            for i in range(len(embeddings) - 1):
                sim = cosine_similarity(
                    embeddings[i].reshape(1, -1),
                    embeddings[i + 1].reshape(1, -1)
                )[0][0]
                point_sims.append(sim)
            
            # Align with primary signal
            point_sims = np.array(point_sims)
            if len(point_sims) > len(similarities):
                point_sims = point_sims[:len(similarities)]
            elif len(point_sims) < len(similarities):
                padding = len(similarities) - len(point_sims)
                point_sims = np.pad(point_sims, (0, padding), mode='edge')
            
            similarity_signals['point'] = point_sims
        
        # 3. Gradient-based change detection
        if len(embeddings) > 10:
            trajectory = []
            for i in range(1, len(embeddings)):
                displacement = np.linalg.norm(embeddings[i] - embeddings[i-1])
                trajectory.append(float(displacement))
            
            trajectory = np.array(trajectory, dtype=np.float64)
            if len(trajectory) > 3:
                try:
                    from scipy.ndimage import uniform_filter1d
                    smoothed = uniform_filter1d(trajectory.astype(np.float64), size=3)
                    gradient = np.gradient(smoothed)
                    
                    # Align with similarities
                    if len(gradient) > len(similarities):
                        gradient = gradient[:len(similarities)]
                    elif len(gradient) < len(similarities):
                        padding = len(similarities) - len(gradient)
                        gradient = np.pad(gradient, (0, padding), mode='edge')
                    
                    similarity_signals['gradient'] = gradient
                except:
                    pass
        
        if len(similarity_signals['primary']) == 0:
            sitting_segments = [f"{sitting_id}_seg_0"] * len(group)
            segment_ids.extend(sitting_segments)
            segmentation_metrics.append({
                'sitting_id': sitting_id,
                'sitting_length': sitting_length,
                'num_segments': 1,
                'avg_segment_size': sitting_length,
                'boundaries_found': 0,
                'agenda_boundaries': 0
            })
            continue
        
        # === BOUNDARY DETECTION ===
        candidate_boundaries = set()
        
        # 1. Add agenda boundaries (highest priority)
        candidate_boundaries.update(agenda_boundaries)
        
        # 2. Find boundaries from primary similarity drops
        primary_sims = similarity_signals['primary']
        threshold = np.percentile(primary_sims, threshold_percentile)
        
        for i in range(len(primary_sims)):
            if (primary_sims[i] < threshold and 
                i >= min_segment_size and 
                (len(group) - i - window_size) >= min_segment_size):
                candidate_boundaries.add(i + window_size)
        
        # 3. Add from point-to-point analysis
        if 'point' in similarity_signals:
            point_threshold = np.percentile(similarity_signals['point'], threshold_percentile - 10)
            for i in range(len(similarity_signals['point'])):
                if (similarity_signals['point'][i] < point_threshold and 
                    i >= min_segment_size and 
                    (len(group) - i) >= min_segment_size):
                    candidate_boundaries.add(i)
        
        # 4. Add from gradient analysis
        if 'gradient' in similarity_signals:
            gradient = similarity_signals['gradient']
            gradient_threshold = np.percentile(np.abs(gradient), 75)
            for i in range(len(gradient)):
                if (np.abs(gradient[i]) > gradient_threshold and 
                    i >= min_segment_size and 
                    (len(group) - i) >= min_segment_size):
                    candidate_boundaries.add(i)
        
        candidates = sorted(list(candidate_boundaries))
        
        # === BOUNDARY SELECTION WITH AGENDA PRIORITIZATION ===
        boundaries = []
        if candidates:
            if len(candidates) <= target_segments - 1:
                boundaries = candidates
            else:
                # Score candidates with agenda boost
                candidate_scores = []
                for c in candidates:
                    score = 0
                    
                    # Agenda boost (highest priority)
                    if c < len(agenda_signals):
                        score += agenda_signals[c] * 5.0  # Very high weight for agenda
                    
                    # Primary similarity score
                    if c - window_size >= 0 and c - window_size < len(primary_sims):
                        score += (1 - primary_sims[c - window_size]) * 2.0
                    
                    # Point similarity score
                    if 'point' in similarity_signals and c < len(similarity_signals['point']):
                        score += (1 - similarity_signals['point'][c]) * 1.5
                    
                    # Gradient score
                    if 'gradient' in similarity_signals and c < len(similarity_signals['gradient']):
                        score += np.abs(similarity_signals['gradient'][c]) * 1.0
                    
                    candidate_scores.append((c, score))
                
                # Select top scoring boundaries
                candidate_scores.sort(key=lambda x: x[1], reverse=True)
                boundaries = sorted([c for c, _ in candidate_scores[:target_segments-1]])
        
        # === BOUNDARY VALIDATION ===
        validated_boundaries = []
        for boundary in boundaries:
            if not validated_boundaries or (boundary - validated_boundaries[-1]) >= min_segment_size:
                validated_boundaries.append(boundary)
        
        boundaries = validated_boundaries
        
        # Assign segment IDs
        current_segment = 0
        sitting_segments = []
        
        for i in range(len(group)):
            if i > 0 and (i - 1) in boundaries:
                current_segment += 1
            sitting_segments.append(f"{sitting_id}_seg_{current_segment}")
        
        segment_ids.extend(sitting_segments)
        
        # Store metrics
        num_segments = len(set(sitting_segments))
        agenda_bound_count = len([b for b in boundaries if b in agenda_boundaries])
        
        segmentation_metrics.append({
            'sitting_id': sitting_id,
            'sitting_length': sitting_length,
            'num_segments': num_segments,
            'avg_segment_size': sitting_length / num_segments,
            'boundaries_found': len(boundaries),
            'agenda_boundaries': agenda_bound_count,
            'target_segments': target_segments,
            'candidate_boundaries': len(candidates),
            'signals_used': len(similarity_signals) + 1  # +1 for agenda signals
        })
    
    df['Segment_ID'] = segment_ids
    return df, segmentation_metrics

print("✅ Enhanced parliamentary segmentation function loaded (with agenda detection)")

✅ Enhanced parliamentary segmentation function loaded (with agenda detection)


In [5]:
# === APPLY PARLIAMENTARY SEGMENTATION ===
print("🏛️ Running Parliamentary Segmentation (optimized for more segments)...")

# Check data size first
long_speeches_df = embeddings[~embeddings['Is_Too_Short']].copy()
unique_sittings = long_speeches_df['Sitting_ID'].nunique()
total_speeches = len(long_speeches_df)

print(f"📊 Data to process:")
print(f"  • Unique sittings: {unique_sittings:,}")
print(f"  • Total speeches: {total_speeches:,}")
print(f"  • Average speeches per sitting: {total_speeches/unique_sittings:.1f}")

# Run parliamentary segmentation (tuned for more segments)
segmented_df, seg_metrics = parliamentary_segment_speeches(
    long_speeches_df, 
    window_size=5,        
    min_segment_size=3    # Smaller minimum for more segments
)

print(f"\n✅ Parliamentary segmentation complete!")
print(f"📊 Results:")
print(f"  • Total speeches processed: {len(segmented_df):,}")
print(f"  • Unique segments created: {segmented_df['Segment_ID'].nunique():,}")
print(f"  • Average speeches per segment: {len(segmented_df) / segmented_df['Segment_ID'].nunique():.1f}")

# Convert metrics to DataFrame for analysis
metrics_df = pd.DataFrame(seg_metrics)

print(f"\n📈 Segmentation Quality Overview:")
print(f"  • Average segments per sitting: {metrics_df['num_segments'].mean():.1f}")
print(f"  • Segments per sitting (std): {metrics_df['num_segments'].std():.1f}")
print(f"  • Average segment size: {metrics_df['avg_segment_size'].mean():.1f}")
print(f"  • Target vs actual correlation: {metrics_df[['target_segments', 'num_segments']].corr().iloc[0,1]:.3f}")
print(f"  • Average candidate boundaries: {metrics_df['candidate_boundaries'].mean():.1f}")
print(f"  • Average signals used: {metrics_df['signals_used'].mean():.1f}")

# Analysis of signal effectiveness
sittings_with_point = (metrics_df['signals_used'] >= 2).sum()
sittings_with_gradient = (metrics_df['signals_used'] >= 3).sum()
print(f"\n🔍 Signal Usage Analysis:")
print(f"  • Sittings using point-to-point analysis: {sittings_with_point}/{len(metrics_df)} ({sittings_with_point/len(metrics_df)*100:.1f}%)")
print(f"  • Sittings using gradient analysis: {sittings_with_gradient}/{len(metrics_df)} ({sittings_with_gradient/len(metrics_df)*100:.1f}%)")

🏛️ Running Parliamentary Segmentation (optimized for more segments)...
📊 Data to process:
  • Unique sittings: 1,221
  • Total speeches: 190,633
  • Average speeches per sitting: 156.1
🔄 Processing 1221 sittings...
📊 Data to process:
  • Unique sittings: 1,221
  • Total speeches: 190,633
  • Average speeches per sitting: 156.1
🔄 Processing 1221 sittings...


Segmenting sittings: 100%|██████████| 1221/1221 [03:10<00:00,  6.40sitting/s]


✅ Parliamentary segmentation complete!
📊 Results:
  • Total speeches processed: 190,633
  • Unique segments created: 7,043
  • Average speeches per segment: 27.1

📈 Segmentation Quality Overview:
  • Average segments per sitting: 5.8
  • Segments per sitting (std): 5.1
  • Average segment size: 18.9
  • Target vs actual correlation: 0.964
  • Average candidate boundaries: 159.1
  • Average signals used: 4.0

🔍 Signal Usage Analysis:
  • Sittings using point-to-point analysis: 790/1221 (64.7%)
  • Sittings using gradient analysis: 777/1221 (63.6%)





## Topic Modeling with BERTopic

This section runs the topic modeling on the segmented speeches using BERTopic with guided topics.

In [None]:
# === BERTOPIC SETUP WITH GUIDED TOPICS ===
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from umap import UMAP
import openai
import os
from dotenv import load_dotenv

# Define the 22 target topic categories
label_list = [
    "Education", "Technology", "Health", "Environment", "Housing", "Labor", 
    "Defense", "Government Operations", "Social Welfare", "Other", "Macroeconomics", 
    "Domestic Commerce", "Civil Rights", "International Affairs", "Transportation", 
    "Immigration", "Law and Crime", "Agriculture", "Foreign Trade", "Culture", 
    "Public Lands", "Energy"
]

# Detailed topic descriptions for better classification
majortopics_description = {
    'Macroeconomics': 'issues related to domestic macroeconomic policy, such as the state and prospect of the national economy, economic policy, inflation, interest rates, monetary policy, cost of living, unemployment rate, national budget, public debt, price control, tax enforcement, industrial revitalization and growth.',
    'Civil Rights': 'issues related to civil rights and minority rights, discrimination towards races, gender, sexual orientation, handicap, and other minorities, voting rights, freedom of speech, religious freedoms, privacy rights, protection of personal data, abortion rights, anti-government activity groups (e.g., local insurgency groups), religion and the Church.',
    'Health': 'issues related to health care, health care reforms, health insurance, drug industry, medical facilities, medical workers, disease prevention, treatment, and health promotion, drug and alcohol abuse, mental health, research in medicine, medical liability and unfair medical practices.',
    'Agriculture': 'issues related to agriculture policy, fishing, agricultural foreign trade, food marketing, subsidies to farmers, food inspection and safety, animal and crop disease, pest control and pesticide regulation, welfare for animals in farms, pets, veterinary medicine, agricultural research.',
    'Labor': 'issues related to labor, employment, employment programs, employee benefits, pensions and retirement accounts, minimum wage, labor law, job training, labor unions, worker safety and protection, youth employment and seasonal workers.',
    'Education': 'issues related to educational policies, primary and secondary schools, student loans and education finance, the regulation of colleges and universities, school reforms, teachers, vocational training, evening schools, safety in schools, efforts to improve educational standards, and issues related to libraries, dictionaries, teaching material, research in education.',
    'Environment': 'issues related to environmental policy, drinking water safety, all kinds of pollution (air, noise, soil), waste disposal, recycling, climate change, outdoor environmental hazards (e.g., asbestos), species and forest protection, marine and freshwater environment, hunting, regulation of laboratory or performance animals, land and water resource conservation, research in environmental technology.',
    'Energy': 'issues related to energy policy, electricity, regulation of electrical utilities, nuclear energy and disposal of nuclear waste, natural gas and oil, drilling, oil spills, oil and gas prices, heat supply, shortages and gasoline regulation, coal production, alternative and renewable energy, energy conservation and energy efficiency, energy research.',
    'Immigration': 'issues related to immigration, refugees, and citizenship, integration issues, regulation of residence permits, asylum applications; criminal offences and diseases caused by immigration.',
    'Transportation': 'issues related to mass transportation construction and regulation, bus transport, regulation related to motor vehicles, road construction, maintenance and safety, parking facilities, traffic accidents statistics, air travel, rail travel, rail freight, maritime transportation, inland waterways and channels, transportation research and development.',
    'Law and Crime': 'issues related to the control, prevention, and impact of crime; all law enforcement agencies, including border and customs, police, court system, prison system; terrorism, white collar crime, counterfeiting and fraud, cyber-crime, drug trafficking, domestic violence, child welfare, family law, juvenile crime.',
    'Social Welfare': 'issues related to social welfare policy, the Ministry of Social Affairs, social services, poverty assistance for low-income families and for the elderly, parental leave and child care, assistance for people with physical or mental disabilities, including early retirement pension, discounts on public services, volunteer associations (e.g., Red Cross), charities, and youth organizations.',
    'Housing': 'issues related to housing, urban affairs and community development, housing market, property tax, spatial planning, rural development, location permits, construction inspection, illegal construction, industrial and commercial building issues, national housing policy, housing for low-income individuals, rental housing, housing for the elderly, e.g., nursing homes, housing for the homeless and efforts to reduce homelessness, research related to housing.',
    'Domestic Commerce': 'issues related to banking, finance and internal commerce, including stock exchange, investments, consumer finance, mortgages, credit cards, insurance availability and cost, accounting regulation, personal, commercial, and municipal bankruptcies, programs to promote small businesses, copyrights and patents, intellectual property, natural disaster preparedness and relief, consumer safety; regulation and promotion of tourism, sports, gambling, and personal fitness; domestic commerce research.',
    'Defense': 'issues related to defense policy, military intelligence, espionage, weapons, military personnel, reserve forces, military buildings, military courts, nuclear weapons, civil defense, including firefighters and mountain rescue services, homeland security, military aid or arms sales to other countries, prisoners of war and collateral damage to civilian populations, military nuclear and hazardous waste disposal and military environmental compliance, defense alliances and agreements, direct foreign military operations, claims against military, defense research.',
    'Technology': 'issues related to science and technology transfer and international science cooperation, research policy, government space programs and space exploration, telephones and telecommunication regulation, broadcast media (television, radio, newspapers, films), weather forecasting, geological surveys, computer industry, cyber security.',
    'Foreign Trade': 'issues related to foreign trade, trade negotiations, free trade agreements, import regulation, export promotion and regulation, subsidies, private business investment and corporate development, competitiveness, exchange rates, the strength of national currency in comparison to other currencies, foreign investment and sales of companies abroad.',
    'International Affairs': 'issues related to international affairs, foreign policy and relations to other countries, issues related to the Ministry of Foreign Affairs, foreign aid, international agreements (such as Kyoto agreement on the environment, the Schengen agreement), international organizations (including United Nations, UNESCO, International Olympic Committee, International Criminal Court), NGOs, issues related to diplomacy, embassies, citizens abroad; issues related to border control; issues related to international finance, including the World Bank and International Monetary Fund, the financial situation of the EU; issues related to a foreign country that do not impact the home country; issues related to human rights in other countries, international terrorism.',
    'Government Operations': 'issues related to general government operations, the work of multiple departments, public employees, postal services, nominations and appointments, national mints, medals, and commemorative coins, management of government property, government procurement and contractors, public scandal and impeachment, claims against the government, the state inspectorate and audit, anti-corruption policies, regulation of political campaigns, political advertising and voter registration, census and statistics collection by government; issues related to local government, capital city and municipalities, including decentralization; issues related to national holidays.',
    'Public Lands': 'issues related to national parks, memorials, historic sites, and protected areas, including the management and staffing of cultural sites; museums; use of public lands and forests, establishment and management of harbors and marinas; issues related to flood control, forest fires, livestock grazing.',
    'Culture': 'issues related to cultural policies, Ministry of Culture, public spending on culture, cultural employees, issues related to support of theatres and artists; allocation of funds from the national lottery, issues related to cultural heritage.',
    'Other': 'other topics not mentioning policy agendas, including the procedures of parliamentary meetings, e.g., points of order, voting procedures, meeting logistics; interpersonal speech, e.g., greetings, personal stories, tributes, interjections, arguments between the members; rhetorical speech, e.g., jokes, literary references.'
}

# Austrian parliament-specific stop words
custom_stopwords = [
    'mr', 'mrs', 'ms', 'dr', 'madam', 'honourable', 'member', 'members', 'vp', 'sp', 'fp', 
    'minister', 'speaker', 'deputy', 'president', 'chairman', 'chair', 'schilling', 
    'secretary', 'lord', 'lady', 'question', 'order', 'point', 'debate', 'motion', 'amendment',
    'congratulations', 'congratulate', 'thanks', 'thank', 'say', 'one', 'want', 'know', 'think', 
    'believe', 'see', 'go', 'come', 'give', 'take', 'people', 'federal', 'government', 'austria', 
    'austrian', 'committee', 'call', 'said', 'already', 'please', 'request', 'proceed', 'reading',
    'course', 'welcome', 'council', 'open', 'written', 'contain', 'items', 'item', 'yes', 'no', 
    'following', 'next', 'speech', 'year', 'years', 'state', 'also', 'would', 'like', 'may', 'must', 
    'upon', 'indeed', 'session', 'meeting', 'report', 'commission', 'behalf', 'gentleman', 'gentlemen', 
    'ladies', 'applause', 'group', 'colleague', 'colleagues', 'issue', 'issues', 'chancellor', 'court', 
    'ask', 'answer', 'reply', 'regard', 'regarding', 'regards', 'respect', 'respectfully', 'sign', 
    'shall', 'procedure', 'declare', 'hear', 'minutes', 'speaking', 'close', 'abg', 'mag', 'orf', 'wait'
]

all_stopwords = list(ENGLISH_STOP_WORDS) + custom_stopwords

# Use enhanced segmentation for topic modeling
segment_texts = segmented_df.groupby('Segment_ID')['Text'].apply(lambda x: ' '.join(x)).tolist()
segment_embeddings = np.array(segmented_df.groupby('Segment_ID')['Segment_Embeddings'].first().tolist())

# Configure vectorizer
vectorizer_model = CountVectorizer(
    stop_words=all_stopwords,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    max_features=1000
)

print(f"🎯 Target categories: {len(label_list)} topics")
print(f"📊 Topic modeling data prepared:")
print(f"  • Segments for modeling: {len(segment_texts)}")
print(f"  • Embedding dimension: {segment_embeddings.shape[1]}")
print(f"📖 Topic descriptions loaded for enhanced classification")

In [None]:
# === HIERARCHICAL BERTOPIC (RECOMMENDED APPROACH) ===
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN

def train_hierarchical_bertopic():
    """Train BERTopic with many subtopics, then map to 22 main categories."""
    print("🏗️ Training Hierarchical BERTopic with HDBSCAN...")
    
    # Configure UMAP and HDBSCAN for more granular topics
    umap_model = UMAP(n_neighbors=10, n_components=8, metric='cosine', random_state=42)
    
    # Use HDBSCAN to find clusters automatically.
    # A smaller min_cluster_size will result in more, smaller topics.
    clustering_model = HDBSCAN(
        min_cluster_size=5,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True
    )
    
    representation_model = KeyBERTInspired()
    
    topic_model_hierarchical = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        umap_model=umap_model,
        hdbscan_model=clustering_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        min_topic_size=10,  # Align with min_cluster_size
        calculate_probabilities=True,
        verbose=True
    )
    
    topics, probs = topic_model_hierarchical.fit_transform(segment_texts, embeddings=segment_embeddings)
    topic_info_hierarchical = topic_model_hierarchical.get_topic_info()
    
    print(f"✅ Hierarchical model created {len(topic_info_hierarchical[topic_info_hierarchical['Topic'] != -1])} subtopics")
    
    return topic_model_hierarchical, topics, topic_info_hierarchical

# Train hierarchical model
topic_model_hierarchical, topics_hierarchical, topic_info_hierarchical = train_hierarchical_bertopic()

In [None]:
topic_info_hierarchical

In [None]:
# === LLM CLASSIFICATION TO 22 CATEGORIES ===

# Load environment variables
dotenv_path = os.path.join(os.pardir, '.env')
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)
    print(f"✅ Loaded .env file from: {dotenv_path}")
else:
    print("⚠️ .env file not found. Ensure OPENAI_API_KEY is set in environment.")

def classify_topic_to_22_categories(topic_words, topic_id=-1):
    """Enhanced classification using detailed topic descriptions."""
    if not isinstance(topic_words, list) or not topic_words:
        return "Other"
    
    keywords_str = ', '.join(topic_words[:12])  # Use top 12 words for better context
    
    # Create detailed category descriptions for the prompt
    category_descriptions = []
    for category in label_list:
        description = majortopics_description.get(category, f"Issues related to {category.lower()}")
        category_descriptions.append(f"• {category}: {description}")
    
    categories_text = '\n'.join(category_descriptions)
    
    prompt = f"""You are analyzing topics from parliamentary debates. 

TOPIC KEYWORDS: {keywords_str}

AVAILABLE CATEGORIES WITH DESCRIPTIONS:
{categories_text}

INSTRUCTIONS:
1. Analyze the keywords carefully in the context of parliamentary discussions
2. Consider which category description best matches the semantic content of the keywords
3. Look for key thematic indicators (e.g., "economic", "health", "defense", "education", etc.)
4. If keywords relate to parliamentary procedures, interpersonal speech, or don't fit policy areas, choose "Other"
5. Choose the single best-fitting category

RESPONSE: Only output the exact category name from the list above."""

    try:
        if not os.getenv('OPENAI_API_KEY'):
            print(f"Error: OPENAI_API_KEY not set for topic {topic_id}")
            return "Other"
        
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in political science, parliamentary procedures, and policy classification. You excel at accurately mapping topic keywords to policy domains."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.05,  # Very low temperature for consistency
            max_tokens=30
        )
        
        classification = response.choices[0].message.content.strip()
        
        # Clean the response and ensure exact match
        classification = classification.replace('"', '').replace("'", "").strip()
        
        # Exact match check
        if classification in label_list:
            return classification
        
        # Fuzzy matching for partial matches
        classification_lower = classification.lower()
        for category in label_list:
            if category.lower() == classification_lower:
                return category
            # Check if classification contains the category name
            if category.lower() in classification_lower or classification_lower in category.lower():
                return category
                
        # Special handling for common variations
        category_mapping = {
            'macro': 'Macroeconomics',
            'economics': 'Macroeconomics', 
            'economic': 'Macroeeconomics',
            'rights': 'Civil Rights',
            'welfare': 'Social Welfare',
            'social': 'Social Welfare',
            'crime': 'Law and Crime',
            'legal': 'Law and Crime',
            'justice': 'Law and Crime',
            'foreign': 'International Affairs',
            'international': 'International Affairs',
            'trade': 'Foreign Trade',
            'government': 'Government Operations',
            'administration': 'Government Operations'
        }
        
        for key, mapped_category in category_mapping.items():
            if key in classification_lower:
                return mapped_category
        
        return "Other"  # Final fallback
            
    except Exception as e:
        print(f"Error classifying topic {topic_id}: {e}")
        return "Other"

def map_topics_to_22_categories(topic_info, approach_name):
    """Map topics to the 22 predefined categories using LLM classification."""
    print(f"🤖 Classifying {approach_name} topics into 22 categories...")
    
    topic_info_classified = topic_info.copy()
    topic_info_classified['Category_22'] = "Other"
    
    classification_results = []
    
    for idx, row in topic_info_classified.iterrows():
        if row['Topic'] != -1:
            topic_words = row['Representation']
            classification = classify_topic_to_22_categories(topic_words, row['Topic'])
            topic_info_classified.loc[idx, 'Category_22'] = classification
            
            classification_results.append({
                'Topic_ID': row['Topic'],
                'Keywords': ', '.join(topic_words[:5]),
                'Classification': classification,
                'Count': row['Count']
            })
    
    # Count topics per category
    category_counts = topic_info_classified[topic_info_classified['Topic'] != -1]['Category_22'].value_counts()
    
    print(f"\n📊 {approach_name} - Classification results:")
    for category in label_list:
        count = category_counts.get(category, 0)
        if count > 0:
            topic_count = len(classification_results)
            percentage = (count / topic_count * 100) if topic_count > 0 else 0
            print(f"  • {category:<20}: {count:>2} topics ({percentage:>5.1f}%)")
    
    return topic_info_classified

# Apply LLM classification
print("🚀 Applying LLM classification to map subtopics to 22 categories...")
topic_info_classified = map_topics_to_22_categories(topic_info_hierarchical, "Hierarchical BERTopic")

In [None]:
# === CREATE FINAL 22-TOPIC MAPPING ===

def create_final_22_topic_mapping():
    """Create final mapping to 22 topics."""
    print("🏗️ Creating final 22-topic mapping...")
    
    # Create segment-level mapping
    segment_topic_map = pd.DataFrame({
        'Segment_ID': segmented_df['Segment_ID'].unique(),
        'Subtopic_ID': topics_hierarchical,
    })
    
    # Map subtopics to 22 categories
    subtopic_to_category = dict(zip(
        topic_info_classified['Topic'], 
        topic_info_classified['Category_22']
    ))
    
    segment_topic_map['Topic_22'] = segment_topic_map['Subtopic_ID'].map(subtopic_to_category)
    segment_topic_map['Topic_22'] = segment_topic_map['Topic_22'].fillna('Other')
    
    # Merge with original data
    embeddings_with_22_topics = segmented_df.merge(
        segment_topic_map, 
        on='Segment_ID', 
        how='left'
    )
    
    # Generate topic statistics
    category_stats = embeddings_with_22_topics.groupby('Topic_22').agg({
        'Segment_ID': 'nunique',
        'Text': 'count'
    }).rename(columns={
        'Segment_ID': 'Unique_Segments',
        'Text': 'Total_Speeches'
    }).sort_values('Total_Speeches', ascending=False)
    
    print(f"\n✅ Final 22-topic mapping created!")
    print(f"📊 Topic distribution:")
    for topic, stats in category_stats.iterrows():
        print(f"  • {topic:<20}: {stats['Total_Speeches']:>4} speeches, {stats['Unique_Segments']:>3} segments")
    
    return embeddings_with_22_topics, topic_info_classified, category_stats

# Create final mapping
final_embeddings, final_topic_info, category_stats = create_final_22_topic_mapping()

print(f"\n🎉 SUCCESS: Mapped all topics to 22 predefined categories!")
print(f"📈 Coverage: {len(category_stats)} out of {len(label_list)} categories have content")

In [None]:
# === FINAL RESULTS ===

# Display final topic information
print("📋 Final Topic Classifications:")
final_display = final_topic_info[final_topic_info['Topic'] != -1][['Topic', 'Count', 'Category_22', 'Representation']].copy()
final_display['Keywords'] = final_display['Representation'].apply(lambda x: ', '.join(x[:5]))
final_display = final_display.drop('Representation', axis=1).sort_values('Count', ascending=False)

print(final_display.head(15).to_string(index=False))

# Show category coverage
print(f"\n📊 Category Coverage Summary:")
print(f"Categories with content: {len(category_stats)}/{len(label_list)}")
print(f"Most common categories:")
for category, stats in category_stats.head(10).iterrows():
    percentage = (stats['Total_Speeches'] / final_embeddings.shape[0] * 100)
    print(f"  • {category:<20}: {percentage:>5.1f}% of speeches")

# Find and display categories with no content
represented_categories = set(category_stats.index)
unrepresented_categories = set(label_list) - represented_categories

if unrepresented_categories:
    print(f"\nCategories with no content (0% of speeches):")
    for category in sorted(list(unrepresented_categories)):
        print(f"  • {category}")

In [None]:
# === SAVE RESULTS (OPTIONAL) ===
# Uncomment to save the final results

# print("💾 Saving final results...")
# final_embeddings.to_pickle('data folder/data/AT_with_22_topics_final.pkl')
# final_topic_info.to_pickle('data folder/data/topic_info_22_categories.pkl')
# category_stats.to_pickle('data folder/data/category_statistics.pkl')
# print("✅ Results saved!")

print("\n🎉 Analysis complete! Ready to run.")