# Topic Modeling with BERTopic on Austrian Parliament Speeches

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

# Build the path from its components so the separator is right on every OS.
# The previous raw string hard-coded Windows backslashes, which POSIX systems
# treat as part of a single file name.
pickle_file_path = Path('data folder') / 'data' / 'AT_preprocessed.pkl'
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
AT_preprocessed_df = pd.read_pickle(pickle_file_path)
print(f"Loaded {pickle_file_path}. Initial shape: {AT_preprocessed_df.shape}")

Loaded data folder\data\AT_preprocessed.pkl. Initial shape: (170096, 19)


## Analyze Speech Length Distribution (Token Count)

Before grouping utterances into segments, we can also analyze the distribution of individual speech lengths based on their approximate token count. This provides context on the typical length of speeches that are later aggregated into segments.

In [8]:
# Approximate the length of every speech: count whitespace-separated words,
# then scale to tokens with a fixed heuristic (1 word is taken as ~1.33 tokens).
AT_preprocessed_df['Word_Count'] = AT_preprocessed_df['Processed_Text'].apply(lambda x: len(x.split()))
AT_preprocessed_df['Speech_Token_Count'] = (AT_preprocessed_df['Word_Count'] * 1.33).round().astype(int)

# --- Cumulative Distribution for Speech Token Counts ---
print("\n--- Cumulative Token Count Distribution for Speeches (approx.) ---")
speech_token_counts = AT_preprocessed_df['Speech_Token_Count'].dropna()
total_speeches_token_dist = len(speech_token_counts)

if total_speeches_token_dist > 0:
    # Percentile table: each row reads "p% of speeches have at most X tokens".
    percentiles_to_show_speech = np.array([50, 75, 90, 95, 98, 99, 99.5, 99.9, 100])
    speech_token_count_thresholds = np.percentile(speech_token_counts, percentiles_to_show_speech)

    print("Token Count Threshold <= X: % of Speeches")
    for p, tc_thresh in zip(percentiles_to_show_speech, speech_token_count_thresholds):
        print(f"<= {tc_thresh:.0f} tokens: {p:.2f}%")

    # Cumulative histogram (nbins=30). Speeches above the 99.9th percentile are
    # dropped from the plot only, so the long tail does not stretch the x-axis.
    p99_9_speech_tokens = np.percentile(speech_token_counts, 99.9)
    speech_df_filtered_plot_tokens = AT_preprocessed_df[AT_preprocessed_df['Speech_Token_Count'] <= p99_9_speech_tokens]

    fig_cumulative_hist_speech_tokens = px.histogram(
        speech_df_filtered_plot_tokens,
        x='Speech_Token_Count',
        nbins=30,
        title='Cumulative Distribution of Speech Lengths (Token Count, up to 99.9th percentile, approx.)',
        labels={'Speech_Token_Count': 'Number of Tokens per Speech (approx.)'},
        cumulative=True,
        histnorm='percent',
    )
    fig_cumulative_hist_speech_tokens.update_layout(yaxis_title='Cumulative Percentage of Speeches')
    fig_cumulative_hist_speech_tokens.show()
else:
    print("No speech data to calculate cumulative token count distribution.")


--- Cumulative Token Count Distribution for Speeches (approx.) ---
Token Count Threshold <= X: % of Speeches
<= 105 tokens: 50.00%
<= 306 tokens: 75.00%
<= 475 tokens: 90.00%
<= 632 tokens: 95.00%
<= 833 tokens: 98.00%
<= 1033 tokens: 99.00%
<= 1254 tokens: 99.50%
<= 1613 tokens: 99.90%
<= 9494 tokens: 100.00%


### Generate Sentence Embeddings

Before applying segmentation algorithms, we need to represent the semantic meaning of each utterance. We will use a Sentence-BERT model to generate dense vector embeddings for the `Processed_Text` of each speech.

In [9]:
# from sentence_transformers import SentenceTransformer
# import time
# import torch
# import pandas as pd # Ensure pandas is imported

# # # Determine which DataFrame to use for embeddings
# # # Directly use AT_preprocessed_df as no filtering for short speeches is needed here.
# df_for_embedding = AT_preprocessed_df.copy() # Use .copy() to avoid SettingWithCopyWarning if modifications are made later

# # # Proceed with embedding generation
# print(f"Final DataFrame shape for embedding: {df_for_embedding.shape}")
# print("Proceeding with Nomic Embed Text v1.5 embedding generation.")

# # # Choose the Nomic Embed Text v1.5 model
# model_name = 'nomic-ai/nomic-embed-text-v1.5'
# print(f"Loading SentenceTransformer model: {model_name}...")
# load_start_time = time.time()

# device = 'cpu'

# embedding_model_nomic = SentenceTransformer(model_name, device=device, trust_remote_code=True) # Added trust_remote_code=True for Nomic
# print(f"Model '{model_name}' loaded in {time.time() - load_start_time:.2f} seconds.")

# docs_to_embed = df_for_embedding['Processed_Text'].astype(str).tolist()

# print(f"Generating embeddings for {len(docs_to_embed)} documents using {model_name}...")
# embed_start_time = time.time()

# batch_size = 16 # Nomic might benefit from smaller or larger batch sizes depending on GPU memory
# print(f"Using batch_size: {batch_size}")

# embeddings_nomic = embedding_model_nomic.encode(docs_to_embed, show_progress_bar=True, batch_size=batch_size)

# print(f"Nomic Embed Text v1.5 embeddings generated in {time.time() - embed_start_time:.2f} seconds.")
# print(f"Shape of Nomic Embed Text v1.5 embeddings: {embeddings_nomic.shape}")

# # # Create a temporary series with the new embeddings, aligned with the index of df_for_embedding
# embeddings_series = pd.Series(list(embeddings_nomic), index=df_for_embedding.index)

# # # Update/Create the 'Embeddings' column in the global AT_preprocessed_df DataFrame
# AT_preprocessed_df['Embeddings'] = pd.Series(dtype='object')
# AT_preprocessed_df.loc[df_for_embedding.index, 'Embeddings'] = embeddings_series
# print(f"Embeddings column updated in 'AT_preprocessed_df' for {len(df_for_embedding)} rows.")

# print("\nDataFrame Info for 'AT_preprocessed_df' after adding/updating Nomic Embed Text v1.5 embeddings:")
# AT_preprocessed_df.info(memory_usage='deep')
# print("\nSample of rows from 'AT_preprocessed_df' that received new embeddings:")
# print(AT_preprocessed_df.loc[df_for_embedding.index, ['Processed_Text', 'Word_Count', 'Is_Short_Speech', 'Embeddings']].head(2).assign(Embeddings_Sample=lambda x: x['Embeddings'].apply(lambda y: y[:5] if hasattr(y, '__iter__') else None)))

In [None]:
# # save the DataFrame with embeddings to a new pickle file
# output_pickle_file_path = r'data folder\data\AT_with_embeddings.pkl' # Save in the parent directory
# AT_preprocessed_df.to_pickle(output_pickle_file_path)
# print(f"DataFrame with embeddings saved to {output_pickle_file_path}.")

In [None]:
# To load the DataFrame with embeddings in a future session.
# The imports are repeated here so this cell can be run on its own.
from pathlib import Path

import pandas as pd

# Path components are joined with pathlib instead of hard-coded backslashes, so
# the file is found on non-Windows systems too.
# NOTE: this rebinds AT_preprocessed_df to whatever was saved in the pickle;
# unpickling can execute arbitrary code, so only load files you trust.
AT_preprocessed_df = pd.read_pickle(Path('data folder') / 'data' / 'AT_with_embeddings.pkl')

## Unsupervised Text Segmentation

Now that we have semantic embeddings for each utterance, we can attempt to automatically divide each parliamentary sitting into topically coherent segments. We will use a similarity-based approach:

1.  **Group by Sitting**: Process utterances belonging to the same sitting together.
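2.  **Calculate Windowed Similarity**: For each sitting, look at every gap between two consecutive utterances and compute the cosine similarity between the mean embedding of the utterances just before the gap and the mean embedding of the utterances just after it (up to `window_size` utterances on each side).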
3.  **Identify Boundaries**: Look for significant drops (valleys) in the similarity scores. These drops suggest a potential shift in topic.
4.  **Mark Segments**: Assign a segment ID to each utterance based on the detected boundaries.

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import find_peaks
import plotly.graph_objects as go
import numpy as np # Ensure numpy is imported

def calculate_windowed_similarity(embeddings_list, window_size):
    """Score every gap between consecutive utterances by windowed cosine similarity.

    For the gap after utterance ``g`` the function averages the embeddings of up
    to ``window_size`` utterances ending at ``g`` and of up to ``window_size``
    utterances starting at ``g + 1``, then takes the cosine similarity of the
    two mean vectors. Windows are clipped at the start and end of the sequence.

    Args:
        embeddings_list: Sequence of equally shaped embedding vectors, either a
            list of array-likes or a 2-D array with one row per utterance.
        window_size: Maximum number of utterances averaged on each side of a gap.

    Returns:
        A 1-D numpy array with ``len(embeddings_list) - 1`` similarity scores,
        or an empty array when fewer than two embeddings are given.

    Raises:
        ValueError: If ``window_size`` is smaller than 1 (only checked when at
            least two embeddings are supplied).
    """
    if len(embeddings_list) < 2:
        return np.array([])
    if window_size < 1:
        raise ValueError("Window size must be at least 1.")

    # Convert once up front instead of once per window. Slicing the stacked
    # matrix also works for ndarray input, where the old truthiness check on a
    # slice raised "truth value of an array is ambiguous".
    embedding_matrix = np.asarray([np.asarray(e) for e in embeddings_list])
    num_utterances = len(embedding_matrix)
    similarities = []

    for g in range(num_utterances - 1):  # g is the index of the utterance before the gap
        # Both slices are non-empty for every g because window_size >= 1 and
        # g + 1 <= num_utterances - 1, so no empty-window fallback is needed.
        window_before = embedding_matrix[max(0, g - window_size + 1):g + 1]
        window_after = embedding_matrix[g + 1:min(num_utterances, g + 1 + window_size)]

        mean_emb_before = window_before.mean(axis=0)
        mean_emb_after = window_after.mean(axis=0)

        sim = cosine_similarity(mean_emb_before.reshape(1, -1), mean_emb_after.reshape(1, -1))
        similarities.append(sim[0][0])
    return np.array(similarities)

def find_boundaries(similarities, height_threshold=0.2, prominence_threshold=0.1, distance_threshold=5):
    """Locate valleys in a similarity curve by peak-finding on ``1 - similarity``.

    Args:
        similarities: 1-D sequence (list or array) of similarity scores.
        height_threshold: Minimum value of ``1 - similarity`` for a boundary,
            i.e. the similarity must be at most ``1 - height_threshold``.
        prominence_threshold: Minimum prominence of the inverted peak.
        distance_threshold: Minimum number of samples between two boundaries.

    Returns:
        Integer numpy array with the positions of the detected valleys. The
        array is empty (but still integer-typed, so it stays usable as an
        index) when ``similarities`` is empty.
    """
    # Accept plain lists as well as arrays; the dtype is left untouched so the
    # arithmetic below runs in the caller's precision.
    similarities = np.asarray(similarities)
    if similarities.size == 0:
        return np.array([], dtype=np.intp)

    # Similarities above 1 would invert to negative values; clamp them to 0 so
    # the height threshold is compared against non-negative numbers.
    inverted_similarities = np.maximum(0, 1 - similarities)
    peaks, _ = find_peaks(inverted_similarities,
                          height=height_threshold,
                          prominence=prominence_threshold,
                          distance=distance_threshold)
    return peaks

In [13]:
boundary_indices_sim = []
processed_groups_count = 0

total_groups = AT_preprocessed_df['Sitting_ID'].nunique()
similarity_window_size = 3 # Define the window size for similarity calculation

# --- Experiment with these threshold values ---
# Defined once, before the loop: they do not change per sitting and the
# sample-sitting cell further down reads them as well.
height_val = 0.25 # Requires 1 - similarity >= 0.25, i.e. similarity <= 0.75 at the boundary
prominence_val = 0.15 # Requires the drop to be distinct from its surroundings
distance_val = 5    # Minimum number of gaps between two boundaries
# --- End of threshold values ---

print(f"Using similarity window size: {similarity_window_size}")
print(f"Total unique sittings to process: {total_groups}")

# Segment IDs are written into this buffer at each row's *position* in the
# DataFrame. Collecting them in groupby order and assigning the list as a
# column is only correct when the frame happens to be sorted by Sitting_ID.
segment_id_by_position = np.empty(len(AT_preprocessed_df), dtype=object)

# Group by the sitting identifier (Sitting_ID)
sittings_grouped = AT_preprocessed_df.groupby('Sitting_ID')
for sitting_identifier, group in sittings_grouped:
    processed_groups_count += 1
    # Integer row positions of this sitting inside AT_preprocessed_df.
    row_positions = sittings_grouped.indices[sitting_identifier]

    if len(group) < 2: # Not enough utterances to form a gap or windows
        segment_id_by_position[row_positions] = [f"{sitting_identifier}_seg_0"] * len(group)
        continue

    embeddings_list = group['Embeddings'].tolist()

    # Calculate similarities
    similarities = calculate_windowed_similarity(embeddings_list, similarity_window_size)

    if len(similarities) == 0: # Should not happen if len(group) >= 2
        segment_id_by_position[row_positions] = [f"{sitting_identifier}_seg_0"] * len(group)
        continue

    boundaries = find_boundaries(similarities,
                                 height_threshold=height_val,
                                 prominence_threshold=prominence_val,
                                 distance_threshold=distance_val)
    # Set membership is O(1); testing against the array scanned it per utterance.
    boundary_positions = set(boundaries.tolist())

    current_segment = 0
    sitting_segment_ids = []
    group_indices = group.index.tolist() # Get original DataFrame indices for this group

    for i in range(len(group)):
        # A boundary is *after* utterance (i-1), so utterance i starts a new segment
        if i > 0 and (i - 1) in boundary_positions:
            current_segment += 1
            # Store the original DataFrame index of the utterance that starts the new segment
            boundary_indices_sim.append(group_indices[i])

        sitting_segment_ids.append(f"{sitting_identifier}_seg_{current_segment}")

    segment_id_by_position[row_positions] = sitting_segment_ids

# Assign the generated segment IDs to the DataFrame (one entry per row, in row order).
# NOTE(review): rows that groupby skips (presumably those with a missing
# Sitting_ID) keep the buffer's initial None -- confirm no such rows exist.
segment_ids_sim = segment_id_by_position.tolist()
AT_preprocessed_df['Segment_ID'] = segment_ids_sim

print("\nSimilarity Segmentation complete. 'Segment_ID' column added.")
print(f"Identified {len(boundary_indices_sim)} potential topic boundaries across all sittings.")

# Calculate and print total unique segments and average segments per sitting
total_unique_segments = AT_preprocessed_df['Segment_ID'].nunique()
print(f"Total number of unique segments created: {total_unique_segments}")

avg_segments_per_sitting = AT_preprocessed_df.groupby('Sitting_ID')['Segment_ID'].nunique().mean()
print(f"Average number of segments per sitting: {avg_segments_per_sitting:.2f}")

Using similarity window size: 3
Total unique sittings to process: 1221

Similarity Segmentation complete. 'Segment_ID' column added.
Identified 15466 potential topic boundaries across all sittings.
Total number of unique segments created: 16687
Average number of segments per sitting: 13.67


Inspect a single sample sitting: plot its windowed similarity scores and mark the detected boundaries.

In [14]:
# Take the sitting that the row selected by key 11223 belongs to and collect
# all utterances of that sitting.
sample_sitting_identifier = AT_preprocessed_df['Sitting_ID'][11223]
in_sample_sitting = AT_preprocessed_df['Sitting_ID'] == sample_sitting_identifier
sample_group = AT_preprocessed_df[in_sample_sitting]
print(f"Number of utterances in this example sitting: {len(sample_group)}")

# Re-run the segmentation steps for this sitting only. The window size and the
# three thresholds are the ones set in the segmentation cell above.
sample_embeddings = sample_group['Embeddings'].tolist()
sample_similarities = calculate_windowed_similarity(sample_embeddings, similarity_window_size)
sample_boundaries = find_boundaries(
    sample_similarities,
    height_threshold=height_val,
    prominence_threshold=prominence_val,
    distance_threshold=distance_val,
)
print(f"Plotting {len(sample_similarities)} similarity scores for this example sitting.")

# One trace for the similarity curve, one for the detected boundary positions.
valid_boundaries = sample_boundaries
similarity_trace = go.Scatter(
    y=sample_similarities,
    mode='lines+markers',
    name='Adjacent Cosine Similarity (Windowed)',
)
boundary_trace = go.Scatter(
    x=valid_boundaries,
    y=sample_similarities[valid_boundaries],
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='Detected Boundaries (Sim)',
)
fig = go.Figure(data=[similarity_trace, boundary_trace])
# Kept as the last expression so the notebook renders the returned figure.
fig.update_layout(
    title=f'Boundaries for Sitting (Sitting_ID):<br>{sample_sitting_identifier}',
    xaxis_title='Speech Index (within sitting)',
    yaxis_title='Cosine Similarity',
)

Number of utterances in this example sitting: 332
Plotting 331 similarity scores for this example sitting.


### Analyze Segment Length Distribution

In [15]:
# Number of utterances in every segment (one entry per Segment_ID).
segment_grouped = AT_preprocessed_df.groupby('Segment_ID')
segment_lengths = segment_grouped.size()

# Summary statistics of the segment sizes.
print("Segment Length Statistics (Similarity-Based - Utterance Count):")
print(segment_lengths.describe())

# --- Cumulative Distribution for Segment Lengths ---
print("\n--- Cumulative Length Distribution for Segments (Utterance Count) ---")
segment_length_values = segment_lengths.values
total_segments_len_dist = len(segment_length_values)

if total_segments_len_dist == 0:
    print("No segment data to calculate cumulative utterance count distribution.")
else:
    # Percentile table: each row reads "p% of segments have at most X utterances".
    percentiles_to_show = np.array([50, 70, 80, 90, 95, 99])
    utterance_count_thresholds = np.percentile(segment_length_values, percentiles_to_show)

    print("Utterance Count Threshold <= X: % of Segments")
    for p, uc_thresh in zip(percentiles_to_show, utterance_count_thresholds):
        print(f"<= {uc_thresh:.0f} utterances: {p:.2f}%")

    # Two-column frame (Segment_ID, Utterance_Count) as input for the histogram,
    # which applies its own binning.
    segment_lengths_df = segment_lengths.reset_index(name='Utterance_Count')

    fig_cumulative_hist_utterances = px.histogram(
        segment_lengths_df,
        x='Utterance_Count',
        nbins=20,
        title='Cumulative Distribution of Segment Lengths (Utterance Count)',
        labels={'Utterance_Count': 'Number of Utterances per Segment'},
        cumulative=True,
        histnorm='percent',
    )
    fig_cumulative_hist_utterances.update_layout(yaxis_title='Cumulative Percentage of Segments')
    fig_cumulative_hist_utterances.show()


Segment Length Statistics (Similarity-Based - Utterance Count):
count    16687.000000
mean        10.193324
std          5.111805
min          1.000000
25%          7.000000
50%          9.000000
75%         13.000000
max         48.000000
dtype: float64

--- Cumulative Length Distribution for Segments (Utterance Count) ---
Utterance Count Threshold <= X: % of Segments
<= 9 utterances: 50.00%
<= 12 utterances: 70.00%
<= 14 utterances: 80.00%
<= 17 utterances: 90.00%
<= 20 utterances: 95.00%
<= 26 utterances: 99.00%


In [16]:
# Aggregate 'Processed_Text' for each segment: the texts of all utterances in a
# segment are joined with single spaces.
segment_text_aggregated_df = AT_preprocessed_df.groupby('Segment_ID').agg(
    Aggregated_Text=('Processed_Text', ' '.join)
).reset_index()

# Approximate each segment's token count by its whitespace-separated word count.
# NOTE(review): unlike 'Speech_Token_Count', no 1.33 words-to-tokens factor is
# applied here, so these figures are plain word counts -- confirm this is the
# intended basis for comparing against the token limits below.
segment_text_aggregated_df['Token_Count'] = segment_text_aggregated_df['Aggregated_Text'].apply(
    lambda x: len(x.split())
)

# Visualize cumulative distribution of token counts
fig_cumulative_hist_tokens = px.histogram(segment_text_aggregated_df,
                                          x='Token_Count',
                                          nbins=30,
                                          title='Cumulative Distribution of Segment Lengths (Token Count, approx.)',
                                          labels={'Token_Count': 'Number of Tokens per Segment (approx.)'},
                                          cumulative=True,
                                          histnorm='percent')
fig_cumulative_hist_tokens.update_layout(yaxis_title='Cumulative Percentage of Segments')
fig_cumulative_hist_tokens.show()

# Share and number of segments at or above each length threshold.
thresholds = [4096, 8192, 16384]
for threshold in thresholds:
    # Evaluate the comparison once and reuse the boolean mask; the unused
    # "less than" figures of the earlier version were dropped.
    at_or_above_threshold = segment_text_aggregated_df['Token_Count'] >= threshold
    more_than_percentage = at_or_above_threshold.mean() * 100
    more_than_count = at_or_above_threshold.sum()

    print(f"{more_than_percentage:.2f}% ({more_than_count}) of segments have more than or equal to {threshold} tokens (approx. based on word count).")

3.52% (588) of segments have more than or equal to 4096 tokens (approx. based on word count).
0.11% (19) of segments have more than or equal to 8192 tokens (approx. based on word count).
0.01% (1) of segments have more than or equal to 16384 tokens (approx. based on word count).


## BERTopic Model Training on Segments

Now, we will train BERTopic on the text segments identified by the similarity-based approach (`Segment_ID`). We will:
1. Group the DataFrame by `Segment_ID`.
2. Concatenate the `Processed_Text` for all utterances within each segment.
3. **Re-calculate sentence embeddings for these concatenated segment texts.** This approach allows the embedding model to capture the holistic meaning of the entire segment, rather than just averaging the embeddings of individual utterances.
4. Train a BERTopic model using these aggregated texts and their new embeddings.

allenai/led-base-16384

In [17]:
# import pandas as pd
# import torch
# import numpy as np
# import time
# from tqdm.auto import tqdm
# from sentence_transformers import SentenceTransformer

# # --- Configuration ---
# MODEL_NAME = "BAAI/bge-m3"
# TOKEN_LIMIT = 8192
# CHUNK_OVERLAP = 128
# device = "cuda" if torch.cuda.is_available() else "cpu"

# print(f"Using model: {MODEL_NAME} on device: {device}")

# # --- Load model ---
# model = SentenceTransformer(MODEL_NAME, device=device)
# tokenizer = model.tokenizer
# model.max_seq_length = TOKEN_LIMIT

# # --- Load your data ---
# texts = segment_grouped["Aggregated_Text"].tolist()

# # --- Chunking helper ---
# def chunk_text(text, token_limit, overlap):
#     token_ids = tokenizer.encode(text, add_special_tokens=False)
#     chunks = []
#     start = 0
#     while start < len(token_ids):
#         end = start + token_limit
#         chunk_ids = token_ids[start:end]
#         chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
#         chunks.append(chunk_text)
#         start += token_limit - overlap
#     return chunks

# # --- Embedding logic ---
# def embed_segment(text):
#     # Tokenize and truncate here
#     tokenized = tokenizer(
#         text,
#         truncation=True,
#         max_length=TOKEN_LIMIT - 50,  # small buffer
#         return_tensors="pt"
#     )

#     # Decode back to text from tokenized (safe and truncated!)
#     truncated_text = tokenizer.decode(
#         tokenized["input_ids"][0], skip_special_tokens=True
#     )

#     if tokenized["input_ids"].shape[-1] <= TOKEN_LIMIT:
#         # No chunking needed
#         return model.encode(truncated_text, convert_to_tensor=False)
#     else:
#         # Fallback: chunk the original long text
#         chunks = chunk_text(text, TOKEN_LIMIT, CHUNK_OVERLAP)
#         chunk_embeddings = model.encode(chunks, batch_size=4, convert_to_tensor=False)
#         return np.mean(chunk_embeddings, axis=0)


# # --- Run embedding for all segments ---
# print("\nGenerating embeddings...")
# start = time.time()
# all_embeddings = [embed_segment(text) for text in tqdm(texts)]
# elapsed = time.time() - start
# print(f"\nDone in {elapsed:.2f} seconds.")

# # --- Add to DataFrame ---
# segment_grouped["embedding"] = all_embeddings


In [None]:
# # save segment_grouped to a new pickle file
# segment_grouped.to_pickle(r'data folder\data\AT_segmented_embeddings.pkl')

# Load the per-segment DataFrame (aggregated text plus embeddings) from disk.
# From here on `segment_grouped` is this DataFrame; the name was used earlier
# for a groupby object. The path is built with pathlib instead of hard-coded
# backslashes so it also resolves on non-Windows systems.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
segment_grouped = pd.read_pickle(Path('data folder') / 'data' / 'AT_segmented_embeddings.pkl')


In [19]:
# Domain-specific stop words: forms of address, procedural vocabulary and other
# filler terms that should not appear among the topic keywords.
custom_stopwords = [
    'mr', 'mrs', 'ms', 'madam', 'honourable', 'member', 'members', 'vp', 'sp', 'fp', 'today', 'time',
    'minister', 'speaker', 'deputy', 'president', 'chairman', 'chair', 'schilling', 
    'secretary', 'lord', 'lady', 'question', 'order', 'point', 'debate',
    'motion', 'amendment', 'hear', 'minutes', 'speaking', 'close', 'abg',
    'congratulations', 'congratulate', 'thanks', 'thank', 'vp', 'say', 'sp', 'one', 'want',
    'know', 'think', 'believe', 'see', 'go', 'come', 'give', 'take',
    'people', 'federal', 'government', 'austria', 'austrian', 'committee', 'call', 'said',
    'already', 'committee', 'para', 'please', 'request', 'proceed', 'reading', 'people',
    'course', 'mag', 'welcome', 'council', 'open', 'written', 'contain', 'items', 'item',
    'peter', 'jonah', 'auer', 'vow', 'yes', 'no', 'following', 'orf', 'wait', 'ing',
    'next', 'speech', 'year', 'years', 'state',
    'also', 'would', 'like', 'may', 'must', 'upon', 'indeed', 'item',
    'session', 'meeting', 'report', 'commission', 'behalf', 'gentleman', 'gentlemen', 'ladies', 'thank',
    'applause', 'group', 'colleague', 'colleagues', 'issue', 'issues',
    'chancellor', 'court', 'ask', 'answer', 'reply', 'regard', 'regarding',
    'regards', 'respect', 'respectfully', 'sign', 'shall', 'procedure', 'declare'
]

# The literal above repeats a few entries. Deduplicate and sort so the list has
# the same order on every run (a bare list(set(...)) has no stable order).
custom_stopwords = sorted(set(custom_stopwords))
print(f"Defined {len(custom_stopwords)} unique custom stop words.")

Defined 121 unique custom stop words.


In [20]:
from bertopic import BERTopic
from sklearn.cluster import KMeans
import hdbscan # Import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text # Import text module for stop words
import time # time might be needed if re-timing specific parts
import numpy as np

# Inputs expected from earlier cells: `segment_grouped` (a DataFrame with the
# 'Aggregated_Text' and 'embedding' columns used below) and `custom_stopwords`.

# --- Clustering Model Definitions ---
# Option 1: KMeans
num_segment_topics_kmeans = 12 # Target number of topics for KMeans
kmeans_model = KMeans(n_clusters=num_segment_topics_kmeans, random_state=42, n_init=10)

# Option 2: HDBSCAN
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=30,
                                min_samples=30, 
                                metric='euclidean', 
                                cluster_selection_method='eom', 
                                prediction_data=True)

# --- Choose Clustering Model ---
# Exactly one of the two assignments below should be active:
chosen_clustering_model = kmeans_model # KMeans is the active choice
# chosen_clustering_model = hdbscan_model # Alternative: density-based clustering with HDBSCAN
print(f"Using clustering model: {type(chosen_clustering_model).__name__}")

# Define CountVectorizer with combined English and custom stop words
# Fetch default English stop words and combine with custom list
default_stopwords = text.ENGLISH_STOP_WORDS
combined_stopwords = list(default_stopwords.union(custom_stopwords)) # Union removes words present in both lists
print(f"Using CountVectorizer with {len(combined_stopwords)} combined stop words.")
vectorizer_model = CountVectorizer(stop_words=combined_stopwords)

# Initialize BERTopic
# Document embeddings are pre-calculated and handed to fit_transform below.
# The chosen clustering model goes into BERTopic's `hdbscan_model` slot.
# nr_topics is left unset, so no topic reduction is requested here.
# top_n_words sets how many keywords are kept per topic.
segment_topic_model = BERTopic(
    embedding_model=None, # Embeddings are pre-calculated
    umap_model=None,      # NOTE(review): None presumably selects BERTopic's default UMAP rather than skipping dimensionality reduction -- confirm against the BERTopic docs
    hdbscan_model=chosen_clustering_model, # Pass the selected model
    vectorizer_model=vectorizer_model, # Use CountVectorizer with combined stop words
    # Optional: uncomment the next line to request a fixed number of topics.
    # nr_topics=num_segment_topics_kmeans, # Only set nr_topics if using KMeans or similar fixed-cluster model
    top_n_words=8, # Keep 8 keywords per topic
    verbose=False
)

# nr_topics is not passed above, for either clustering option; with KMeans the
# number of clusters comes from n_clusters on the KMeans model itself.

# Train the model
start_time_segment_topic = time.time() # Start timer before training
segment_topics, _ = segment_topic_model.fit_transform(segment_grouped['Aggregated_Text'].tolist(), embeddings=np.array(segment_grouped['embedding'].tolist()))

print(f"BERTopic training on segments finished in {time.time() - start_time_segment_topic:.2f} seconds.")

# Add topic assignments to the segment DataFrame
segment_grouped['Segment_Topic'] = segment_topics

segment_topic_info = segment_topic_model.get_topic_info()

# --- Calculate and Print Outlier Percentage ---
# Only runs when the HDBSCAN option is chosen; outliers are the rows of the
# topic-info table whose 'Topic' value is -1.
if isinstance(chosen_clustering_model, hdbscan.HDBSCAN):
    total_docs_in_topic_info = segment_topic_info['Count'].sum() 
    outlier_row = segment_topic_info[segment_topic_info['Topic'] == -1]
    
    if not outlier_row.empty:
        outlier_count = outlier_row['Count'].iloc[0]
        if total_docs_in_topic_info > 0:
            # Share of all counted documents that fell into topic -1.
            outlier_percentage = (outlier_count / total_docs_in_topic_info) * 100
            print(f"\nHDBSCAN Outlier Analysis:")
            print(f"  - Total documents/segments: {total_docs_in_topic_info}")
            print(f"  - Outlier count: {outlier_count}")
            print(f"  - Percentage of outliers: {outlier_percentage:.2f}%")
        else:
            print("\nHDBSCAN Outlier Analysis: Total documents in topic info is 0, cannot calculate outlier percentage.")
    else:
        print("\nHDBSCAN Outlier Analysis: No outlier topic (-1) found.")
else:
    print("\nOutlier analysis is specific to HDBSCAN and was not performed or chosen model is not HDBSCAN.")

Using clustering model: KMeans
Using CountVectorizer with 423 combined stop words.
BERTopic training on segments finished in 46.98 seconds.

Outlier analysis is specific to HDBSCAN and was not performed or chosen model is not HDBSCAN.


In [21]:
import openai
import os # Added
from dotenv import load_dotenv 

# --- Load .env file from parent directory ---
# The file is looked up one directory above the current working directory; a
# missing file is only reported, not treated as an error.
dotenv_path = os.path.join(os.pardir, '.env')
if not os.path.exists(dotenv_path):
    print(f".env file not found at: {dotenv_path}. Ensure OPENAI_API_KEY is set in your environment or .env file.")
else:
    load_dotenv(dotenv_path)
    print(f"Loaded .env file from: {dotenv_path}")
# You can optionally check if the key is loaded, but be careful not to print the key itself
# print(f"OPENAI_API_KEY is set: {bool(os.getenv('OPENAI_API_KEY'))}")


# --- LLM Topic Name Generation using OpenAI API ---

# --- Updated generate_topic_name_with_llm function (Keywords Only) ---
def generate_topic_name_with_llm(keywords_list):
    """Ask the OpenAI chat API for a short topic label built from keywords only.

    Args:
        keywords_list: List of keyword strings for one topic. They are joined
            with ", " in the given order; the prompt asks the model to weight
            earlier keywords slightly more.

    Returns:
        The generated name with double quotes removed and newlines replaced by
        spaces, or "" when the input is not a list of strings, the list is
        empty, OPENAI_API_KEY is not set, or the API call raises.
    """
    is_string_list = isinstance(keywords_list, list) and all(
        isinstance(keyword, str) for keyword in keywords_list
    )
    if not is_string_list:
        print("Error: keywords_list must be a list of strings.")
        return ""
    if len(keywords_list) == 0:
        # Nothing to summarise; skip the API call without printing anything
        return ""

    joined_keywords = ', '.join(keywords_list)

    # Prompt built from the keywords alone (no representative documents)
    prompt = (f"""Keywords: {joined_keywords}\n\n
    Generate a concise vague topic name(1 to 3 words, not counting 'and') pointing in the direction of the agenda of the parliamentary debate using these keywords.
    Give slightly more weight to earlier keywords compared to later ones. Don't use words reform, security, allocation, strategy. Output only the name.""")

    chat_messages = [
        {"role": "system", "content": "You are an assistant skilled at summarizing topics from keywords related to parliamentary proceedings."},
        {"role": "user", "content": prompt},
    ]

    try:
        if not os.getenv('OPENAI_API_KEY'):
            print("Error: OPENAI_API_KEY not set. Cannot make API call.")
            return ""
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=0.5,
            max_tokens=25,
        )
        raw_name = response.choices[0].message.content
        # Strip surrounding whitespace, remove double quotes, flatten newlines
        flattened_name = raw_name.strip().replace('"', '').replace('\n', ' ')
        return flattened_name.strip()
    except Exception as e:
        # Best effort: any failure is reported and yields an empty name
        print(f"Error calling OpenAI API: {e}")
        return ""

# --- Name every topic with the LLM (one call per row of segment_topic_info) ---
print("\nGenerating topic names using OpenAI API (keywords only)...")

# Only the 'Representation' column (the keyword list) is needed, so apply the
# function over that column instead of iterating whole rows with axis=1.
llm_names = segment_topic_info['Representation'].apply(generate_topic_name_with_llm)

# generate_topic_name_with_llm returns "" on any failure (bad input, missing
# API key, API error). A blank name would otherwise end up as an empty label
# wherever LLM_Name is used later, so fall back to the 'Name' column of the
# topic table for those rows and report how many were affected.
llm_name_missing = llm_names.eq("")
if llm_name_missing.any():
    print(
        f"Warning: {int(llm_name_missing.sum())} topic(s) received no LLM name; "
        "falling back to the 'Name' column for those topics."
    )
segment_topic_info['LLM_Name'] = llm_names.mask(llm_name_missing, segment_topic_info['Name'])

Loaded .env file from: ..\.env

Generating topic names using OpenAI API (keywords only)...


### Merge the 2 datasets

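The topic names were generated programmatically with an LLM (OpenAI API) from each topic's keywords, using a prompt framed around parliamentary debates.
The `LLM_Name` column has been added to `segment_topic_info`. The next cell merges the segment-level topic assignments (`segment_grouped`) back into the speech-level DataFrame (`AT_preprocessed_df`).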

In [22]:
# 1. Merge Segment Topics back to the original DataFrame
# Only the key and the assigned topic are needed from segment_grouped.
segment_topic_map = segment_grouped[['Segment_ID', 'Segment_Topic']]

# - Drop any 'Segment_Topic' left over from a previous run of this cell; merging
#   again would otherwise create 'Segment_Topic_x' / 'Segment_Topic_y' columns.
# - validate='many_to_one' makes pandas raise if a Segment_ID appears more than
#   once in segment_topic_map, instead of silently duplicating speech rows.
AT_preprocessed_df = (
    AT_preprocessed_df
    .drop(columns=['Segment_Topic'], errors='ignore')
    .merge(segment_topic_map, on='Segment_ID', how='left', validate='many_to_one')
)
print(f"Merged segment topics. DataFrame shape: {AT_preprocessed_df.shape}")

# 2. Prepare Timestamps
# Parse 'Date'; values that cannot be parsed become NaT (errors='coerce').
AT_preprocessed_df['Timestamp'] = pd.to_datetime(AT_preprocessed_df['Date'], errors='coerce')

# Keep only rows that have both a valid timestamp and a segment topic,
# and report how many rows were removed.
original_count_dtm = AT_preprocessed_df.shape[0]
AT_preprocessed_df.dropna(subset=['Timestamp', 'Segment_Topic'], inplace=True)
dropped_count_dtm = original_count_dtm - AT_preprocessed_df.shape[0]
print(f"Dropped {dropped_count_dtm} rows due to invalid dates or missing segment topics (if any).")

# With the missing values gone, the topic ids can be stored as integers.
AT_preprocessed_df['Segment_Topic'] = AT_preprocessed_df['Segment_Topic'].astype(int)

# 3. Prepare lists for topics_over_time
# The year of each speech is used as its timestamp.
AT_preprocessed_df['Year'] = AT_preprocessed_df['Timestamp'].dt.year
timestamps_seg, docs_seg, topics_seg = (
    AT_preprocessed_df[column_name].tolist()
    for column_name in ('Year', 'Processed_Text', 'Segment_Topic')
)

print(f"Prepared {len(docs_seg)} documents, {len(timestamps_seg)} timestamps, and {len(topics_seg)} topics for DTM.")

# 4. Build the topic-id -> LLM-generated-name lookup from segment_topic_info
topic_id_to_name_map = segment_topic_info.set_index('Topic')['LLM_Name'].to_dict()
print("Created topic ID to name map using LLM_Name column.")


Merged segment topics. DataFrame shape: (170096, 22)
Dropped 106586 rows due to invalid dates or missing segment topics (if any).
Prepared 63510 documents, 63510 timestamps, and 63510 topics for DTM.
Created topic ID to name map using LLM_Name column.


In [23]:
print("\nStarting Dynamic Topic Modeling (Segment Topics Over Time)...")
start_time_dtm_seg = time.time()

# Options forwarded to topics_over_time; adjust nr_bins as needed
# (e.g., number of years or periods).
dtm_options = dict(global_tuning=True, evolution_tuning=True, nr_bins=20)

# Speech-level documents, their assigned segment topics and their years,
# as prepared in the previous cell.
segment_topics_over_time = segment_topic_model.topics_over_time(
    docs=docs_seg,
    topics=topics_seg,
    timestamps=timestamps_seg,
    **dtm_options,
)

elapsed_dtm_seg = time.time() - start_time_dtm_seg
print(f"Dynamic Topic Modeling finished in {elapsed_dtm_seg:.2f} seconds.")


Starting Dynamic Topic Modeling (Segment Topics Over Time)...
Dynamic Topic Modeling finished in 16.62 seconds.


### Visualize Dynamic Topics (Subplots)

To examine the evolution of each topic individually, we can create a grid of subplots, where each subplot shows the frequency trend for a single topic over time.

In [24]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math

print("Generating subplot visualization for Segment Topics Over Time...")

# Topic IDs present in the DTM results, excluding the outlier topic -1 if present
seg_topic_ids = sorted(topic for topic in segment_topics_over_time['Topic'].unique() if topic != -1)
num_topics = len(seg_topic_ids)

print(f"Found {num_topics} topics to plot.")

# Number of speeches (rows of AT_preprocessed_df) per topic
speech_counts_per_topic = AT_preprocessed_df.groupby('Segment_Topic').size()

# Grid size: fixed number of columns, as many rows as needed (at least one)
cols = 3
rows = math.ceil(num_topics / cols) if num_topics > 0 else 1

# Subplot titles: topic name plus speech count.
# `or` (rather than a .get default) also covers names stored as "" — which is
# what the LLM naming function returns on failure — so no subplot is untitled.
subplot_titles_seg = []
for topic_id in seg_topic_ids:
    topic_name = topic_id_to_name_map.get(topic_id) or f"Topic {topic_id}"
    topic_speech_count = speech_counts_per_topic.get(topic_id, 'N/A')
    subplot_titles_seg.append(f"{topic_name}<br>(Speeches: {topic_speech_count})")

# make_subplots rejects vertical_spacing above 1 / (rows - 1), so the original
# fixed 0.15 fails once the grid reaches 8 rows. Cap the spacing so the gaps
# never take more than half of the figure height; grids of up to 4 rows keep 0.15.
subplot_vertical_spacing = min(0.15, 0.5 / (rows - 1)) if rows > 1 else 0.15

fig_subplots_seg = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=subplot_titles_seg,
    shared_xaxes=True,
    vertical_spacing=subplot_vertical_spacing,
)

# One line trace per topic, filled row by row
if num_topics > 0:
    for i, topic_id in enumerate(seg_topic_ids):
        row = (i // cols) + 1
        col = (i % cols) + 1

        topic_data = segment_topics_over_time[segment_topics_over_time['Topic'] == topic_id]

        fig_subplots_seg.add_trace(
            go.Scatter(
                x=topic_data['Timestamp'],
                y=topic_data['Frequency'],
                mode='lines+markers',
                name=f"Topic {topic_id}",
            ),
            row=row, col=col,
        )
        fig_subplots_seg.update_xaxes(title_text="Year", row=row, col=col)
        fig_subplots_seg.update_yaxes(title_text="Frequency", row=row, col=col)

    fig_subplots_seg.update_layout(
        title_text="Frequency of Each Topic Over Time Segments with Total Speech Counts",
        height=350 * rows + 90,  # scale the figure height with the number of rows
        showlegend=False,
    )
    fig_subplots_seg.show()
else:
    print("No topics (excluding -1) to visualize for DTM.")

Generating subplot visualization for Segment Topics Over Time...
Found 12 topics to plot.


In [25]:
# Drop helper columns that are not wanted in the exported dataset:
# Timestamp, Title, Year, Word_Count, Embeddings (missing ones are ignored).
AT_preprocessed_df.drop(columns=['Timestamp', 'Title', 'Year', 'Word_Count', 'Embeddings'], inplace=True, errors='ignore')

# Replace the numeric Segment_Topic with the topic name from
# 'topic_id_to_name_map' (created in a previous cell) and rename the column.
# The guard keeps the cell re-runnable: after the first run the column is
# already called 'Topic', and mapping 'Segment_Topic' again would raise KeyError.
if 'Segment_Topic' in AT_preprocessed_df.columns:
    AT_preprocessed_df['Segment_Topic'] = AT_preprocessed_df['Segment_Topic'].map(topic_id_to_name_map)

    # Rename the column for clarity
    AT_preprocessed_df.rename(columns={'Segment_Topic': 'Topic'}, inplace=True)
else:
    print("Column 'Segment_Topic' not found (already renamed to 'Topic'?); skipping topic-name mapping.")

In [None]:
# Save the speech-level DataFrame (now carrying the 'Topic' column) as a pickle.
# Raw string: the path contains backslashes, and '\d' / '\A' are invalid escape
# sequences in a normal string literal (a warning on recent Python versions).
# The resulting path is unchanged and matches the raw-string style used for loading.
AT_preprocessed_df.to_pickle(r'data folder\data\AT_with_topics.pkl')

# Optional CSV export
# AT_preprocessed_df.to_csv(r'data folder\data\AT_with_topics.csv', index=False)

In [27]:
# Preview the first two rows of the final speech-level table
AT_preprocessed_df.head(n=2)

Unnamed: 0,Text,Processed_Text,Sitting_ID,Speech_ID,Date,Speaker_role,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Is_Short_Speech,Segment_ID,Topic
0,Please take a seat. – I also ask the photograp...,seat photographer stop activity attend first c...,ParlaMint-AT-en_1996-01-15-020-XX-NRSITZ-00001,ParlaMint-AT_1996-01-15-020-XX-NRSITZ-00001_d7...,1996-01-15,Chairperson,MP,notMinister,SPÖ,Parliamentary group of the Social Democratic P...,Coalition,Centre-left,PAD_00334,"Fischer, Heinz",M,1938,False,ParlaMint-AT-en_1996-01-15-020-XX-NRSITZ-00001...,Party and Legislative Inquiry
1,They will be vowed unbreakable fidelity of the...,vowed unbreakable fidelity republic constant f...,ParlaMint-AT-en_1996-01-15-020-XX-NRSITZ-00001,ParlaMint-AT_1996-01-15-020-XX-NRSITZ-00001_d7...,1996-01-15,Regular,MP,notMinister,SPÖ,Parliamentary group of the Social Democratic P...,Coalition,Centre-left,PAD_01153,"Parfuss, Ludmilla",F,1942,False,ParlaMint-AT-en_1996-01-15-020-XX-NRSITZ-00001...,Party and Legislative Inquiry


In [28]:
# 'Count' in the topic table counts segments, so rename it to 'Segment_Count',
# and add 'Speech_Count' from speech_counts_per_topic (0 where a topic has no entry).
segment_topic_info = (
    segment_topic_info
    .rename(columns={'Count': 'Segment_Count'})
    .assign(
        Speech_Count=lambda info: info['Topic'].map(speech_counts_per_topic).fillna(0).astype(int)
    )
)
segment_topic_info

Unnamed: 0,Topic,Segment_Count,Name,Representation,Representative_Docs,LLM_Name,Speech_Count
0,0,867,0_social_woman_pension_health,"[social, woman, pension, health, care, family,...",[questioner jank house whether matter decision...,Social and Family Welfare,9149
1,1,791,1_tax_budget_country_financial,"[tax, budget, country, financial, need, law, p...",[bank regret stadler actually wanted perfect b...,Tax and Budget Needs,8683
2,2,770,2_farmer_energy_law_agriculture,"[farmer, energy, law, agriculture, area, prote...",[two actual correction case refer provision ru...,Farmer and Energy Legislation,8115
3,3,630,3_european_country_right_europe,"[european, country, right, europe, policy, for...",[wish good morning national official protocol ...,European Policy Direction,6586
4,4,594,4_law_right_case_police,"[law, right, case, police, criminal, justice, ...",[parliament adopted legislative resolution spe...,Law and Rights,6434
5,5,550,5_liberal_policy_tax_social,"[liberal, policy, tax, social, law, economic, ...",[khol representative party clearly clearly dis...,Liberal Economic Policy,5704
6,6,449,6_communication_allocation_national_distributed,"[communication, allocation, national, distribu...",[national reported prevented gisela worm entry...,National Communication Framework,471
7,7,426,7_party_right_law_house,"[party, right, law, house, political, green, i...",[bank high house grastaur home party make keep...,Party and Legislative Inquiry,4466
8,8,426,8_school_education_university_student,"[school, education, university, student, teach...",[warmly place national convened therefore deem...,Education and Learning,4625
9,9,375,9_need_euro_work_law,"[need, euro, work, law, green, health, importa...",[week crisis lot doctor lot dealing disease lo...,Euro and Green Health Crisis,3863
