In [1]:
# Import required libraries
import json
import pandas as pd
import numpy as np
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')
print("Libraries loaded!")

Libraries loaded!


## 1. Load Data with Pre-computed Embeddings

In [2]:
# Load data with pre-computed embeddings
data = []
with open('data_with_embeddings.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)
df['combined_text'] = (df['title'].fillna('') + " " + df['selftext'].fillna('')).str.strip()
embeddings = np.array(df['embedding'].tolist())

print(f"Loaded {len(df)} posts")
print(f"Embedding shape: {embeddings.shape}")

Loaded 8799 posts
Embedding shape: (8799, 1024)


## 2. Advanced BERTopic Configuration

In [3]:
# Use the same embedding model that was used to create the embeddings
# This is required for representation models to work correctly
embedding_model = "Qwen/Qwen3-Embedding-0.6B"

# Fine-tune the keywords (C-TF-IDF)
# Remove stop words for cleaner topic labels
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Diversity & Keyword Accuracy
# KeyBERTInspired handles 'relevance', MMR handles 'diversity' of words
representation_model = {
    "Main": KeyBERTInspired(),
    "Aspect2": MaximalMarginalRelevance(diversity=0.3)
}

topic_model = BERTopic(
    embedding_model=embedding_model,  # Required for representation models
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    min_topic_size=20,
    nr_topics="auto",  # Automatically merge small/similar topics
    verbose=True
)

print("BERTopic configured!")

BERTopic configured!


In [4]:
# Fit using pre-computed embeddings to save GPU/CPU time
print("Fitting BERTopic (this may take a few minutes)...")

docs = df['combined_text'].tolist()
topics, probs = topic_model.fit_transform(docs, embeddings)

print(f"\nTopics extracted! Found {len(set(topics)) - 1} topics (excluding outliers)")

Fitting BERTopic (this may take a few minutes)...




Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]

2026-02-19 15:54:26,567 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-19 15:54:35,188 - BERTopic - Dimensionality - Completed âœ“
2026-02-19 15:54:35,188 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-19 15:54:35,329 - BERTopic - Cluster - Completed âœ“
2026-02-19 15:54:35,330 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-02-19 15:54:35,597 - BERTopic - Representation - Completed âœ“
2026-02-19 15:54:35,597 - BERTopic - Topic reduction - Reducing number of topics
2026-02-19 15:54:35,606 - BERTopic - Representation - Fine-tuning topics using representation models.


RuntimeError: MPS backend out of memory (MPS allocated: 8.93 GiB, other allocations: 53.08 GiB, max allowed: 63.65 GiB). Tried to allocate 13.27 GiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Add topic information to dataframe
df['topic_id'] = topics

# Get topic labels safely
topic_info = topic_model.get_topic_info()
topic_name_map = dict(zip(topic_info['Topic'], topic_info['Name']))
df['topic_label'] = df['topic_id'].map(topic_name_map)

print("\nTopic distribution:")
print(df['topic_id'].value_counts().head(15))


Topic distribution:
topic_id
 0    8634
 1     106
-1      59
Name: count, dtype: int64


In [None]:
# Display discovered topics
print("\n" + "="*80)
print("                    DISCOVERED TOPICS")
print("="*80)

for idx, row in topic_info.head(15).iterrows():
    if row['Topic'] != -1:  # Skip outlier topic
        print(f"\nTopic {row['Topic']}: {row['Name']}")
        print(f"   Posts: {row['Count']}")


                    DISCOVERED TOPICS

Topic 0: 0_community_efforts_anarchism_thursday
   Posts: 8634

Topic 1: 1_tifa_day link_posting_posted
   Posts: 106


## 3. Optimized Batch Sentiment Analysis

In [None]:
# Load RoBERTa sentiment pipeline
print("Loading Sentiment Pipeline...")

sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=-1,  # Set to 0 if you have a GPU
    truncation=True,
    max_length=512
)

print("Sentiment model loaded!")

Loading Sentiment Pipeline...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.pooler.dense.weight     | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 
roberta.pooler.dense.bias       | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Sentiment model loaded!


In [None]:
# Batch processing for sentiment (2-5x faster than one-by-one)
def get_sentiment_batches(texts, batch_size=32):
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Sentiment Analysis"):
        batch = texts[i:i+batch_size]
        # Handle empty strings
        batch = [t if t.strip() else "neutral" for t in batch]
        results.extend(sentiment_pipe(batch))
    return results

# Analyze title for sentiment (more emotive/dense than full text)
print("Analyzing sentiment...")
sentiment_results = get_sentiment_batches(df['title'].astype(str).tolist())

df['sentiment_label'] = [res['label'] for res in sentiment_results]
df['sentiment_confidence'] = [res['score'] for res in sentiment_results]
df['sentiment_score'] = df['sentiment_label'].map({'negative': -1, 'neutral': 0, 'positive': 1})

print("\nSentiment analysis complete!")

Analyzing sentiment...


Sentiment Analysis:  21%|â–ˆâ–ˆ        | 57/275 [00:26<01:42,  2.13it/s]

In [None]:
# Sentiment distribution
print("\nSentiment distribution:")
print(df['sentiment_label'].value_counts())
print(f"\nAverage sentiment confidence: {df['sentiment_confidence'].mean():.3f}")


Sentiment distribution:
sentiment_label
neutral     5411
negative    2787
positive     601
Name: count, dtype: int64

Average sentiment confidence: 0.738


## 4. Export & Save

In [None]:
# Save enhanced data
output_file = 'data_with_topics_sentiment_v2.jsonl'

print(f"Saving to {output_file}...")
df.to_json(output_file, orient='records', lines=True)

print(f"\nâœ… Saved {len(df)} posts to '{output_file}'")

Saving to data_with_topics_sentiment_v2.jsonl...

âœ… Saved 8799 posts to 'data_with_topics_sentiment_v2.jsonl'


In [None]:
# Save BERTopic model for later use
topic_model.save("bertopic_v2_model", serialization="safetensors", save_ctfidf=True)
print("âœ… BERTopic model saved to 'bertopic_v2_model/'")

âœ… BERTopic model saved to 'bertopic_v2_model/'


In [None]:
# Summary
print("\n" + "="*60)
print("           COMPUTATION SUMMARY")
print("="*60)
print(f"â€¢ Total posts processed: {len(df)}")
print(f"â€¢ Topics discovered: {len(topic_model.get_topic_info())}")
print(f"â€¢ Embedding model: Qwen/Qwen3-Embedding-0.6B")
print(f"â€¢ Sentiment model: cardiffnlp/twitter-roberta-base-sentiment-latest")
print(f"â€¢ Output file: {output_file}")
print(f"â€¢ Model saved: bertopic_v2_model/")
print("="*60)
print("\nðŸŽ‰ Ready for voter_insights_enhanced.ipynb!")


           COMPUTATION SUMMARY
â€¢ Total posts processed: 8799
â€¢ Topics discovered: 48
â€¢ Embedding model: Qwen/Qwen3-Embedding-0.6B
â€¢ Sentiment model: cardiffnlp/twitter-roberta-base-sentiment-latest
â€¢ Output file: data_with_topics_sentiment_v2.jsonl
â€¢ Model saved: bertopic_v2_model/

ðŸŽ‰ Ready for voter_insights_enhanced.ipynb!
