In [4]:
#!/usr/bin/env python
# coding: utf-8
"""
BERTopic with Zero-Shot Pre-filtering (v9.4)

This script implements a two-stage process:

1. **Filtering:** First, a zero-shot classification model (facebook/bart-large-mnli) is used to select only those tweets that match a given topic ("a positive vision of the future").

2. **Topic Modeling:** Then, BERTopic is applied to the filtered, more relevant dataset to create high-quality topics, guided by a comprehensive, refined, and non-overlapping set of seed topics.
"""

import os
import re
import csv
import json
import logging
import torch
from collections import Counter
import pandas as pd
import numpy as np
from tqdm import tqdm

# Set up tqdm for pandas to see progress in apply
tqdm.pandas()

# Import pipeline from transformers
from transformers import pipeline

# Import matplotlib for plotting the histogram
import matplotlib
matplotlib.use('Agg')  # Use a non-interactive backend
import matplotlib.pyplot as plt

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.preprocessing import normalize

# ------------------ 1. Configuration & Setup ------------------

# --- Input / output locations ---
in_path = "mocked_dataset_generated_by_chatgpt.csv"
out_dir = "bertopic_with_zeroshot_chatgpt"
os.makedirs(out_dir, exist_ok=True)
log_path = os.path.join(out_dir, "pipeline.log")

# --- Root logger: drop any pre-existing handlers, then log to file + console ---
root = logging.getLogger()
root.setLevel(logging.INFO)
# Iterate over a copy because removeHandler() mutates root.handlers.
for h in root.handlers[:]:
    root.removeHandler(h)

# One shared format for both sinks; the log file is overwritten on every run.
fmt = logging.Formatter("%(asctime)s | %(levelname)s | %(name)s | %(message)s")
fh = logging.FileHandler(log_path, mode="w", encoding="utf-8")
sh = logging.StreamHandler()
for handler in (fh, sh):
    handler.setFormatter(fmt)
    root.addHandler(handler)

# --- Per-library verbosity ---
# NOTE(review): logger names are case-sensitive, and the library's own progress
# lines in a captured run are tagged "BERTopic" - confirm that "bertopic" is the
# logger this line is meant to configure.
logging.getLogger("bertopic").setLevel(logging.INFO)
logging.getLogger("transformers").setLevel(logging.WARNING)

logging.info("Starting the BERTopic pipeline with zero-shot pre-filtering (v9).")

# --- Modeling Parameters ---
# Random state handed to UMAP so the dimensionality reduction is repeatable.
SEED = 100
# Batch size shared by the zero-shot classifier and the sentence-embedding encoder.
BATCH_SIZE = 64
# Used both as HDBSCAN's min_cluster_size and as BERTopic's min_topic_size.
# NOTE(review): a captured run clustered only 432 documents and found 2 topics;
# a minimum size of 100 may be too coarse for datasets that small - confirm.
MIN_TOPIC_SIZE = 100

# --- Zero-Shot Filtering Parameters ---
# Hugging Face model id loaded into the zero-shot-classification pipeline.
ZERO_SHOT_MODEL = "facebook/bart-large-mnli"
# The single candidate label every tweet is scored against.
CANDIDATE_LABEL = "a positive vision of the future"
SCORE_THRESHOLD = 0.75  # Confidence threshold for selecting tweets

# --- Seed Topics List ---
# Passed to BERTopic as seed_topic_list; each inner list is one group of seed
# phrases. The roman-numeral headings below are organisational only.
SEED_TOPICS = [
    # I. Economy & management
    ["circular economy", "reuse", "repair", "recycle"],
    ["wellbeing economy", "steady state", "sufficiency"],
    ["sustainable finance", "impact investing", "green bonds"],
    ["post growth", "less consumption"],

    # II. Governance, rights & justice
    ["indigenous rights", "land rights", "self determination"],
    ["community leadership", "local decisions", "shared management"],
    ["rights of nature", "legal rights for nature"],
    ["global goals", "policy reform", "climate treaty"],

    # III. Human–nature relationship
    ["restore nature", "habitat restoration"],
    ["protect wildlife", "biodiversity"],
    ["healthy planet and people", "one health"],
    ["values shift", "nature connection", "stewardship"],

    # IV. Culture
    ["traditions", "festivals", "spiritual connection", "arts and culture", "creative arts", "storytelling"],

    # V. Sector & application areas
    ["urban nature", "green streets", "city parks"],
    ["food and farming", "agroecology", "local food"],
    ["clean energy", "wind power", "solar power"],
    ["public transport", "cycling", "electric vehicles"],
    ["digital technology", "ai for nature"],
    ["tourism", "nature travel"],
    ["fashion", "textiles", "upcycling"],
    ["mining", "minerals", "materials"],
    ["education", "schools", "training"],
]

# ------------------ 2. Data Loading ------------------
logging.info(f"Loading data from {in_path}")
# Tolerant read: malformed rows are skipped and undecodable bytes are replaced
# instead of aborting the whole load.
df = pd.read_csv(
    in_path,
    engine="python",
    on_bad_lines="skip",
    encoding="utf-8",
    encoding_errors="replace",
)

# The rest of the pipeline reads the tweet body from the 'text' column.
if "text" not in df:
    raise ValueError("Input CSV must contain a 'text' column.")

# Keep only rows that actually have text.
df = df.dropna(subset=["text"]).copy()
logging.info(f"Loaded {len(df):,} initial tweets.")

# ------------------ 3. Stage 1: Zero-Shot Filtering ------------------
# Scores every tweet against CANDIDATE_LABEL, saves all scores plus a histogram
# of their distribution, then keeps only rows scoring >= SCORE_THRESHOLD.

# Fail fast on empty input: there is nothing to classify, and the kept-share
# percentage logged below would otherwise divide by zero. Checking here also
# avoids loading the classifier just to fail afterwards.
if df.empty:
    raise ValueError("No tweets with non-null 'text' to classify.")

logging.info(f"Starting zero-shot classification with model {ZERO_SHOT_MODEL}.")
logging.info(f"Label for classification: '{CANDIDATE_LABEL}'")
logging.info("This stage may take a long time...")

# Device index for the pipeline: 0 when CUDA is available, otherwise -1.
DEVICE_ID = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model=ZERO_SHOT_MODEL, device=DEVICE_ID)

# Using the original, raw text for better classification quality.
# Coerced to str (as the cleaning step does) so a non-string cell cannot reach
# the tokenizer.
sequences = df['text'].astype(str).tolist()
results = classifier(sequences, candidate_labels=[CANDIDATE_LABEL], batch_size=BATCH_SIZE)

# Only one candidate label is supplied, so its score is the first entry.
scores = [res['scores'][0] for res in results]
df['relevance_score'] = scores

# Save the dataframe with ALL scores before filtering
all_scores_path = os.path.join(out_dir, "df_all_tweets_with_scores.csv")
df.to_csv(all_scores_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
logging.info(f"All tweets with their relevance scores have been saved to {all_scores_path}")

# Visualize the score distribution (bins and ticks every 0.05)
logging.info("Creating a histogram of the relevance score distribution...")
fig = plt.figure(figsize=(12, 7))
plt.hist(df['relevance_score'], bins=np.arange(0, 1.05, 0.05), edgecolor='black')
plt.title('Relevance Score Distribution')
plt.xlabel('Relevance Score')
plt.ylabel('Number of Tweets')
plt.xticks(np.arange(0, 1.05, 0.05))
plt.grid(axis='y', alpha=0.75)
histogram_path = os.path.join(out_dir, "relevance_score_distribution.png")
plt.savefig(histogram_path)
# Release the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)
logging.info(f"Histogram saved to {histogram_path}")

# Filter the DataFrame by the threshold
df_filtered = df[df['relevance_score'] >= SCORE_THRESHOLD].copy()
num_kept = len(df_filtered)
num_discarded = len(df) - num_kept
logging.info(f"Filtering complete. Selected {num_kept:,} relevant tweets ({num_kept/len(df):.2%}).")
logging.info(f"Discarded {num_discarded:,} tweets.")

# Save the filtered data with their scores for analysis
filtered_path = os.path.join(out_dir, "df_filtered_with_scores.csv")
df_filtered.to_csv(filtered_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
logging.info(f"Filtered data saved to {filtered_path}")

# ------------------ 4. Preprocessing Filtered Data ------------------
def clean_text(t):
    """Normalise a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order: remove URLs, remove a leading retweet marker, remove
    @mentions, turn '#' into a space (the hashtag word itself is kept),
    replace every character that is not an ASCII letter or whitespace with a
    space, lowercase, and collapse runs of whitespace.

    Args:
        t: The tweet text as a ``str``.

    Returns:
        The cleaned text; an empty string when nothing but noise remains.
    """
    t = re.sub(r"http\S+|www\.\S+", " ", t)
    # The retweet marker is removed in its own pass. Combining it with the
    # mention pattern lets "RT " consume the whitespace the mention pattern
    # relies on, so the handle in "RT @user: ..." would survive.
    t = re.sub(r"^\s*rt\s+", " ", t, flags=re.IGNORECASE)
    # A lookbehind instead of a consumed leading space: matches handles at the
    # start, after whitespace, or after punctuation (".@user"), while leaving
    # e-mail-like "name@host" alone because '@' follows a word character.
    t = re.sub(r"(?<!\w)@\w+", " ", t)
    t = t.replace("#", " ")
    t = re.sub(r"[^A-Za-z\s]", " ", t).lower()
    return re.sub(r"\s+", " ", t).strip()

# Clean every kept tweet, then discard texts that are too short or repeated.
cleaned_series = df_filtered["text"].astype(str).apply(clean_text)
df_filtered["text_clean"] = cleaned_series

# Require at least three whitespace-separated tokens after cleaning.
token_counts = df_filtered["text_clean"].str.split().str.len()
df_filtered = df_filtered.loc[token_counts >= 3].copy()

# Keep the first occurrence of each cleaned text and renumber rows from 0.
df_filtered = df_filtered.drop_duplicates(subset="text_clean", ignore_index=True)

docs = df_filtered["text_clean"].tolist()
N = len(docs)
logging.info(f"Preprocessing complete. Total documents for topic modeling: {N:,}")

# ------------------ 5. Stage 2: Topic Modeling ------------------
# Builds the individual components that are later handed to BERTopic.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
logging.info(f"Using device: {DEVICE}")

# Sentence encoder used to embed the cleaned documents.
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(EMB_MODEL_NAME, device=DEVICE)

# scikit-learn's English stop words plus tweet-specific noise tokens,
# sorted so the list order is deterministic.
custom_stop = {"rt", "amp", "https", "co", "t", "via", "im", "dont", "cant", "u"}
stopwords_list = sorted(ENGLISH_STOP_WORDS | custom_stop)

# Unigrams and bigrams made of tokens with at least two ASCII letters.
# NOTE(review): min_df is evaluated on whatever documents BERTopic feeds the
# vectorizer - confirm it behaves as intended when only a few topics exist.
vectorizer_model = CountVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b",
    stop_words=stopwords_list,
    ngram_range=(1, 2),
    min_df=2,
)

representation_model = KeyBERTInspired()

# Dimensionality reduction: cosine distance in, 10 components out, seeded.
umap_model = UMAP(
    metric="cosine",
    n_components=10,
    n_neighbors=15,
    min_dist=0.0,
    random_state=SEED,
)

# Density clustering on the reduced vectors using "leaf" cluster selection.
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=MIN_TOPIC_SIZE,
    min_samples=10,
    cluster_selection_method="leaf",
    prediction_data=True,
)

# Embed the cleaned documents once up front; the precomputed matrix is passed
# to fit_transform below.
logging.info("Generating embeddings for the filtered documents...")
embeddings_all = sentence_model.encode(
    docs,
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
logging.info(f"Embeddings generated with shape: {embeddings_all.shape}")

logging.info("--- Starting topic modeling on the filtered data ---")
# Assemble BERTopic from the pre-built components; SEED_TOPICS guides the model
# via seed_topic_list.
# NOTE(review): calculate_probabilities=True is requested, but the second
# return value of fit_transform is discarded below - confirm it is still needed.
topic_model = BERTopic(
    embedding_model=sentence_model,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    min_topic_size=MIN_TOPIC_SIZE,
    nr_topics=None,
    top_n_words=10,
    language="english",
    calculate_probabilities=True,
    verbose=True,
    seed_topic_list=SEED_TOPICS
)

topics, _ = topic_model.fit_transform(docs, embeddings=embeddings_all)
# Label -1 marks outlier documents rather than a real topic, so it is left out
# of the reported topic count.
num_topics = len(set(topics) - {-1})
logging.info(f"Modeling complete. Found {num_topics} topics.")

# ------------------ 6. Saving Results ------------------
# Per-topic summary table produced by the fitted model.
topic_info_path = os.path.join(out_dir, "topic_info.csv")
topic_info = topic_model.get_topic_info()
topic_info.to_csv(topic_info_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
logging.info(f"Topic info saved to {topic_info_path}")

# Attach each document's assigned topic to its row and persist the full table.
final_predictions_path = os.path.join(out_dir, "df_with_final_predictions.csv")
df_filtered["Topic"] = topics
df_filtered.to_csv(final_predictions_path, index=False, quoting=csv.QUOTE_NONNUMERIC)
logging.info(f"Final predictions with original data saved to {final_predictions_path}")

# Topic sizes, ignoring the -1 label; 0 is reported when no topic remains.
counts = Counter(t for t in topics if t != -1)
max_topic_size = max(counts.values(), default=0)
logging.info(f"Total topics (excluding -1): {len(counts)}; Largest topic size: {max_topic_size:,}.")

# Record the key settings of this run next to its outputs.
run_params = dict(
    embedding_model=EMB_MODEL_NAME,
    zero_shot_model=ZERO_SHOT_MODEL,
    zero_shot_label=CANDIDATE_LABEL,
    zero_shot_threshold=SCORE_THRESHOLD,
    min_topic_size=MIN_TOPIC_SIZE,
)
run_params_path = os.path.join(out_dir, "run_params.json")
with open(run_params_path, "w", encoding="utf-8") as f:
    json.dump(run_params, f, ensure_ascii=False, indent=4)

logging.info("Pipeline finished successfully.")


2025-09-24 08:18:51,014 | INFO | root | Starting the BERTopic pipeline with zero-shot pre-filtering (v9).
2025-09-24 08:18:51,016 | INFO | root | Loading data from mocked_dataset_generated_by_chatgpt.csv
2025-09-24 08:18:51,049 | INFO | root | Loaded 1,000 initial tweets.
2025-09-24 08:18:51,051 | INFO | root | Starting zero-shot classification with model facebook/bart-large-mnli.
2025-09-24 08:18:51,052 | INFO | root | Label for classification: 'a positive vision of the future'
2025-09-24 08:18:51,054 | INFO | root | This stage may take a long time...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
2025-09-24 08:35:26,257 | INFO | root | All tweets with their relevance scores have been saved to bertopic_with_zeroshot_chatgpt/df_all_tweets_with_scores.csv
2025-09-24 08:35:26,261 | INFO | root | Creating a histogram of the relevance score distribution...
2025-09-24 08:35:26,716 | INFO | root | Histogram saved to bertopic_with_zeroshot_chatgpt/relevance_score_distribution.png
2025-09-24 08:35:26,731 | INFO | root | Filtering complete. Selected 462 relevant tweets (46.20%).
2025-09-24 08:35:26,732 | INFO | root | Discarded 538 tweets.
2025-09-24 08:35:26,743 | INFO | root | Filtered data saved to bertopic_with_zeroshot_chatgpt/df_filtered_with_scores.csv
2025-09-24 08:35:26,778 | INFO | root | Preprocessing complete. Total documents for topic modeling: 432
2025-09-24 08:35:26,780 | INFO | root | Using device: cpu
2025-09-24 08:35:26,792 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-09-24 08:35:30,086 | INFO | root | Generating embeddings for the filtered documents...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2025-09-24 08:35:36,733 | INFO | root | Embeddings generated with shape: (432, 384)
2025-09-24 08:35:36,735 | INFO | root | --- Starting topic modeling on the filtered data ---
2025-09-24 08:35:36,760 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-09-24 08:35:37,017 - BERTopic - Guided - Completed ✓
2025-09-24 08:35:37,018 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-24 08:35:48,599 - BERTopic - Dimensionality - Completed ✓
2025-09-24 08:35:48,600 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-24 08:35:48,702 - BERTopic - Cluster - Completed ✓
2025-09-24 08:35:48,712 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-24 08:35:49,379 - BERTopic - Representation - Completed ✓
2025-09-24 08:35:49,411 | INFO | root | Modeling complete. Found 2 topics.
2025-09-24 08:35:49,420 | INFO | root | Topic info saved to bertopic_with_zeroshot_chatgpt/topic_info.csv
2025-09-24 08:35:49,437 | INFO | root | Final predictions with original data saved to bertopic_with_zeroshot_chatgpt/df_with_final_predictions.csv
2025-09-24 08:35:49,439 | INFO | root | Total topics (excluding -1): 2; Largest topic size: 281.
2025-09-24 08:35:49,441 | INFO | roo