In [1]:
# load the data 
import pandas as pd
df = pd.read_csv('posts_processed1.csv')

# Clustering runs on the pre-cleaned text when that column exists;
# otherwise it falls back to the "text" column.
TEXT_COL = "text"
if "cleaned_text" in df.columns:
    TEXT_COL = "cleaned_text"

# Normalise the text column once: missing values become "" and every entry is a str.
text_series = df[TEXT_COL].fillna("").astype(str)
docs = text_series.tolist()

# Optional filter: very short documents are dropped (helps topic quality).
min_chars = 40
keep_mask = text_series.str.len() >= min_chars
df_small = df.loc[keep_mask].copy()
docs_small = text_series[keep_mask].tolist()
print("done")

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Topic model for clustering. TODO: find a better cybersecurity-specific embedding model.
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

from transformers import pipeline
# Sentence embedding model used by BERTopic to vectorise the posts.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic(embedding_model=embedder, calculate_probabilities=True, verbose=True)

print("fitting topic model...")
topics, probs = topic_model.fit_transform(docs_small)

# Per-post topic assignment.
df_small["topic_id"] = topics

# Per-post confidence: the largest value in each element of `probs`
# (None is kept as None).
topic_prob = [None if p is None else float(p.max()) for p in probs]
df_small["topic_prob"] = topic_prob

# One row per discovered topic (summary table).
topic_info = topic_model.get_topic_info()
print("done")

: 

In [None]:

# Label set offered to the zero-shot classifier.
# Edit or extend it so that it matches the taxonomy used in the paper.
CANDIDATE_LABELS = [
    # workload / fatigue
    "workload and time pressure",
    "on-call fatigue and overtime",
    "sleep disruption and exhaustion",
    "burnout and emotional exhaustion",
    "frustration and helplessness",
    # organisation / leadership
    "organizational constraints and bureaucracy",
    "poor leadership and lack of support",
    # moral and psychological strain
    "moral distress (can't do the right thing due to constraints)",
    "moral injury (betrayal/disillusionment, erosion of purpose/trust)",
    "hero complex (sole responsibility, can't disengage)",
    "imposter syndrome and anxiety",
    # team / career
    "team conflict and communication issues",
    "career stagnation and low recognition",
    # operational stressors
    "incident response stress",
    "tools, process, and technical debt stress",
    "training and skill pressure",
]

# Zero-shot classifier backed by an NLI model.
# NOTE(review): no device is passed here; whether a GPU is picked up
# automatically depends on the installed transformers version — confirm.
zsc = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")
print("classifying topics...")

In [None]:
def build_topic_text(topic_id, n_words=12, n_examples=3):
    """Build the text that is labelled for one topic.

    The text combines the topic's top keywords with a few representative
    posts, both taken from the module-level ``topic_model``.

    Parameters
    ----------
    topic_id : int
        Topic identifier passed to ``topic_model``.
    n_words : int, default 12
        Maximum number of keywords to include.
    n_examples : int, default 3
        Maximum number of representative posts to include.

    Returns
    -------
    str
        ``"Topic keywords: ...\\n\\nExamples:\\n..."``, or ``""`` when the
        topic is unknown or has neither usable keywords nor examples.
    """
    words = topic_model.get_topic(topic_id)
    if not words:
        return ""

    # Skip blank keywords: a topic with fewer real terms than requested can
    # contain empty-string placeholders, which would otherwise show up as
    # "a, b, , " in the text handed to the classifier.
    top_words = [w for w, _ in words if isinstance(w, str) and w.strip()][:n_words]

    # Representative posts for the topic. The result is used as a sequence of
    # documents (sliced and joined); a falsy result means "no examples".
    rep = topic_model.get_representative_docs(topic_id)
    examples = [str(doc) for doc in rep[:n_examples]] if rep else []

    # Nothing informative to label: let the caller fall back to its default.
    if not top_words and not examples:
        return ""

    text = "Topic keywords: " + ", ".join(top_words) + "\n\nExamples:\n" + "\n---\n".join(examples)
    return text

# Assign one label to every topic in the summary table.
# Topic -1 (the outlier topic) is not classified; it gets a fixed label.
topic_id_to_label = {}
for tid in topic_info["Topic"].tolist():
    if tid == -1:
        label = "outlier/misc"
    else:
        topic_text = build_topic_text(tid)
        if topic_text.strip():
            # Single-label zero-shot classification; keep the top-ranked label.
            pred = zsc(topic_text, CANDIDATE_LABELS, multi_label=False)
            label = pred["labels"][0]
        else:
            # No text could be built for this topic.
            label = "misc"
    topic_id_to_label[tid] = label

# Propagate the labels to the per-post frame and to the topic summary table.
df_small["topic_label"] = df_small["topic_id"].map(topic_id_to_label)
topic_info["topic_label"] = topic_info["Topic"].map(topic_id_to_label)

In [None]:
# Save results
# -------------------------
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
OUTPUT_DIR = Path("data")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

posts_path = OUTPUT_DIR / "posts_with_topics.csv"
topic_info_path = OUTPUT_DIR / "topic_info.csv"

# Per-post table (with topic id / probability / label) and per-topic summary.
df_small.to_csv(posts_path, index=False)
topic_info.to_csv(topic_info_path, index=False)

# Report the actual locations that were written.
print("Saved:")
print(f" {posts_path}")
print(f" {topic_info_path}")
print("\nTop topics:")
print(topic_info.head(10))