In [None]:
!pip install praw

In [None]:
import praw
import pandas as pd
from time import sleep

# Reddit API credentials are read from the environment so that no secret is
# stored in the notebook itself. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# before running this cell; a missing variable raises KeyError immediately.
# NOTE(review): the id/secret that used to be hardcoded here should be treated
# as leaked and regenerated in the Reddit app settings.
import os

reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "networkscience"),
)

# 🔍 Keywords to search for
keywords = ["climate", "melting", "ice", "pollution", "co2", "oil", "gasoline", "electric", "energy"]

# 🌐 Subreddits to search in
subreddits = ["climate", "climatechange", "environment", "science", "sustainability", "energy", "renewableenergy"]

def collect_comments(keywords, subreddits, posts_per_combo=20, comments_per_post=20, max_comments=15000):
    """Search every subreddit for every keyword and gather top-level comments.

    Uses the module-level ``reddit`` client. For each (keyword, subreddit)
    pair the newest ``posts_per_combo`` search hits are visited and the first
    ``comments_per_post`` entries of each submission's comment list are
    recorded ("load more" stubs are stripped first via ``replace_more``).

    Args:
        keywords: search terms, tried in order.
        subreddits: subreddit names (without the ``r/`` prefix).
        posts_per_combo: search result limit per (keyword, subreddit) pair.
        comments_per_post: number of leading comments kept per submission.
        max_comments: collection stops as soon as this many rows exist.

    Returns:
        pandas.DataFrame with columns keyword, subreddit, submission_title,
        comment_body, author (stringified) and created_utc.

    Any exception raised while processing a subreddit is printed and that
    subreddit is skipped for the current keyword; rows gathered so far are kept.
    """
    rows = []

    for keyword in keywords:
        print(f"\n🔎 Searching for keyword: '{keyword}'")
        for subreddit_name in subreddits:
            print(f"  → in subreddit: r/{subreddit_name}")
            try:
                community = reddit.subreddit(subreddit_name)
                hits = community.search(keyword, sort='new', limit=posts_per_combo)
                for post in hits:
                    # Drop "load more comments" placeholders without fetching them.
                    post.comments.replace_more(limit=0)
                    for top_comment in post.comments[:comments_per_post]:
                        rows.append(dict(
                            keyword=keyword,
                            subreddit=subreddit_name,
                            submission_title=post.title,
                            comment_body=top_comment.body,
                            author=str(top_comment.author),
                            created_utc=top_comment.created_utc,
                        ))

                        if len(rows) < max_comments:
                            continue

                        # 🛑 Target reached: return immediately with what we have.
                        print(f"\n✅ Reached {max_comments} comments. Stopping collection.")
                        return pd.DataFrame(rows)

                # Short pause between subreddits.
                sleep(1)
            except Exception as err:
                print(f"⚠️ Error in r/{subreddit_name}: {err}")
                sleep(2)

    return pd.DataFrame(rows)

# Run the scrape with the default budget, then persist the raw rows to CSV
# (without the DataFrame index) so later cells can reload them from disk.
df = collect_comments(
    keywords,
    subreddits,
    posts_per_combo=20,
    comments_per_post=20,
    max_comments=15000,
)

df.to_csv("reddit_climate_comments_full.csv", index=False)
print("\n✅ Dataset saved to: reddit_climate_comments_full.csv")
print(f"📊 Total collected comments: {len(df)}")


In [None]:
import pandas as pd

# Reload the raw scrape from disk so this cell does not depend on the
# collection cell having been run in the same session.
df = pd.read_csv("reddit_climate_comments_full.csv")

# Structural overview: dimensions, column names, then the first rows.
print("🔢 Shape of dataset:", df.shape)
print("📋 Columns:", list(df.columns))
print("\n🔍 Sample rows:")
print(df.head())


In [None]:
import nltk
# Fetch the tokenizer data that nltk.word_tokenize relies on.
nltk.download('punkt')
nltk.download('punkt_tab')  # presumably required by newer NLTK releases in place of 'punkt' — TODO confirm


In [None]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Load raw dataset
df = pd.read_csv("reddit_climate_comments_full.csv")

# 1. Remove duplicate comments (the same text collected more than once)
df = df.drop_duplicates(subset="comment_body")

# 2. Drop empty or very short comments (10 characters or fewer after trimming)
df = df[df["comment_body"].astype(str).str.strip().str.len() > 10]

# 3. Filter irrelevant content (deleted/removed placeholders, AutoModerator).
# Only comments whose whole body is a bracketed placeholder such as
# "[removed]", "[deleted]" or "[ Removed by Reddit ]" are dropped. A plain
# substring match on "removed|deleted" would also discard on-topic comments
# like "CO2 removed from the atmosphere".
placeholder_pattern = r"^\s*\[\s*(?:removed|deleted)[^\]]*\]\s*$"
is_placeholder = df["comment_body"].astype(str).str.contains(
    placeholder_pattern, case=False, regex=True
)
df = df[~is_placeholder]
# astype(str) keeps this working when the author column holds missing values.
df = df[df["author"].astype(str).str.lower() != "automoderator"]

# 4. Clean text: lowercase, remove links, extra spaces
def clean_text(text):
    """Normalise a comment into lowercase letters separated by single spaces.

    The input is stringified and lowercased, then three regex substitutions
    run in order: URLs (``http...`` / ``www...``) are removed, every character
    that is not a-z or whitespace is removed, and whitespace runs collapse to
    one space. Leading/trailing whitespace is stripped from the result.
    """
    cleaned = str(text).lower()
    substitutions = (
        (r"http\S+|www\S+", ""),   # URLs
        (r"[^a-z\s]", ""),         # punctuation, digits, non-ASCII letters
        (r"\s+", " "),             # whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalised version of every comment body.
df["clean_text"] = df["comment_body"].map(clean_text)

# 5. Tokenize the clean text (for future NLP usage)
df["tokens"] = df["clean_text"].map(word_tokenize)

# 6./7. Keep rows with at least 3 tokens, then renumber the index from 0.
has_enough_tokens = df["tokens"].str.len() >= 3
df = df[has_enough_tokens].reset_index(drop=True)

# ✅ Save cleaned dataset
df.to_csv("reddit_climate_cleaned.csv", index=False)
print("✅ Cleaned dataset saved as 'reddit_climate_cleaned.csv'")
print("✅ Final shape:", df.shape)
print(df[["comment_body", "clean_text", "tokens"]].head())


## Community detection

In [None]:
!pip install sentence-transformers networkx community matplotlib seaborn -q

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import community as community_louvain
from wordcloud import WordCloud
from tqdm import tqdm

# Load cleaned dataset
df = pd.read_csv("reddit_climate_cleaned.csv")
texts = df["clean_text"].tolist()

# 🔹 Step 1: Get embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

# 🔹 Step 2: Compute cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# 🔹 Step 3: Build similarity graph
# Node ids are row positions in `texts`; an edge links two comments whose
# cosine similarity exceeds `threshold`. Comments with no such neighbour are
# never added to the graph.
G = nx.Graph()
threshold = 0.6  # adjust for density

for i in tqdm(range(len(texts))):
    # Only the part of the row right of the diagonal is scanned, so each pair
    # is visited once; the comparison is vectorised instead of a Python loop
    # over every j.
    row_tail = similarity_matrix[i, i + 1:]
    for offset in np.flatnonzero(row_tail > threshold).tolist():
        G.add_edge(i, i + 1 + offset, weight=float(row_tail[offset]))

print(f"✅ Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# 🔹 Step 4: Louvain community detection
# A fixed random_state makes the community ids reproducible across runs.
partition = community_louvain.best_partition(G, random_state=42)
# Rows that are not nodes of G get NaN here.
df["louvain_community"] = df.index.map(partition)

# 🔹 Save updated dataset
df.to_csv("reddit_with_louvain.csv", index=False)
print("✅ Saved with community labels")

# 🔸 Step 5: Plot community size
# Bars are ordered by descending community size.
plt.figure(figsize=(10,5))
sns.countplot(
    data=df,
    x="louvain_community",
    order=df["louvain_community"].value_counts().index,
)
plt.title("Community Size Distribution")
plt.xlabel("Community ID")
plt.ylabel("Number of Comments")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 🔸 Step 6: Network Graph Visualization (sample 300 nodes)
# The sample is simply the first 300 nodes in the graph's node order.
sample_nodes = list(G.nodes)[:300]
subG = G.subgraph(sample_nodes)
pos = nx.spring_layout(subG, seed=42)
colors = [partition[node_id] for node_id in subG.nodes]
plt.figure(figsize=(12, 8))
nx.draw_networkx_nodes(subG, pos, node_size=40, node_color=colors, cmap='tab20')
nx.draw_networkx_edges(subG, pos, alpha=0.2)
plt.title("Reddit Comments Graph with Louvain Communities")
plt.axis('off')
plt.show()

# 🔸 Step 7: Word Cloud for each top community
# "Top" means the five communities with the most comments.
top_communities = df["louvain_community"].value_counts().index[:5]

for com in top_communities:
    text = " ".join(df.loc[df["louvain_community"] == com, "clean_text"])
    wc = WordCloud(width=800, height=300, background_color="white").generate(text)
    plt.figure(figsize=(10, 3))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for Community {com}")
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import community as community_louvain
from wordcloud import WordCloud
import numpy as np

# Load preprocessed and labeled data
df = pd.read_csv("reddit_with_louvain.csv")

# Rows without a community label are given the id -1 so the column can be int.
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# === 1. COMMUNITY SIZE BAR PLOT ===
order = df["louvain_community"].value_counts().index
plt.figure(figsize=(10, 5))
sns.countplot(x="louvain_community", data=df, order=order, palette="tab20")
plt.title("🧩 Number of Comments per Community", fontsize=14)
plt.xlabel("Community ID", fontsize=12)
plt.ylabel("Number of Comments", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# === 2. TEXTUAL PREVIEW ===
# One example (the first row) from each of the five largest communities.
top_communities = df["louvain_community"].value_counts().index[:5]
print("📘 Sample comments from top communities:")
for com in top_communities:
    sample = df.loc[df["louvain_community"] == com, "clean_text"].iloc[0]
    print(f"\n🟦 Community {com} example:")
    print("  ", sample)

# === 3. WORD CLOUDS FOR EACH COMMUNITY ===
for com in top_communities:
    text = " ".join(df.loc[df["louvain_community"] == com, "clean_text"])
    wc = WordCloud(
        width=1000,
        height=300,
        background_color="white",
        max_words=100,
    ).generate(text)
    plt.figure(figsize=(12, 3))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"☁️ Word Cloud – Community {com}", fontsize=14)
    plt.tight_layout()
    plt.show()

# === 4. NETWORK GRAPH (Cleaner) ===
import networkx as nx
import matplotlib.cm as cm

# The similarity graph is not stored on disk, so it is rebuilt here from the
# comment embeddings.
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

texts = df["clean_text"].tolist()
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

sim_matrix = cosine_similarity(embeddings)
threshold = 0.6
G = nx.Graph()
for i in range(len(texts)):
    # Scan only the upper triangle of the row; vectorised comparison.
    row_tail = sim_matrix[i, i + 1:]
    for offset in np.flatnonzero(row_tail > threshold).tolist():
        G.add_edge(i, i + 1 + offset, weight=float(row_tail[offset]))

# Colour nodes with the community ids already stored in the CSV (keyed by row
# position, -1 = no label). Re-running Louvain here would produce different
# ids from the ones shown in the bar plot and word clouds above.
partition = df["louvain_community"].to_dict()

# Draw a subgraph (the first 200 nodes in the graph's node order)
sub_nodes = list(G.nodes)[:200]
subG = G.subgraph(sub_nodes)
pos = nx.spring_layout(subG, seed=42)

plt.figure(figsize=(12, 8))
colors = [partition[n] for n in subG.nodes]
# The colormap is passed by name: matplotlib.cm.get_cmap is no longer
# available in recent matplotlib releases.
nx.draw_networkx_nodes(subG, pos, node_size=40, node_color=colors, cmap='tab20')
nx.draw_networkx_edges(subG, pos, alpha=0.2)
plt.title("🌐 Reddit Comment Similarity Graph (Louvain Coloring)", fontsize=14)
plt.axis("off")
plt.tight_layout()
plt.show()

# === 5. DEGREE DISTRIBUTION ===
# One degree value per node of G, in the graph's node order.
degrees = [degree for _, degree in G.degree()]
plt.figure(figsize=(8, 4))
sns.histplot(degrees, bins=50, kde=False, color="purple")
plt.title("📊 Degree Distribution of Reddit Comment Graph", fontsize=14)
plt.xlabel("Degree (number of edges)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
!pip install scikit-learn matplotlib seaborn networkx community -q

from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import community as community_louvain
from sentence_transformers import SentenceTransformer

# pandas is used below but was not imported in this cell; importing it here
# lets the cell run on a fresh kernel.
import pandas as pd

# Load your data
df = pd.read_csv("reddit_climate_cleaned.csv")
texts = df["clean_text"].tolist()

# Get embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

# Build similarity graph: node ids are row positions, an edge links two
# comments whose cosine similarity exceeds the threshold.
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(embeddings)
threshold = 0.6
G = nx.Graph()
for i in range(len(texts)):
    # Scan only the upper triangle of the row; vectorised comparison.
    row_tail = sim_matrix[i, i + 1:]
    for offset in np.flatnonzero(row_tail > threshold).tolist():
        G.add_edge(i, i + 1 + offset, weight=float(row_tail[offset]))

# Louvain clustering (seeded so community ids are reproducible).
partition = community_louvain.best_partition(G, random_state=42)
# Rows that are not nodes of G receive the id -1.
df["louvain_community"] = df.index.map(partition).fillna(-1).astype(int)

# Use t-SNE for layout: project the embeddings to 2-D with a fixed seed.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_results = tsne.fit_transform(embeddings)

df["tsne-1"], df["tsne-2"] = tsne_results[:, 0], tsne_results[:, 1]

# Plot with Seaborn, one colour per community id.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x="tsne-1",
    y="tsne-2",
    hue="louvain_community",
    palette="tab20",
    legend="full",
    s=25,
)
plt.title("🌐 t-SNE Visualization of Reddit Comment Communities", fontsize=16)
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
# Legend is anchored outside the axes, to the right.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import numpy as np

# Load cleaned + labeled data
df = pd.read_csv("reddit_with_louvain.csv")

# Missing community labels become -1 so the column can be cast to int.
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# Focus on the TOP_N communities with the most comments.
TOP_N = 10
top_communities = df["louvain_community"].value_counts().index[:TOP_N].tolist()
in_top = df["louvain_community"].isin(top_communities)
df_top = df[in_top].copy()

# Embeddings are recomputed for the selected rows only.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df_top["clean_text"].tolist(), show_progress_bar=True)

# Run t-SNE (seeded) and store both coordinates on df_top.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_results = tsne.fit_transform(embeddings)
df_top["tsne-1"], df_top["tsne-2"] = tsne_results[:, 0], tsne_results[:, 1]

# Centroid of each community = mean of its t-SNE coordinates.
centroids = (
    df_top
    .groupby("louvain_community")[["tsne-1", "tsne-2"]]
    .mean()
    .reset_index()
)

# Plot: scatter of the top communities, coloured by community id.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df_top,
    x="tsne-1",
    y="tsne-2",
    hue="louvain_community",
    palette="tab10",
    s=30,
    legend="full",
)

# A boxed "Community <id>" label is drawn at every centroid.
label_box = dict(facecolor='white', alpha=0.7, edgecolor='gray', boxstyle='round,pad=0.3')
for community_id, centroid_x, centroid_y in zip(
    centroids["louvain_community"], centroids["tsne-1"], centroids["tsne-2"]
):
    plt.text(
        centroid_x,
        centroid_y,
        f"Community {int(community_id)}",
        fontsize=12,
        weight='bold',
        color='black',
        bbox=label_box,
    )

plt.title("Louvain Communities (Top 10) — t-SNE Layout", fontsize=16)
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.legend(title="Community ID", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


## Descriptive summary of each community


In [None]:
import pandas as pd
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the labeled data; rows without a community label get the id -1.
df = pd.read_csv("reddit_with_louvain.csv")
df = df.assign(louvain_community=df["louvain_community"].fillna(-1).astype(int))

# English stopwords, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

def get_top_words(texts, top_n=10):
    """Return the ``top_n`` most frequent words as ``(word, count)`` pairs.

    All texts are joined with spaces, lowercased and split on whitespace.
    Words found in the module-level ``stop_words`` collection, and words of
    two characters or fewer, are not counted.
    """
    tally = Counter(
        token
        for token in " ".join(texts).lower().split()
        if len(token) > 2 and token not in stop_words
    )
    return tally.most_common(top_n)

# Generate summary per community: size, most frequent words, first comment.
summary = []

for community_id in sorted(df["louvain_community"].unique()):
    subset = df.loc[df["louvain_community"] == community_id]
    count = len(subset)
    top_words = get_top_words(subset["clean_text"].tolist())
    if count > 0:
        sample_comment = subset["clean_text"].iloc[0]
    else:
        sample_comment = ""

    summary.append(dict(
        community_id=community_id,
        comment_count=count,
        top_keywords=[word for word, _ in top_words],
        example_comment=sample_comment,
    ))

# Largest communities first.
summary_df = pd.DataFrame(summary).sort_values("comment_count", ascending=False)

# Display the whole table as plain text, without the index column.
print(summary_df.to_string(index=False))


 Summary for Top 10 Communities

In [None]:
import pandas as pd
from collections import Counter
import nltk
import textwrap
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the labeled data; rows without a community label get the id -1.
df = pd.read_csv("reddit_with_louvain.csv")
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# Keep only rows that belong to the ten communities with the most comments.
top_coms = df["louvain_community"].value_counts().index[:10].tolist()
df_top = df.loc[df["louvain_community"].isin(top_coms)].copy()

stop_words = set(stopwords.words("english"))

def get_top_keywords(texts, n=5):
    """Return the ``n`` most frequent words (words only, no counts).

    Texts are joined with spaces, lowercased and split on whitespace; words
    in the module-level ``stop_words`` collection or with two characters or
    fewer are ignored.
    """
    tally = Counter(
        token
        for token in " ".join(texts).lower().split()
        if len(token) > 2 and token not in stop_words
    )
    return [word for word, _count in tally.most_common(n)]

# Wrap text to fixed width for display
def clean_and_wrap(text, width=70):
    """Strip surrounding whitespace and wrap ``text`` to ``width`` columns.

    Returns the wrapped lines joined with newlines (empty string when the
    stripped text is empty).
    """
    trimmed = text.strip()
    return textwrap.fill(trimmed, width=width)

# Build summary: one row per top community, in order of decreasing size.
summary = []
for cid in top_coms:
    subset = df_top.loc[df_top["louvain_community"] == cid]
    count = len(subset)
    keywords = get_top_keywords(subset["clean_text"].tolist(), 5)
    # The first comment of the community serves as its example.
    sample = clean_and_wrap(subset["clean_text"].iloc[0])

    summary.append(
        {
            "Community ID": cid,
            "Comment Count": count,
            "Top Keywords": ", ".join(keywords),
            "Example Comment": sample,
        }
    )

summary_df = pd.DataFrame(summary)

# Optional: style it for notebook display
# 'pre-wrap' keeps the newlines inserted by clean_and_wrap visible.
from IPython.display import display
cell_css = {'white-space': 'pre-wrap', 'text-align': 'left'}
display(summary_df.style.set_properties(**cell_css))


 t-SNE Plot with Keyword Labels

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import numpy as np
import nltk
import textwrap

nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the labeled data; rows without a community label get the id -1.
df = pd.read_csv("reddit_with_louvain.csv")
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# Restrict to the ten communities with the most comments.
top_coms = df["louvain_community"].value_counts().index[:10].tolist()
df_top = df.loc[df["louvain_community"].isin(top_coms)].copy()

# Sentence embeddings for the selected comments.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df_top["clean_text"].tolist(), show_progress_bar=True)

# Seeded t-SNE projection to two dimensions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_result = tsne.fit_transform(embeddings)
df_top["tsne-1"], df_top["tsne-2"] = tsne_result[:, 0], tsne_result[:, 1]

# Centroid of each community in the projected space.
centroids = (
    df_top
    .groupby("louvain_community")[["tsne-1", "tsne-2"]]
    .mean()
    .reset_index()
)

# Stopword set consulted by get_top_keywords.
stop_words = set(stopwords.words("english"))

def get_top_keywords(texts, n=4):
    """Return the ``n`` most frequent words (words only, no counts).

    Texts are joined with spaces, lowercased and split on whitespace; words
    in the module-level ``stop_words`` collection or with two characters or
    fewer are ignored.
    """
    tally = Counter(
        token
        for token in " ".join(texts).lower().split()
        if len(token) > 2 and token not in stop_words
    )
    return [word for word, _count in tally.most_common(n)]

# Label of each top community = its four most frequent keywords, comma-joined.
keyword_labels = {}
for com in top_coms:
    texts = df_top.loc[df_top["louvain_community"] == com, "clean_text"].tolist()
    keywords = get_top_keywords(texts, 4)
    label = ", ".join(keywords)
    keyword_labels[com] = label

# Plot: points coloured by community, no legend (labels are drawn instead).
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x="tsne-1",
    y="tsne-2",
    hue="louvain_community",
    data=df_top,
    palette="tab10",
    s=30,
    legend=False,
)

# Annotate every centroid with the community's keyword label; fall back to
# "Community <id>" when no label was computed for that id.
label_box = dict(facecolor="white", edgecolor="gray", boxstyle="round,pad=0.3", alpha=0.75)
for _, row in centroids.iterrows():
    com = int(row["louvain_community"])
    label = keyword_labels.get(com, f"Community {com}")
    plt.text(
        row["tsne-1"],
        row["tsne-2"],
        label,
        fontsize=11,
        weight="bold",
        bbox=label_box,
    )

plt.title("Top Reddit Comment Communities (Louvain) — t-SNE with Keyword Labels", fontsize=15)
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.tight_layout()
plt.show()


Sentiment Analysis per Community

In [None]:
!pip install -q transformers
from transformers import pipeline

# SST-2 DistilBERT classifier; each comment is cut to its first 512 characters
# before being scored, and only the predicted label is kept.
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
df_top["sentiment"] = [
    sentiment_model(comment[:512])[0]['label']
    for comment in df_top["clean_text"]
]

# Group by community: share of each sentiment label per community
# (rows sum to 1; label/community pairs that never occur become 0).
sentiment_counts = df_top.groupby("louvain_community")["sentiment"].value_counts(normalize=True)
sentiment_summary = sentiment_counts.unstack().fillna(0)
print(sentiment_summary)


Evaluate how semantically tight or noisy each community is.

Use intra-cluster cosine distances between embeddings

Lower average distance → tighter community

In [None]:
from sklearn.metrics.pairwise import cosine_distances

# Average pairwise cosine distance inside each top community.
compactness = []
for com in top_coms:
    subset = df_top[df_top["louvain_community"] == com]
    embs = model.encode(subset["clean_text"].tolist())
    n_members = len(embs)
    if n_members < 2:
        # No pair exists, so the average distance is undefined.
        avg_dist = float("nan")
    else:
        # Divide by the number of ordered pairs i != j. Taking .mean() over the
        # full matrix would also count the n zero self-distances on the
        # diagonal and make small communities look tighter than they are.
        avg_dist = cosine_distances(embs).sum() / (n_members * (n_members - 1))
    compactness.append((com, avg_dist))

# Tightest community first; undefined (NaN) values sort last.
compactness = sorted(compactness, key=lambda x: (x[1] != x[1], x[1]))
print("📏 Community Compactness (lower is better):")
for cid, dist in compactness:
    print(f"Community {cid}: {dist:.4f}")


Bar Chart of Sentiment Distribution per Community

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Share of each sentiment label per community (rows sum to 1).
sentiment_summary = (
    df_top.groupby("louvain_community")["sentiment"]
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)

# Consistent column order. reindex (rather than plain column selection) keeps
# the cell working when one label never occurs: the missing column is added
# with zeros instead of raising KeyError.
sentiment_summary = sentiment_summary.reindex(columns=["POSITIVE", "NEGATIVE"], fill_value=0)

# Stacked bar chart, one bar per community.
sentiment_summary.plot(
    kind="bar",
    stacked=True,
    figsize=(10, 5),
    colormap="coolwarm"
)

plt.title("📊 Sentiment Distribution per Louvain Community", fontsize=15)
plt.ylabel("Proportion of Comments")
plt.xlabel("Louvain Community")
plt.xticks(rotation=45)
plt.legend(title="Sentiment")
plt.tight_layout()
plt.show()


Add Sentiment as Hover Info in Interactive t-SNE Plot

In [None]:
!pip install plotly -q
import plotly.express as px

# Hover text: comments longer than 100 characters are cut and suffixed with "...".
df_top["short_comment"] = [
    comment[:100] + "..." if len(comment) > 100 else comment
    for comment in df_top["clean_text"]
]

# Interactive scatter of the t-SNE coordinates; hovering shows the shortened
# comment and its sentiment label.
fig = px.scatter(
    data_frame=df_top,
    x="tsne-1",
    y="tsne-2",
    color="louvain_community",
    hover_data=["short_comment", "sentiment"],
    title="🌐 t-SNE of Reddit Comments — Hover Sentiment & Text",
    color_continuous_scale="Viridis",
)

fig.update_traces(marker=dict(size=5, opacity=0.7))
fig.update_layout(legend_title="Louvain Community", height=600)
fig.show()


Degree Distribution for Comments

In [None]:
# Install dependencies (if not done already)
!pip install -q sentence-transformers scikit-learn networkx matplotlib seaborn

import pandas as pd
import numpy as np
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Step 1: Load cleaned dataset (use comments or titles)
df = pd.read_csv("reddit_climate_cleaned.csv")
texts = df["clean_text"].dropna().astype(str).tolist()

# Step 2: Encode using SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

# Step 3: Compute cosine similarity matrix
print("Computing similarity matrix...")
similarity_matrix = cosine_similarity(embeddings)

# Step 4: Build graph from similarity threshold
# Node ids are positions in `texts`; each unordered pair is tested once and
# linked when its similarity is above the threshold.
threshold = 0.6
G = nx.Graph()

print("Building graph edges from similarity matrix...")
n_texts = len(texts)
G.add_weighted_edges_from(
    (i, j, similarity_matrix[i][j])
    for i in tqdm(range(n_texts))
    for j in range(i + 1, n_texts)
    if similarity_matrix[i][j] > threshold
)

print(f"✅ Graph built with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Step 5: Compute and plot degree distribution
# One degree value per node of G, in the graph's node order.
degrees = [degree for _, degree in G.degree()]

plt.figure(figsize=(10, 5))
sns.histplot(degrees, bins=40, kde=False, color='darkblue')
plt.title("📊 Degree Distribution of Reddit Comments Graph", fontsize=15)
plt.xlabel("Degree (Number of Similar Comments)")
plt.ylabel("Number of Comments (Nodes)")
plt.tight_layout()
plt.show()


Top hub nodes by degree (submission titles)

In [None]:
# Install needed packages (if not already)
!pip install -q sentence-transformers scikit-learn networkx matplotlib seaborn

import pandas as pd
import numpy as np
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# === CONFIG: name of the text column whose values become graph nodes ===
use_column = "submission_title"  # 🔄 Change to "clean_text" for comments
# NOTE(review): the cleaned CSV holds one row per comment, so a submission
# title presumably repeats once per collected comment of that post; identical
# titles are then fully similar to each other — confirm whether titles should
# be de-duplicated before building the graph.

# Step 1: Load dataset
df = pd.read_csv("reddit_climate_cleaned.csv")
texts = df[use_column].dropna().astype(str).tolist()

# Step 2: Embed using SentenceTransformer
print(f"🔍 Encoding {len(texts)} texts from column: '{use_column}'")
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

# Step 3: Build cosine similarity graph
print("🔗 Computing similarity matrix...")
similarity_matrix = cosine_similarity(embeddings)

threshold = 0.6
G = nx.Graph()

# Node ids are positions in `texts`; each unordered pair is tested once.
print("⚙️ Building graph...")
n_texts = len(texts)
G.add_weighted_edges_from(
    (i, j, similarity_matrix[i][j])
    for i in tqdm(range(n_texts))
    for j in range(i + 1, n_texts)
    if similarity_matrix[i][j] > threshold
)

print(f"✅ Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Step 4: Degree distribution
# The Series is indexed by node id (= position in `texts`). Building it from a
# plain list of degrees would index it by position in G's node order instead,
# and the hub lookups below (`texts[idx]`) would then show the wrong text.
degree_series = pd.Series(dict(G.degree()), dtype="int64")
degrees = degree_series.tolist()
top_hubs = degree_series.sort_values(ascending=False).head(10)

# Step 5: Plot degree distribution and highlight hubs
plt.figure(figsize=(10, 5))
sns.histplot(degree_series, bins=40, kde=False, color="steelblue", label="All nodes")

# One dashed line per hub at its degree, labelled with the hub's node id.
for idx, val in top_hubs.items():
    plt.axvline(val, color="red", linestyle="--", alpha=0.5)
    plt.text(val, 5, f"Top #{idx}", rotation=90, verticalalignment='bottom', fontsize=8, color='red')

plt.title(f"📊 Degree Distribution — {use_column.replace('_', ' ').title()}", fontsize=15)
plt.xlabel("Node Degree (number of connections)")
plt.ylabel("Number of Nodes")
plt.legend()
plt.tight_layout()
plt.show()

# Optional: Print top hub text (first 200 characters of each)
print("\n🔝 Top 5 Hub Nodes by Degree:")
for idx in top_hubs.index[:5]:
    print(f"\nNode #{idx} (degree {top_hubs.loc[idx]}):\n{texts[idx][:200]}...")


In [None]:
# Install needed packages (if not already)
!pip install -q sentence-transformers scikit-learn networkx matplotlib seaborn

import pandas as pd
import numpy as np
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# === CONFIG: name of the text column whose values become graph nodes ===
use_column = "submission_title"  # 🔄 Change to "clean_text" for comments
# NOTE(review): the cleaned CSV holds one row per comment, so a submission
# title presumably repeats once per collected comment of that post; identical
# titles are then fully similar to each other — confirm whether titles should
# be de-duplicated before building the graph.

# Step 1: Load dataset
df = pd.read_csv("reddit_climate_cleaned.csv")
texts = df[use_column].dropna().astype(str).tolist()

# Step 2: Embed using SentenceTransformer
print(f"🔍 Encoding {len(texts)} texts from column: '{use_column}'")
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

# Step 3: Build cosine similarity graph
print("🔗 Computing similarity matrix...")
similarity_matrix = cosine_similarity(embeddings)

threshold = 0.6
G = nx.Graph()

# Node ids are positions in `texts`; each unordered pair is tested once.
print("⚙️ Building graph...")
n_texts = len(texts)
G.add_weighted_edges_from(
    (i, j, similarity_matrix[i][j])
    for i in tqdm(range(n_texts))
    for j in range(i + 1, n_texts)
    if similarity_matrix[i][j] > threshold
)

print(f"✅ Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Step 4: Degree distribution
# The Series is indexed by node id (= position in `texts`). Building it from a
# plain list of degrees would index it by position in G's node order instead,
# and the hub lookups below (`texts[idx]`) would then show the wrong text.
degree_series = pd.Series(dict(G.degree()), dtype="int64")
degrees = degree_series.tolist()
top_hubs = degree_series.sort_values(ascending=False).head(10)

# Step 5: Plot degree distribution and highlight hubs
plt.figure(figsize=(10, 5))
sns.histplot(degree_series, bins=40, kde=False, color="steelblue", label="All nodes")

# One dashed line per hub at its degree, labelled with the hub's node id.
for idx, val in top_hubs.items():
    plt.axvline(val, color="red", linestyle="--", alpha=0.5)
    plt.text(val, 5, f"Top #{idx}", rotation=90, verticalalignment='bottom', fontsize=8, color='red')

plt.title(f"📊 Degree Distribution — {use_column.replace('_', ' ').title()}", fontsize=15)
plt.xlabel("Node Degree (number of connections)")
plt.ylabel("Number of Nodes")
plt.legend()
plt.tight_layout()
plt.show()

# Optional: Print top hub text (first 200 characters of each)
print("\n🔝 Top 5 Hub Nodes by Degree:")
for idx in top_hubs.index[:5]:
    print(f"\nNode #{idx} (degree {top_hubs.loc[idx]}):\n{texts[idx][:200]}...")


Degree Distribution per Document

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

# Use submission titles or comment text.
# This cell reuses `df` and `model` created by an earlier cell.
texts = df["submission_title"].dropna().astype(str).tolist()  # ← or "clean_text" for comments

# Embedding and pairwise cosine similarity
embeddings = model.encode(texts, show_progress_bar=True)
sim_matrix = cosine_similarity(embeddings)

# Build graph: node ids are positions in `texts`; each unordered pair is
# linked when its similarity is above the threshold.
G_doc = nx.Graph()
threshold = 0.6

n_texts = len(texts)
G_doc.add_weighted_edges_from(
    (i, j, sim_matrix[i, j])
    for i in range(n_texts)
    for j in range(i + 1, n_texts)
    if sim_matrix[i, j] > threshold
)

# Degree distribution (documents)
doc_degrees = [degree for _, degree in G_doc.degree()]

plt.figure(figsize=(10, 5))
sns.histplot(doc_degrees, bins=40, kde=False, color='darkgreen')
plt.title("📄 Degree Distribution per Document (Titles)", fontsize=14)
plt.xlabel("Number of Similar Documents")
plt.ylabel("Number of Nodes")
plt.tight_layout()
plt.show()


Log-Log Degree Distribution

In [None]:
# Install dependencies
!pip install -q sentence-transformers scikit-learn networkx nltk matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import itertools
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# === STEP 1: Load and prepare data ===
df = pd.read_csv("reddit_climate_cleaned.csv")
texts_doc = df["clean_text"].dropna().astype(str).tolist()

# === STEP 2: Document Similarity Graph ===
print("🔍 Encoding documents...")
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings_doc = model.encode(texts_doc, show_progress_bar=True)

print("🔗 Computing cosine similarity matrix...")
sim_matrix = cosine_similarity(embeddings_doc)
threshold = 0.4  # Lowered threshold for richer connectivity

# Node ids are positions in `texts_doc`; each unordered pair is tested once.
G_doc = nx.Graph()
n_docs = len(texts_doc)
G_doc.add_weighted_edges_from(
    (i, j, sim_matrix[i][j])
    for i in range(n_docs)
    for j in range(i + 1, n_docs)
    if sim_matrix[i][j] > threshold
)

doc_degrees = [degree for _, degree in G_doc.degree()]
print(f"📄 Document graph: {G_doc.number_of_nodes()} nodes, {G_doc.number_of_edges()} edges")

# === STEP 3: Word Co-occurrence Graph ===
print("🔠 Building word co-occurrence graph...")

def preprocess(text, stop_words=None):
    """Lowercase ``text``, drop non-letters and return its content words.

    Args:
        text: input string.
        stop_words: optional collection of words to exclude. When omitted,
            NLTK's English stopword list is loaded once and cached on the
            function, instead of being re-read for every single word.

    Returns:
        List of words longer than two characters that are not stopwords, in
        their original order.
    """
    if stop_words is None:
        stop_words = getattr(preprocess, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words("english"))
            preprocess._english_stop_words = stop_words
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    return [word for word in letters_only.split() if word not in stop_words and len(word) > 2]

# Tokenise every comment and keep only those with more than 5 tokens.
tokenized_texts = df["clean_text"].dropna().astype(str).map(preprocess)
tokenized_texts = tokenized_texts[tokenized_texts.str.len() > 5]

# Two distinct words are linked when they occur in the same comment; the edge
# weight counts in how many comments the pair co-occurs.
G_word = nx.Graph()
for tokens in tokenized_texts:
    for first, second in itertools.combinations(set(tokens), 2):
        seen_so_far = G_word.get_edge_data(first, second, default={"weight": 0})["weight"]
        G_word.add_edge(first, second, weight=seen_so_far + 1)

word_degrees = [degree for _, degree in G_word.degree()]
print(f"🔠 Word graph: {G_word.number_of_nodes()} nodes, {G_word.number_of_edges()} edges")

# === STEP 4: Log-log degree distribution plotting ===
def plot_loglog_cleaned(degrees, title):
    """Scatter-plot the empirical degree distribution p(k) on log-log axes.

    Args:
        degrees: iterable of node degrees.
        title: figure title.

    Each distinct degree k is plotted against the fraction of entries in
    ``degrees`` equal to k.
    """
    degree_counts = Counter(degrees)
    total = sum(degree_counts.values())  # computed once, not per element
    # Take k and its frequency from the same sorted sequence so that ks[i] and
    # pk[i] always describe the same degree. Pairing sorted keys with
    # Counter.values() would mismatch them, because values() follows insertion
    # order.
    sorted_counts = sorted(degree_counts.items())
    ks = np.array([k for k, _ in sorted_counts])
    pk = np.array([count / total for _, count in sorted_counts])

    plt.figure(figsize=(6, 4))
    plt.scatter(ks, pk, s=40, alpha=0.7)
    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("k")
    plt.ylabel("p(k)")
    plt.title(title)
    plt.grid(True, which="both", linestyle="--", alpha=0.3)
    plt.tight_layout()
    plt.show()

# === STEP 5: Final plots ===
# One log-log figure per graph: documents first, then words.
for degree_values, plot_title in (
    (doc_degrees, "degree distribution for documents (log-log)"),
    (word_degrees, "degree distribution for words (log-log)"),
):
    plot_loglog_cleaned(degree_values, plot_title)


Log-log degree distribution for submission titles

In [None]:
# Install dependencies if needed
!pip install -q sentence-transformers scikit-learn networkx nltk matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import itertools
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# === STEP 1: Load and prepare data ===
df = pd.read_csv("reddit_climate_cleaned.csv")
texts_doc = df["submission_title"].dropna().astype(str).tolist()

# === STEP 2: Document Similarity Graph (TITLES) ===
print("🔍 Encoding submission titles...")
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings_doc = model.encode(texts_doc, show_progress_bar=True)

print("🔗 Computing cosine similarity matrix...")
sim_matrix = cosine_similarity(embeddings_doc)
threshold = 0.4  # Lower threshold to form more edges

# Node ids are positions in `texts_doc`; each unordered pair is tested once.
G_doc = nx.Graph()
n_docs = len(texts_doc)
G_doc.add_weighted_edges_from(
    (i, j, sim_matrix[i][j])
    for i in range(n_docs)
    for j in range(i + 1, n_docs)
    if sim_matrix[i][j] > threshold
)

doc_degrees = [degree for _, degree in G_doc.degree()]
print(f"📄 Title graph: {G_doc.number_of_nodes()} nodes, {G_doc.number_of_edges()} edges")

# === STEP 3: Word Co-occurrence Graph from Titles ===
print("🔠 Building word co-occurrence graph (titles)...")

def preprocess(text, stop_words=None):
    """Lowercase ``text``, drop non-letters and return its content words.

    Args:
        text: input string.
        stop_words: optional collection of words to exclude. When omitted,
            NLTK's English stopword list is loaded once and cached on the
            function, instead of being re-read for every single word.

    Returns:
        List of words longer than two characters that are not stopwords, in
        their original order.
    """
    if stop_words is None:
        stop_words = getattr(preprocess, "_english_stop_words", None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words("english"))
            preprocess._english_stop_words = stop_words
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    return [word for word in letters_only.split() if word not in stop_words and len(word) > 2]

# Tokenise every non-missing title and keep only titles with more than three usable tokens.
tokenized_titles = df["submission_title"].dropna().astype(str).apply(preprocess)
has_enough_tokens = tokenized_titles.apply(len) > 3
tokenized_titles = tokenized_titles[has_enough_tokens]

# Two words are linked when they appear in the same title; the edge weight
# counts how many titles contain both (each title contributes at most once
# per pair because its tokens are de-duplicated with set()).
G_word = nx.Graph()
for tokens in tokenized_titles:
    for w1, w2 in itertools.combinations(set(tokens), 2):
        if not G_word.has_edge(w1, w2):
            G_word.add_edge(w1, w2, weight=1)
            continue
        G_word[w1][w2]["weight"] += 1

word_degrees = [degree for _, degree in G_word.degree()]
n_word_nodes, n_word_edges = G_word.number_of_nodes(), G_word.number_of_edges()
print(f"🔠 Word graph (from titles): {n_word_nodes} nodes, {n_word_edges} edges")

# === STEP 4: Plotting Function (Cleaned log-log)
def plot_loglog_cleaned(degrees, title):
    """Scatter the empirical degree distribution p(k) on log-log axes.

    Args:
        degrees: Iterable of node degrees (one entry per node).
        title: Figure title.

    p(k) is the fraction of entries in `degrees` equal to k.
    """
    degree_counts = Counter(degrees)
    total = sum(degree_counts.values())

    # Look each probability up by its own k. Iterating over
    # degree_counts.values() would follow insertion order, which does not
    # match the sorted order of `ks`, so points would pair the wrong p(k)
    # with each k.
    sorted_ks = sorted(degree_counts)
    ks = np.array(sorted_ks)
    pk = np.array([degree_counts[k] / total for k in sorted_ks])

    plt.figure(figsize=(6, 4))
    plt.scatter(ks, pk, s=40, alpha=0.7)
    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("k")
    plt.ylabel("p(k)")
    plt.title(title)
    plt.grid(True, which="both", linestyle="--", alpha=0.3)
    plt.tight_layout()
    plt.show()

# === STEP 5: Final Plots ===
# One log-log degree plot per graph; each entry is (degree sequence, figure title).
for degree_sequence, figure_title in [
    (doc_degrees, "degree distribution for titles (log-log)"),
    (word_degrees, "degree distribution for title-words (log-log)"),
]:
    plot_loglog_cleaned(degree_sequence, figure_title)


Adding power-law fits

In [None]:
pip install powerlaw

In [None]:
import powerlaw
import matplotlib.pyplot as plt

def fit_powerlaw_and_plot(degrees, title):
    """Fit a discrete power law to a degree sequence and plot its CCDF with the fit.

    Args:
        degrees: Iterable of node degrees; entries that are not > 0 are ignored.
        title: Label used in the printed header and in the figure title.

    Prints the fitted exponent, the fitted xmin and the likelihood-ratio
    comparison against an exponential, then shows a log-log CCDF figure.
    """
    print(f"\n📊 Fitting power-law for: {title}")

    # Only strictly positive degrees are handed to the fit.
    positive_degrees = list(filter(lambda d: d > 0, degrees))
    fit = powerlaw.Fit(positive_degrees, discrete=True, verbose=False)

    # Report the fit, then compare it against an exponential alternative.
    print(f"Estimated power-law exponent γ: {fit.alpha:.3f}")
    print(f"Minimum degree (xmin): {fit.xmin}")
    R, p = fit.distribution_compare('power_law', 'exponential')
    print(f"Likelihood ratio test vs exponential: R = {R:.3f}, p = {p:.3f}")

    # Empirical CCDF and fitted CCDF share the current axes.
    plt.figure(figsize=(6, 4))
    fit.plot_ccdf(color='blue', label='Empirical data')
    fit.power_law.plot_ccdf(color='red', linestyle='--', label='Power-law fit')
    ax = plt.gca()
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel("k")
    ax.set_ylabel("P(X ≥ k)")
    ax.set_title(f"{title} (CCDF + Power-law fit)")
    ax.legend()
    plt.tight_layout()
    plt.show()
# Fit and plot both degree sequences; each entry is (degree sequence, label).
for degree_sequence, fit_label in [
    (doc_degrees, "Title Graph Degree Distribution"),
    (word_degrees, "Title-Word Co-occurrence Degree Distribution"),
]:
    fit_powerlaw_and_plot(degree_sequence, fit_label)


In [None]:
import powerlaw
import matplotlib.pyplot as plt
from collections import Counter

def fit_powerlaw_and_plot_clean(degrees, title):
    """Fit a discrete power law, print a short report and plot the CCDF with the fit.

    Args:
        degrees: Iterable of node degrees; entries that are not > 0 are ignored.
        title: Heading for the printed report and first line of the figure title.
    """
    # Isolated nodes (degree 0) are left out of the fit.
    degrees = list(filter(lambda d: d > 0, degrees))
    fit = powerlaw.Fit(degrees, discrete=True, verbose=False)

    rule = "-" * 40
    print(f"\n📊 {title}")
    print(rule)
    print(f"  • Nodes analyzed: {len(degrees)}")
    print(f"  • Power-law exponent γ: {fit.alpha:.3f}")
    print(f"  • xmin (fit starts at degree ≥): {fit.xmin}")
    R, p = fit.distribution_compare('power_law', 'exponential')
    print(f"  • Power-law vs Exponential: R = {R:.2f}, p = {p:.4f}")
    print(rule)

    # Empirical CCDF and fitted power law are drawn on the same (current) axes.
    plt.figure(figsize=(6, 4))
    fit.plot_ccdf(label="Empirical CCDF", color="black", linewidth=2, marker='o', markersize=4, alpha=0.8)
    fit.power_law.plot_ccdf(color="red", linestyle="--", label="Fitted Power-law", linewidth=2)

    ax = plt.gca()
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("Degree k", fontsize=12)
    ax.set_ylabel("P(K ≥ k)", fontsize=12)
    ax.set_title(f"{title}\n(Log-Log CCDF with Power-law Fit)", fontsize=13)
    ax.grid(True, which="both", ls="--", lw=0.5, alpha=0.4)
    ax.legend()
    plt.tight_layout()
    plt.show()


In [None]:
# Report and plot both graphs; each entry is (degree sequence, heading).
for degree_sequence, heading in [
    (doc_degrees, "📄 Title Document Graph"),
    (word_degrees, "🔠 Title-Word Co-occurrence Graph"),
]:
    fit_powerlaw_and_plot_clean(degree_sequence, heading)


In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

def plot_clean_loglog(degrees, title="Degree Distribution (log-log)", dot_color="black"):
    """Scatter the empirical degree distribution p(k) on log-log axes.

    Args:
        degrees: Iterable of node degrees; entries that are not > 0 are dropped
            (they cannot be placed on a logarithmic axis).
        title: Figure title.
        dot_color: Matplotlib colour of the markers.
    """
    # Step 1: remove zeros
    degrees = [d for d in degrees if d > 0]

    # Step 2: get p(k). Each probability is looked up by its own k: iterating
    # over degree_counts.values() would follow insertion order, which does not
    # match the sorted `ks`, and would pair the wrong p(k) with each k.
    degree_counts = Counter(degrees)
    total = sum(degree_counts.values())
    sorted_ks = sorted(degree_counts)
    ks = np.array(sorted_ks)
    pk = np.array([degree_counts[k] / total for k in sorted_ks])

    # Step 3: plot log-log
    plt.figure(figsize=(6, 4))
    plt.scatter(ks, pk, s=35, color=dot_color, alpha=0.8)
    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("Degree k", fontsize=12)
    plt.ylabel("p(k)", fontsize=12)
    plt.title(title, fontsize=13)
    plt.grid(True, which="both", ls="--", lw=0.5, alpha=0.4)
    plt.tight_layout()
    plt.show()


In [None]:
# One plot per graph; each entry is (degree sequence, title, marker colour).
for degree_sequence, figure_title, marker_color in [
    (doc_degrees, "📄 Log-Log Degree Distribution for Titles", "darkgreen"),
    (word_degrees, "🔠 Log-Log Degree Distribution for Title Words", "steelblue"),
]:
    plot_clean_loglog(degree_sequence, figure_title, dot_color=marker_color)


In [None]:
import pandas as pd

# Reload the cleaned dataset and show its columns plus a few example rows.
df = pd.read_csv("reddit_climate_cleaned.csv")

column_names = df.columns.tolist()
print("📋 Columns in dataset:")
print(column_names)

# Text columns used by the analyses in this notebook.
preview_columns = ["submission_title", "comment_body", "clean_text"]
print("\n🔍 Sample rows:")
print(df[preview_columns].head())


Hard Louvain Cluster Size Plot

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv("reddit_with_louvain.csv")
# Rows without a community are folded into the placeholder ID -1.
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# If the file carries no separate soft assignment, fall back to the hard labels.
# In that case both panels below show exactly the same distribution.
if "soft_louvain" not in df.columns:
    df["soft_louvain"] = df["louvain_community"]

# Largest community ID shown. The same constant drives the data filter and the
# x-axis limits of both panels, so no point that passes the filter is cut off
# by a narrower axis.
MAX_COMMUNITY_ID = 70

# Normalize counts
hard_counts = df["louvain_community"].value_counts(normalize=True).sort_index()
soft_counts = df["soft_louvain"].value_counts(normalize=True).sort_index()

# Filter to cluster IDs up to MAX_COMMUNITY_ID (the -1 placeholder is kept)
hard_counts = hard_counts[hard_counts.index <= MAX_COMMUNITY_ID]
soft_counts = soft_counts[soft_counts.index <= MAX_COMMUNITY_ID]

# === Plot side-by-side standard chart ===
fig, axs = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

panels = [
    (axs[0], hard_counts, "Hard Louvain", 'royalblue'),
    (axs[1], soft_counts, "Soft Louvain", 'cornflowerblue'),
]
for ax, counts, panel_title, marker_color in panels:
    ax.scatter(counts.index, counts.values, color=marker_color, s=50)
    ax.set_title(panel_title, fontsize=13)
    ax.set_xlabel("Community ID")
    ax.set_xlim(-1.5, MAX_COMMUNITY_ID + 0.5)
    ax.grid(True, linestyle='--', alpha=0.3)

# The y-axis is shared, so only the left panel is labelled.
axs[0].set_ylabel("Proportion")

plt.suptitle("Louvain Cluster Proportions (Standard View)", fontsize=15)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from collections import Counter
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def clean_and_tokenize(text):
    """Lower-case `text`, drop every character that is not an ASCII letter or
    whitespace, and return the tokens that are longer than two characters and
    not in `stop_words`."""
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    kept = []
    for token in letters_only.split():
        if len(token) > 2 and token not in stop_words:
            kept.append(token)
    return kept

# Load dataset
df = pd.read_csv("reddit_with_louvain.csv")
# Rows without a community are folded into the placeholder ID -1.
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# Limit to top 10 communities
top_coms = df["louvain_community"].value_counts().head(10).index.tolist()
summary = []

for com in top_coms:
    subset = df[df["louvain_community"] == com]["clean_text"].dropna().astype(str)

    # Count tokens over all texts of this community.
    word_counts = Counter()
    for text in subset:
        word_counts.update(clean_and_tokenize(text))

    top_words = word_counts.most_common(10)
    # Deliberately not stored in a variable called `keywords`: that name holds
    # the list of search keywords defined at the top of the notebook, and
    # rebinding it to a string here would silently break a later re-run of the
    # collection step.
    summary.append({
        "Community ID": com,
        "Comment Count": len(subset),
        "Top Keywords": ", ".join(w for w, _ in top_words),
    })

# Convert to DataFrame for viewing (chained, so re-running the cell is idempotent)
keywords_df = (
    pd.DataFrame(summary)
    .sort_values("Community ID")
    .reset_index(drop=True)
)

# Show top 10 Louvain communities and their keywords
pd.set_option("display.max_colwidth", 200)
print(keywords_df)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# === Helper functions ===
stop_words = set(stopwords.words("english"))

def clean_and_tokenize(text):
    """Return the content words of `text`.

    The text is lower-cased and stripped of everything except ASCII letters
    and whitespace; tokens of at most two characters and tokens found in
    `stop_words` are discarded.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)
    return [tok for tok in letters_only.split() if not (tok in stop_words or len(tok) <= 2)]

# === Load dataset ===
df = pd.read_csv("reddit_with_louvain.csv")
# Rows without a community are folded into the placeholder ID -1.
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# === Select top 10 communities by size ===
top_communities = df["louvain_community"].value_counts().head(10).index.tolist()

# === Plot top 10 words for each ===
# squeeze=False always returns a 2-D array of axes; taking its only column
# gives a 1-D array even when there is a single community (with the default
# squeeze a single subplot is returned as a bare Axes and axs[i] would fail).
n_panels = len(top_communities)
fig, axs = plt.subplots(n_panels, 1, figsize=(8, 4 * n_panels), squeeze=False)
axs = axs[:, 0]

for i, com in enumerate(top_communities):
    subset = df[df["louvain_community"] == com]["clean_text"].dropna().astype(str)
    all_words = []
    for text in subset:
        all_words.extend(clean_and_tokenize(text))

    word_counts = Counter(all_words).most_common(10)

    # A community whose texts yield no tokens gets an empty, titled panel
    # instead of crashing on zip(*[]).
    if word_counts:
        words, counts = zip(*word_counts)
        axs[i].barh(words, counts, color='royalblue')
        axs[i].invert_yaxis()  # most frequent word on top

    axs[i].set_title(f"Louvain Community {com} - Top Words", fontsize=13)
    axs[i].set_xlabel("Frequency")
    axs[i].grid(True, linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
import matplotlib.cm as cm
nltk.download("stopwords")
from nltk.corpus import stopwords

# === Helper functions ===
stop_words = set(stopwords.words("english"))

def clean_and_tokenize(text):
    """Split `text` into lower-case tokens made of ASCII letters only and keep
    those that are longer than two characters and not in `stop_words`."""
    cleaned = re.sub(r"[^a-zA-Z\s]", "", text.lower())

    def is_content_word(token):
        return token not in stop_words and len(token) > 2

    return list(filter(is_content_word, cleaned.split()))

# === Load dataset ===
df = pd.read_csv("reddit_with_louvain.csv")
# Rows without a community are folded into the placeholder ID -1.
df["louvain_community"] = df["louvain_community"].fillna(-1).astype(int)

# === Select top N communities ===
TOP_COMMUNITIES = 8
TOP_WORDS = 15

top_communities = df["louvain_community"].value_counts().head(TOP_COMMUNITIES).index.tolist()
# pyplot.get_cmap is used instead of matplotlib.cm.get_cmap, which recent
# Matplotlib releases no longer provide. Calling the result with an integer
# index returns one colour of the categorical map.
colors = plt.get_cmap('tab10', TOP_COMMUNITIES)

# === Plot ===
# One panel per community actually found (there may be fewer than
# TOP_COMMUNITIES); squeeze=False keeps the axes indexable for a single panel.
n_panels = len(top_communities)
fig, axs = plt.subplots(n_panels, 1, figsize=(10, 4 * n_panels), squeeze=False)
axs = axs[:, 0]

for i, com in enumerate(top_communities):
    subset = df[df["louvain_community"] == com]["clean_text"].dropna().astype(str)
    all_words = []
    for text in subset:
        all_words.extend(clean_and_tokenize(text))

    word_counts = Counter(all_words).most_common(TOP_WORDS)

    # A community without any tokens gets an empty, titled panel instead of
    # crashing on zip(*[]).
    if word_counts:
        words, counts = zip(*word_counts)
        axs[i].barh(words, counts, color=colors(i))
        axs[i].invert_yaxis()  # most frequent word on top

    axs[i].set_title(f"🌐 Louvain Community {com} — Top {TOP_WORDS} Words", fontsize=13)
    axs[i].set_xlabel("Frequency")
    axs[i].grid(True, linestyle='--', alpha=0.3)

plt.suptitle("Top Words per Louvain Community", fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


Fit BERTopic on Comments

In [None]:
# Install necessary packages
!pip install -q bertopic sentence-transformers umap-learn hdbscan

import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Load dataset
df = pd.read_csv("reddit_climate_cleaned.csv")
# Drop rows without text from the frame itself, not only from `texts`.
# BERTopic returns one topic per input document, so `texts` and `df` must have
# the same rows; otherwise the column assignment below fails with a length
# mismatch as soon as a single clean_text value is missing.
df = df.dropna(subset=["clean_text"]).reset_index(drop=True)
texts = df["clean_text"].astype(str).tolist()

# Define models
# NOTE(review): UMAP is given no random_state, so topics can differ between
# runs — consider fixing a seed if results must be reproducible.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=10, max_df=0.8)

# Fit BERTopic
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(texts)

# Assign topics to DataFrame (row i of df corresponds to texts[i])
assert len(topics) == len(df), "topic assignments are not aligned with the DataFrame rows"
df["bertopic_topic"] = topics
df.to_csv("reddit_with_bertopic.csv", index=False)
print("✅ BERTopic finished and results saved to 'reddit_with_bertopic.csv'")


Visualizing Topic Size Distribution

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load topic-labeled data; rows without a topic are folded into ID -1.
df = pd.read_csv("reddit_with_bertopic.csv")
df["bertopic_topic"] = df["bertopic_topic"].fillna(-1).astype(int)

# Share of rows per topic, ordered by topic ID.
topic_counts = df["bertopic_topic"].value_counts(normalize=True).sort_index()

# Only topic IDs up to 20 are drawn.
topic_counts_plot = topic_counts.loc[topic_counts.index <= 20]

# Dot-style chart: one marker per topic ID.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(topic_counts_plot.index, topic_counts_plot.values, color="darkorange", s=40)
ax.set_title("BERTopic", fontsize=14)
ax.set_xlabel("Topic ID")
ax.set_ylabel("Proportion of Comments")
ax.grid(True, linestyle="--", alpha=0.4)
plt.tight_layout()
plt.show()


Log-Log CCDF of Topic Sizes

In [None]:
import numpy as np
from collections import Counter

# Get raw counts: one entry per topic, holding the number of rows in that topic
topic_degrees = df["bertopic_topic"].value_counts().sort_index()
degrees = np.array(topic_degrees.tolist())
degrees = degrees[degrees > 0]

# Compute p(k): the fraction of topics that have exactly size k.
# Each probability is looked up by its own k; iterating over the Counter's
# values would follow insertion order, which does not match the sorted `k`.
count_vals = Counter(degrees.tolist())
total = sum(count_vals.values())
sorted_sizes = sorted(count_vals)
k = np.array(sorted_sizes)
pk = np.array([count_vals[size] / total for size in sorted_sizes])

# CCDF P(K >= k): sum of p(k') over all k' >= k, computed as a cumulative sum
# from the largest size downwards. (1 - cumsum(pk) would give P(K > k), whose
# last value is 0 and cannot be drawn on a logarithmic axis.)
Pk = np.cumsum(pk[::-1])[::-1]

# Plot log-log CCDF
plt.figure(figsize=(6, 4))
plt.scatter(k, Pk, s=35, color="darkorange", alpha=0.8)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("k (Topic Size)")
plt.ylabel("P(K ≥ k)")
plt.title("BERTopic Topic Size CCDF (log-log)")
plt.grid(True, which="both", ls="--", alpha=0.4)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
import pandas as pd
from bertopic import BERTopic

# Load topic-labeled data and model
df = pd.read_csv("reddit_with_bertopic.csv")

# Re-initialize the BERTopic model if not in memory (optional: load saved model instead)
# topic_model = BERTopic.load("bertopic_model")  # If you've saved it
# Otherwise, reuse from earlier in session

# Count top N topics
TOP_TOPICS = 8
TOP_WORDS = 15

top_topic_ids = df["bertopic_topic"].value_counts().head(TOP_TOPICS).index.tolist()
# pyplot.get_cmap is used instead of matplotlib.cm.get_cmap, which recent
# Matplotlib releases no longer provide.
colors = plt.get_cmap('tab10', TOP_TOPICS)

# Plot each topic's top words: one panel per topic actually found (there may
# be fewer than TOP_TOPICS); squeeze=False keeps the axes indexable for a
# single panel.
n_panels = len(top_topic_ids)
fig, axs = plt.subplots(n_panels, 1, figsize=(10, 4 * n_panels), squeeze=False)
axs = axs[:, 0]

for i, topic_id in enumerate(top_topic_ids):
    # get_topic may return a falsy value for a topic the model does not know;
    # treat that as "no words" instead of slicing it.
    words_scores = (topic_model.get_topic(topic_id) or [])[:TOP_WORDS]

    if words_scores:
        words, scores = zip(*words_scores)
        axs[i].barh(words, scores, color=colors(i))
        axs[i].invert_yaxis()  # highest-scoring word on top

    axs[i].set_title(f"Topic {topic_id} — Top {TOP_WORDS} Words", fontsize=13)
    axs[i].set_xlabel("Score")
    axs[i].grid(True, linestyle='--', alpha=0.3)

plt.suptitle("Top Keywords per BERTopic Topic", fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


BERTopic 2D UMAP Scatterplot

In [None]:
# 📦 Install required libraries (if not already done)
!pip install -q bertopic sentence-transformers umap-learn hdbscan

# 📄 Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# 🔄 Load dataset (every row is kept; text values are cast to str)
df = pd.read_csv("reddit_with_bertopic.csv")
texts = df["clean_text"].astype(str).tolist()

# 💬 Sentence embeddings, one vector per entry of `texts`
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# 🧭 Project the embeddings to two dimensions and store them as plot coordinates
umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine')
reduced_embeddings = umap_model.fit_transform(embeddings)
df["x"], df["y"] = reduced_embeddings[:, 0], reduced_embeddings[:, 1]

# 🔍 Keep only rows that belong to one of the ten most frequent topics
topic_sizes = df["bertopic_topic"].value_counts()
top_topic_ids = topic_sizes.head(10).index.tolist()
df_filtered = df[df["bertopic_topic"].isin(top_topic_ids)]

# 🎨 One marker per row, coloured by topic
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_filtered,
    x="x", y="y",
    hue="bertopic_topic",
    palette="tab10",
    s=30,
    alpha=0.85,
    ax=ax,
)
ax.set_title("BERTopic UMAP Clusters (Top 10 Topics)", fontsize=14)
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
# Legend is placed outside the plotting area, to the right.
ax.legend(title="Topic", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Interactive (hoverable) document map with the top words per topic.
# Relies on `topic_model`, `texts` and `embeddings` created in earlier cells.
fig = topic_model.visualize_documents(docs=texts, embeddings=embeddings)
fig.show()


In [None]:
import numpy as np
import pandas as pd

# Load topic assignments
df = pd.read_csv("reddit_with_bertopic.csv")
texts = df["clean_text"].astype(str).tolist()
topics = topic_model.get_document_info(texts)["Topic"]

# Get top 10 most frequent topics
top_10_topic_ids = topics.value_counts().head(10).index.tolist()

# Subsets of the texts/embeddings that belong to those topics. They are kept
# for inspection only and are NOT passed to the plot (see below).
top_topic_set = set(top_10_topic_ids)
selected_indices = [i for i, t in enumerate(topics) if t in top_topic_set]
texts_subset = [texts[i] for i in selected_indices]
embeddings_subset = np.asarray(embeddings)[selected_indices]

# Interactive visualization restricted to the top topics.
# visualize_documents takes each document's topic from the fitted model by
# position, so it must receive the same documents (and matching embeddings)
# the model was fitted on; handing it a filtered list misaligns documents and
# topics. The selection is therefore expressed through the `topics` argument.
fig = topic_model.visualize_documents(
    texts,
    topics=top_10_topic_ids,
    embeddings=embeddings,
)
fig.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

TOP_N_TOPICS = 10
TOP_N_WORDS = 15

# Get top 10 topic IDs again
top_topic_ids = df["bertopic_topic"].value_counts().head(TOP_N_TOPICS).index.tolist()
# pyplot.get_cmap is used instead of matplotlib.cm.get_cmap, which recent
# Matplotlib releases no longer provide.
colors = plt.get_cmap('tab10', TOP_N_TOPICS)

# Plot bar chart per topic. squeeze=False always yields a 2-D array of axes;
# its only column is a 1-D array even when there is just one topic.
n_panels = len(top_topic_ids)
fig, axs = plt.subplots(n_panels, 1, figsize=(10, 4 * n_panels), squeeze=False)
axs = axs[:, 0]

for i, topic_id in enumerate(top_topic_ids):
    # get_topic may return a falsy value; such topics keep an empty panel.
    topic_words = (topic_model.get_topic(topic_id) or [])[:TOP_N_WORDS]
    if not topic_words:
        continue
    words, scores = zip(*topic_words)

    axs[i].barh(words, scores, color=colors(i))
    axs[i].invert_yaxis()  # highest-scoring word on top
    axs[i].set_title(f"Topic {topic_id} — Top {TOP_N_WORDS} Words", fontsize=13)
    axs[i].set_xlabel("Relevance Score")
    axs[i].grid(True, linestyle="--", alpha=0.3)

plt.suptitle("Top Keywords per BERTopic Topic", fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


Topic Proportions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Row counts of the twenty most frequent topics, largest first.
topic_counts = df["bertopic_topic"].value_counts().head(20).sort_values(ascending=False)
topic_labels = topic_counts.index.astype(str)  # IDs as strings -> categorical x-axis
comment_totals = topic_counts.values

# One bar per topic.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=topic_labels, y=comment_totals, palette="tab20", ax=ax)
ax.set_title("Top 20 BERTopic Topics by Comment Count", fontsize=15)
ax.set_xlabel("Topic ID")
ax.set_ylabel("Number of Comments")
plt.xticks(rotation=45)
ax.grid(True, linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
from umap import UMAP

# Project the sentence embeddings to two dimensions with UMAP.
umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine')
reduced_embeddings = umap_model.fit_transform(embeddings)

# Store the two UMAP coordinates as columns "x" and "y" of df
# (row i of `reduced_embeddings` goes to row i of df).
df["x"], df["y"] = reduced_embeddings[:, 0], reduced_embeddings[:, 1]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

top_n = 10
top_topics = df["bertopic_topic"].value_counts().head(top_n).index.tolist()
df_filtered = df[df["bertopic_topic"].isin(top_topics)]

# Mean UMAP position of each topic; used to anchor its text label.
centroids = df_filtered.groupby("bertopic_topic")[["x", "y"]].mean()

# Label per topic: the first word returned by the model for that topic, or
# "Topic <id>" when the model returns nothing for it.
top_words = {}
for topic in top_topics:
    topic_terms = topic_model.get_topic(topic)
    top_words[topic] = topic_terms[0][0] if topic_terms else f"Topic {topic}"

# Scatter of the documents, coloured by topic, without a legend.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_filtered,
    x="x", y="y",
    hue="bertopic_topic",
    palette="tab10",
    s=25,
    alpha=0.7,
    legend=False,
    ax=ax,
)

# Write each topic's label at its centroid.
label_box = dict(facecolor='white', alpha=0.8, edgecolor='gray')
for topic, centroid_x, centroid_y in centroids.itertuples():
    ax.text(
        centroid_x, centroid_y,
        top_words[topic],
        fontsize=10,
        weight='bold',
        bbox=label_box,
    )

ax.set_title("BERTopic Clusters with Top Keywords", fontsize=14)
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
ax.grid(True, linestyle="--", alpha=0.3)
plt.tight_layout()
plt.show()


Similarity Matrix for Comments and titles

In [None]:
TOP_N_TOPICS = 10
TOP_N_WORDS = 10

top_topic_ids = df["bertopic_topic"].value_counts().head(TOP_N_TOPICS).index.tolist()

# Print the leading words of each of the most frequent topics.
for topic_id in top_topic_ids:
    # Not stored in a variable called `keywords`: that name holds the list of
    # search keywords defined at the top of the notebook. get_topic may return
    # a falsy value, which is treated as "no words" rather than sliced.
    topic_terms = topic_model.get_topic(topic_id) or []
    word_list = [word for word, _ in topic_terms[:TOP_N_WORDS]]
    print(f"\n🔹 Topic {topic_id}:")
    print("   " + ", ".join(word_list))


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

# Draw ten rows (fixed seed) that have both a comment and a topic label.
labelled_comments = df[["comment_body", "bertopic_topic"]].dropna()
sample_df = labelled_comments.sample(10, random_state=1)
sample_texts = sample_df["comment_body"].astype(str).tolist()
sample_labels = ["Topic {}".format(tid) for tid in sample_df["bertopic_topic"]]

# Embed the sampled comments and compare every pair by cosine similarity.
sample_embeddings = embedding_model.encode(sample_texts, show_progress_bar=True)
sim_matrix = cosine_similarity(sample_embeddings)

# Heatmap whose rows and columns are labelled with each comment's topic.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    sim_matrix,
    annot=True,
    fmt=".2f",
    cmap="YlOrRd",
    xticklabels=sample_labels,
    yticklabels=sample_labels,
    ax=ax,
)
ax.set_title("Comment Similarity Matrix (10 samples) — Labeled by Topic")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Get top 3 keywords for a given topic
def get_top_words(topic_id, n=3):
    """Return up to `n` leading words of a topic as a comma-separated string.

    Args:
        topic_id: Topic identifier passed to `topic_model.get_topic`.
        n: Maximum number of words to include.

    Returns:
        The words joined with ", " (fewer than `n` if the topic has fewer), or
        the sentinel "unknown topic" when the model returns nothing for the
        topic or the lookup fails.
    """
    try:
        words = topic_model.get_topic(topic_id)
        if not words:
            return "unknown topic"
        # Slicing already copes with topics that have fewer than n words.
        return ", ".join(w for w, _ in words[:n])
    except Exception:
        # Best-effort label: any lookup error maps to the sentinel.
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt/SystemExit propagate.
        return "unknown topic"

# Step 2: Collect 10 comments from 10 different valid topics
selected_comments, selected_labels = [], []
used_topics = set()

# Walk the rows in order and take the first comment of every topic that has a
# usable label, until ten comments have been collected.
labelled_rows = df.dropna(subset=["comment_body", "bertopic_topic"])
for body, raw_topic in zip(labelled_rows["comment_body"], labelled_rows["bertopic_topic"]):
    topic_id = int(raw_topic)
    if topic_id not in used_topics:
        label = get_top_words(topic_id)
        # Topics without a usable label are not marked as used, so they are
        # looked up again if they reappear.
        if label != "unknown topic":
            selected_comments.append(str(body))
            selected_labels.append(label)
            used_topics.add(topic_id)
    if len(selected_comments) == 10:
        break

# Step 3: Generate embeddings and similarity
comment_embeddings = embedding_model.encode(selected_comments, show_progress_bar=True)
sim_matrix = cosine_similarity(comment_embeddings)

# Step 4: Plot the similarity matrix
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(
    sim_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    xticklabels=selected_labels,
    yticklabels=selected_labels,
    ax=ax,
)
ax.set_title("Comment Similarity Matrix — Top 3 Words per 10 Unique Topics", fontsize=15)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: rank topics by number of assigned rows and keep the ten largest.
topic_sizes = df["bertopic_topic"].value_counts()
top_topics = topic_sizes.head(10).index.tolist()

# Step 2: containers for one title per topic and that topic's label; they are
# filled by the selection loop further down in this cell.
selected_titles, selected_labels = [], []

def get_top_words(topic_id, n=3):
    """Return up to `n` leading words of a topic as a comma-separated label.

    Args:
        topic_id: Topic identifier passed to `topic_model.get_topic`.
        n: Maximum number of words to include.

    Returns:
        The words joined with ", ", or "Topic <topic_id>" when the model
        returns nothing for the topic or the lookup fails.
    """
    try:
        words = topic_model.get_topic(topic_id)
        if words:
            return ", ".join([w for w, _ in words[:n]])
        else:
            return f"Topic {topic_id}"
    except Exception:
        # Best-effort label: any lookup error falls back to the generic name.
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt/SystemExit propagate.
        return f"Topic {topic_id}"

# For every top topic take the title of the first row that carries that topic
# and has a title; topics without such a row are skipped.
has_title = df["submission_title"].notna()
for topic_id in top_topics:
    title_row = df[(df["bertopic_topic"] == topic_id) & has_title]
    if title_row.empty:
        continue
    selected_titles.append(str(title_row.iloc[0]["submission_title"]))
    selected_labels.append(get_top_words(topic_id))

# Step 3: Generate embeddings and similarity matrix
title_embeddings = embedding_model.encode(selected_titles, show_progress_bar=True)
sim_matrix = cosine_similarity(title_embeddings)

# Step 4: Plot heatmap (rows/columns labelled with each topic's leading words)
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(
    sim_matrix,
    annot=True,
    fmt=".2f",
    cmap="Oranges",
    xticklabels=selected_labels,
    yticklabels=selected_labels,
    ax=ax,
)
ax.set_title("Submission Title Similarity — Top 10 Most Frequent Topics", fontsize=15)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# 🔢 Step 1: rank topics by number of assigned rows and keep the TOP_N largest.
TOP_N = 20
topic_sizes = df["bertopic_topic"].value_counts()
top_topics = topic_sizes.head(TOP_N).index.tolist()

# 🧠 Helper: Get top 3 topic words
def get_top_words(topic_id, n=3):
    """Return up to `n` leading words of a topic as a comma-separated label.

    Args:
        topic_id: Topic identifier passed to `topic_model.get_topic`.
        n: Maximum number of words to include.

    Returns:
        The words joined with ", ", or "Topic <topic_id>" when the model
        returns nothing for the topic or the lookup fails.
    """
    try:
        words = topic_model.get_topic(topic_id)
        if words:
            return ", ".join([w for w, _ in words[:n]])
        else:
            return f"Topic {topic_id}"
    except Exception:
        # Best-effort label: any lookup error falls back to the generic name.
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt/SystemExit propagate.
        return f"Topic {topic_id}"

# 🔍 Step 2: for every top topic take the title of the first row that carries
# that topic and has a title; topics without such a row are skipped.
selected_titles, selected_labels = [], []

has_title = df["submission_title"].notna()
for topic_id in top_topics:
    title_row = df[(df["bertopic_topic"] == topic_id) & has_title]
    if title_row.empty:
        continue
    selected_titles.append(str(title_row.iloc[0]["submission_title"]))
    selected_labels.append(get_top_words(topic_id))

# 🧠 Step 3: Compute embeddings + cosine similarity
title_embeddings = embedding_model.encode(selected_titles, show_progress_bar=True)
sim_matrix = cosine_similarity(title_embeddings)

# 📊 Step 4: Plot heatmap (no cell annotations at this size)
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    sim_matrix,
    annot=False,
    fmt=".2f",
    cmap="YlGnBu",
    xticklabels=selected_labels,
    yticklabels=selected_labels,
    ax=ax,
)
ax.set_title("Submission Title Similarity — Top 20 Topics (Top 3 Keywords Each)", fontsize=16)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


 Hierarchical Clustering on Comments

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: rank topics by number of assigned rows and keep the TOP_N largest.
TOP_N = 20
topic_sizes = df["bertopic_topic"].value_counts()
top_topics = topic_sizes.head(TOP_N).index.tolist()

# Step 2: containers for one comment per topic and that topic's label; they
# are filled by the selection loop further down in this cell.
selected_comments, labels = [], []

def get_top_words(topic_id, n=3):
    """Return up to `n` leading words of a topic joined with ", ", or
    "Topic <topic_id>" when the model returns nothing for that topic."""
    topic_terms = topic_model.get_topic(topic_id)
    if not topic_terms:
        return f"Topic {topic_id}"
    leading_words = [word for word, _ in topic_terms[:n]]
    return ", ".join(leading_words)

# For every top topic take the first row that carries that topic and has a
# comment; topics without such a row are skipped.
for topic_id in top_topics:
    row = df[(df["bertopic_topic"] == topic_id) & (df["comment_body"].notna())]
    if not row.empty:
        comment = row.iloc[0]["comment_body"]
        selected_comments.append(str(comment))
        labels.append(get_top_words(topic_id))

# Step 3: Embed comments and compute cosine distance
comment_embeddings = embedding_model.encode(selected_comments, show_progress_bar=True)
# Kept for inspection; it is not what the clustering below consumes.
distance_matrix = 1 - cosine_similarity(comment_embeddings)

# Step 4: Apply hierarchical clustering
# scipy's linkage() interprets a 2-D array as raw observation vectors, not as
# a square distance matrix, and Ward linkage is only defined for Euclidean
# distances. The embeddings are therefore scaled to unit length and clustered
# directly: between unit vectors the squared Euclidean distance equals
# 2 * (1 - cosine similarity), so the cosine geometry is preserved.
embedding_norms = np.linalg.norm(comment_embeddings, axis=1, keepdims=True)
embedding_norms[embedding_norms == 0] = 1.0  # leave all-zero vectors unchanged
unit_embeddings = comment_embeddings / embedding_norms
linked = linkage(unit_embeddings, method='ward')

# Step 5: Plot dendrogram (leaves labelled with each topic's leading words)
plt.figure(figsize=(14, 8))
dendrogram(linked, labels=labels, leaf_rotation=45, leaf_font_size=11)
plt.title("Hierarchical Clustering of Comments (Top 20 Topics)", fontsize=16)
plt.tight_layout()
plt.show()


Hierarchical Clustering on Titles

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: rank topics by number of assigned rows and keep the TOP_N largest.
TOP_N = 20
topic_sizes = df["bertopic_topic"].value_counts()
top_topics = topic_sizes.head(TOP_N).index.tolist()

# Step 2: containers for one title per topic and that topic's label; they are
# filled by the selection loop further down in this cell.
selected_titles, labels = [], []

def get_top_words(topic_id, n=3):
    """Return up to `n` leading words of a topic joined with ", ", or
    "Topic <topic_id>" when the model returns nothing for that topic."""
    topic_terms = topic_model.get_topic(topic_id)
    if not topic_terms:
        return f"Topic {topic_id}"
    leading_words = [word for word, _ in topic_terms[:n]]
    return ", ".join(leading_words)

import numpy as np  # needed for the normalisation below; not imported by this cell otherwise

# For every top topic take the title of the first row that carries that topic
# and has a title; topics without such a row are skipped.
for topic_id in top_topics:
    row = df[(df["bertopic_topic"] == topic_id) & (df["submission_title"].notna())]
    if not row.empty:
        title = row.iloc[0]["submission_title"]
        selected_titles.append(str(title))
        labels.append(get_top_words(topic_id))

# Step 3: Embed titles and compute cosine distance
title_embeddings = embedding_model.encode(selected_titles, show_progress_bar=True)
# Kept for inspection; it is not what the clustering below consumes.
distance_matrix = 1 - cosine_similarity(title_embeddings)

# Step 4: Apply hierarchical clustering
# scipy's linkage() interprets a 2-D array as raw observation vectors, not as
# a square distance matrix, and Ward linkage is only defined for Euclidean
# distances. The embeddings are therefore scaled to unit length and clustered
# directly: between unit vectors the squared Euclidean distance equals
# 2 * (1 - cosine similarity), so the cosine geometry is preserved.
embedding_norms = np.linalg.norm(title_embeddings, axis=1, keepdims=True)
embedding_norms[embedding_norms == 0] = 1.0  # leave all-zero vectors unchanged
unit_embeddings = title_embeddings / embedding_norms
linked = linkage(unit_embeddings, method='ward')

# Step 5: Plot dendrogram (leaves labelled with each topic's leading words)
plt.figure(figsize=(14, 8))
dendrogram(linked, labels=labels, leaf_rotation=45, leaf_font_size=11)
plt.title("Hierarchical Clustering of Titles (Top 20 Topics)", fontsize=16)
plt.tight_layout()
plt.show()


Word clouds for the top 10 BERTopic topics

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Step 1: the ten topics with the most assigned rows.
topic_sizes = df["bertopic_topic"].value_counts()
top_topics = topic_sizes.head(10).index.tolist()

# Step 2: one word cloud per topic on a 5 x 2 grid.
plt.figure(figsize=(20, 15))

for i, topic_id in enumerate(top_topics):
    topic_words = topic_model.get_topic(topic_id)

    # Topics for which the model returns nothing leave their grid cell empty.
    if not topic_words:
        continue

    # (word, score) pairs -> {word: score}; the score acts as the word's weight.
    word_freq = dict(topic_words)

    wc = WordCloud(width=800, height=400, background_color='white', colormap='Set2')
    wc.generate_from_frequencies(word_freq)

    ax = plt.subplot(5, 2, i + 1)
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(f"Topic {topic_id}", fontsize=14)

plt.suptitle("BERTopic — Word Clouds of Top 10 Topics", fontsize=18)
# Leave room at the top of the figure for the super-title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


Word Clouds for Top 10 Louvain

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re
from nltk.corpus import stopwords

# Keep only rows that carry a Louvain label (independent copy of those rows).
# NOTE(review): this relies on the current `df` having a "louvain_community"
# column — confirm which file `df` was last loaded from before running.
has_community = df["louvain_community"].notna()
valid_df = df[has_community].copy()

# The ten communities with the most rows.
community_sizes = valid_df["louvain_community"].value_counts()
top_communities = community_sizes.head(10).index.tolist()

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Preprocess function
def clean_and_tokenize(text):
    """Lower-case `text`, remove every character that is neither a word
    character nor whitespace, and return the tokens that are longer than two
    characters and not in `stop_words`."""
    stripped = re.sub(r"[^\w\s]", "", text.lower())
    return [tok for tok in stripped.split() if tok not in stop_words and len(tok) > 2]

# Plot settings: one word cloud per community on a 5 x 2 grid.
plt.figure(figsize=(20, 15))

for i, community_id in enumerate(top_communities):
    in_community = valid_df["louvain_community"] == community_id
    community_texts = valid_df.loc[in_community, "comment_body"].dropna()

    # Token frequencies over all comments of the community; the 100 most
    # common tokens feed the cloud.
    token_counts = Counter()
    for text in community_texts:
        token_counts.update(clean_and_tokenize(str(text)))
    word_freq = dict(token_counts.most_common(100))

    wc = WordCloud(width=800, height=400, background_color='white', colormap='Set3')
    wc.generate_from_frequencies(word_freq)

    ax = plt.subplot(5, 2, i + 1)
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(f"Louvain Community {int(community_id)}", fontsize=14)

plt.suptitle("Word Clouds for Top 10 Louvain Communities", fontsize=18)
# Leave room at the top of the figure for the super-title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from community import best_partition
import pandas as pd
import numpy as np

# Step 1: Subset titles (e.g., top 500 non-empty ones).
# NOTE(review): `df` appears to be comment-level (it carries a `comment_body`
# column), so a submission title repeats once per collected comment. Without
# de-duplication the first 500 rows cover only a handful of distinct titles,
# and identical titles (cosine similarity 1.0) trivially dominate the graph.
title_df = (
    df[df["submission_title"].notna()]
    .drop_duplicates(subset="submission_title")
    .head(500)  # You can increase to 1000 if GPU allows
    .reset_index(drop=True)  # row label == row position in title_embeddings
)

# Step 2: Compute embeddings (one row per entry of `titles`)
titles = title_df["submission_title"].astype(str).tolist()
title_embeddings = embedding_model.encode(titles, show_progress_bar=True)

# Step 3: Compute cosine similarity between every pair of titles
similarity_matrix = cosine_similarity(title_embeddings)

# Step 4: Build similarity graph with threshold
threshold = 0.6
G = nx.Graph()

# Only the strict upper triangle (i < j) is inspected, so each unordered pair
# is considered once and the diagonal (self-similarity) is ignored. Selecting
# the pairs with NumPy replaces the original O(n^2) Python double loop.
pair_rows, pair_cols = np.triu_indices(len(similarity_matrix), k=1)
above_threshold = similarity_matrix[pair_rows, pair_cols] > threshold
G.add_weighted_edges_from(
    # Plain int / float keep node ids and weights as native Python objects
    (int(a), int(b), float(similarity_matrix[a, b]))
    for a, b in zip(pair_rows[above_threshold], pair_cols[above_threshold])
)

# Step 5: Run Louvain. A fixed seed keeps community ids stable across runs.
partition = best_partition(G, random_state=42)

# Step 6: Save results. Titles without any edge above the threshold never
# enter G, are absent from `partition`, and therefore map to NaN here.
title_df["louvain_title_community"] = title_df.index.map(partition)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re
from nltk.corpus import stopwords

# English stop words from NLTK, held in a set for fast membership tests
stop_words = {*stopwords.words("english")}

# Clean + tokenize function
def clean_and_tokenize(text):
    """Return the content words of ``text`` as a list of lower-case tokens.

    Punctuation (anything that is neither a word character nor whitespace)
    is stripped first, the result is split on whitespace, and tokens are
    kept only if they are longer than two characters and not present in the
    module-level ``stop_words`` collection. Because punctuation is removed
    before the stop-word lookup, "don't" is looked up as "dont".

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Surviving tokens in their original order.
    """
    stripped = re.sub(r"[^\w\s]", "", text.lower())
    kept = []
    for token in stripped.split():
        if len(token) <= 2 or token in stop_words:
            continue
        kept.append(token)
    return kept

# Get top 10 title communities (value_counts skips the NaN label carried by
# titles that were not assigned to any community)
top_comms = title_df["louvain_title_community"].value_counts().head(10).index.tolist()

# Plot settings: one figure holding a 5 x 2 grid, one panel per community
plt.figure(figsize=(20, 15))

for i, comm in enumerate(top_comms):
    # All non-null titles that belong to this community
    texts = title_df.loc[
        title_df["louvain_title_community"] == comm, "submission_title"
    ].dropna()

    # Count tokens incrementally; only the 100 most frequent feed the cloud
    token_counts = Counter()
    for txt in texts:
        token_counts.update(clean_and_tokenize(str(txt)))
    word_freq = dict(token_counts.most_common(100))

    # WordCloud.generate_from_frequencies raises ValueError on an empty
    # mapping (e.g. a community whose titles contain only stop words or very
    # short tokens); skip it and leave its grid slot blank.
    if not word_freq:
        print(f"Skipping Louvain title community {int(comm)}: no tokens left after cleaning")
        continue

    wc = WordCloud(width=800, height=400, background_color='white', colormap='tab10')
    wc.generate_from_frequencies(word_freq)

    plt.subplot(5, 2, i + 1)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    # The label column is float whenever some titles are unassigned (NaN),
    # so format as int to show "3" rather than "3.0"
    plt.title(f"Louvain Title Community {int(comm)}", fontsize=14)

plt.suptitle("Word Clouds for Top 10 Louvain Title Communities", fontsize=18)
# Reserve the top 5% of the figure for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
import umap
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import re
from nltk.corpus import stopwords

# Step 1: Clean subset (remove NaN community)
subset = title_df.dropna(subset=["louvain_title_community"]).copy()

# Step 2: Filter to top 6 largest communities for clarity.
# .copy() makes the filtered frame independent, so adding columns below does
# not trigger pandas' SettingWithCopyWarning.
top_comms = subset["louvain_title_community"].value_counts().head(6).index.tolist()
subset = subset[subset["louvain_title_community"].isin(top_comms)].copy()

# Integer community labels for the legend; the stored labels are float
# whenever some titles were unassigned (NaN) and would render as "3.0".
subset["community"] = subset["louvain_title_community"].astype(int)

# Step 3: Reduce embeddings with UMAP.
# Row labels of `subset` are used as positions into `title_embeddings`; this
# relies on title_df having a 0..n-1 index aligned with the encoded titles.
title_embeddings_subset = title_embeddings[subset.index]
# Fixed seed so the 2-D layout is reproducible across kernel restarts
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42)
umap_coords = reducer.fit_transform(title_embeddings_subset)
subset["x"] = umap_coords[:, 0]
subset["y"] = umap_coords[:, 1]

# Step 4: Plot
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=subset,
    x="x", y="y",
    hue="community",
    palette="tab10",
    alpha=0.85,
    s=90,
    edgecolor='white',
    linewidth=0.5
)
plt.title("UMAP Projection of Title Embeddings (Top 6 Louvain Communities)", fontsize=16)
# UMAP axes carry no interpretable units, so labels and ticks are hidden
plt.xlabel("")
plt.ylabel("")
plt.xticks([])
plt.yticks([])
plt.legend(title="Community", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
from collections import Counter
import pandas as pd
import re
from nltk.corpus import stopwords

# English stop words from NLTK, held in a set for fast membership tests
stop_words = {*stopwords.words("english")}

def clean_and_tokenize(text):
    """Tokenize ``text`` into lower-case content words.

    Steps: lower-case the input, delete every character that is neither a
    word character nor whitespace, split on whitespace, then discard tokens
    of two characters or fewer and tokens found in the module-level
    ``stop_words`` collection. Punctuation is removed before the stop-word
    lookup, so "don't" is looked up as "dont".

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Remaining tokens in their original order.
    """
    words = re.sub(r"[^\w\s]", "", text.lower()).split()
    return list(filter(lambda w: len(w) > 2 and w not in stop_words, words))

# Six largest Louvain title communities (value_counts sorts descending)
top_comms = title_df["louvain_title_community"].value_counts().index[:6].tolist()
subset = title_df.loc[title_df["louvain_title_community"].isin(top_comms)]

# One summary row per community: its ten most frequent title words
summary = []
for comm in top_comms:
    in_comm = subset["louvain_title_community"] == comm
    word_counts = Counter()
    for title in subset.loc[in_comm, "submission_title"].dropna().astype(str):
        word_counts.update(clean_and_tokenize(title))
    ranked = word_counts.most_common(10)
    summary.append({
        "Community": comm,
        "Top Words": ", ".join(word for word, _ in ranked),
    })

# Build the table; the bare last expression renders it as the cell output
summary_df = pd.DataFrame(summary)
summary_df
