## A Network Analysis On The Subreddit r/mentalhealth:
**Who leads the conversation on mental health?**

In [None]:
!pip install praw
!pip install networkx

In [None]:
import networkx as nx
import praw
import networkx as nx
from google.colab import userdata
import pandas as pd
import time
import warnings
import re
import logging
from sentence_transformers import SentenceTransformer

# Quieten the Reddit client libraries: only ERROR and above get through.
for _noisy_logger in ("praw", "prawcore"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Credentials are read from Colab's secret store, in the order
# (client_id, client_secret, user_agent); a later cell unpacks this tuple.
reddit = tuple(
    userdata.get(secret_name)
    for secret_name in ("client_id", "client_secret", "user_agent")
)


In [None]:
# --- Collection parameters -------------------------------------------------
SUBREDDIT_NAME = "mentalhealth"
TARGET_COMMENTS = 5000   # stop once this many comments have been stored
TARGET_USERS = 5000      # ...or once this many distinct users have been seen
TOP_POSTS_LIMIT = 50     # number of "top" submissions to walk
NEW_POSTS_LIMIT = 50     # number of "new" submissions to walk as a fallback

# --- Collection state (reset every time this cell runs) --------------------
g = nx.DiGraph()         # reply graph: edge replier -> author being replied to
unique_users = set()
total_comments = 0
comment_data = []

# `reddit` is the credential tuple from the previous cell the first time this
# cell runs, but is rebound to a praw.Reddit client below. Only unpack while it
# is still the tuple, so re-running this cell does not fail on the unpack; on a
# re-run the credentials from the first run are still in scope.
if isinstance(reddit, tuple):
    client_id, client_secret, user_agent = reddit

# Initialize Reddit instance
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

subreddit = reddit.subreddit(SUBREDDIT_NAME)

def process_submission(submission):
    """Fold one submission's comments into the module-level graph and comment list.

    Every comment that has both an author and a body is recorded in
    ``comment_data`` and its author is added to ``g`` and ``unique_users``.
    When the author of the comment's parent (the post or another comment of
    this submission) is known and is a different user, the edge
    ``commenter -> parent author`` is created with weight 1, or its weight is
    incremented if it already exists.

    Args:
        submission: A PRAW submission whose comment tree is walked.

    Returns:
        True as soon as ``total_comments`` reaches ``TARGET_COMMENTS`` or
        ``unique_users`` reaches ``TARGET_USERS``; False if the whole
        submission was processed without reaching either target.
    """
    global total_comments

    # limit=0: do not fetch any "more comments" placeholders (see PRAW docs).
    submission.comments.replace_more(limit=0)

    # id (submission or comment) -> author name, used to resolve reply targets.
    authors_by_id = {}

    poster = submission.author
    if poster:
        poster_name = str(poster)
        authors_by_id[submission.id] = poster_name
        unique_users.add(poster_name)
        g.add_node(poster_name)

    for reply in submission.comments.list():
        # Skip comments with no author or no body.
        if reply.author is None or reply.body is None:
            continue

        replier = str(reply.author)
        # parent_id is "<prefix>_<id>"; keep only the id part.
        parent_key = reply.parent_id.split("_")[1]

        authors_by_id[reply.id] = replier
        unique_users.add(replier)
        g.add_node(replier)

        replied_to = authors_by_id.get(parent_key)
        if replied_to and replied_to != replier:
            if not g.has_edge(replier, replied_to):
                g.add_edge(replier, replied_to, weight=1)
            else:
                g[replier][replied_to]['weight'] += 1

        comment_data.append({"user": replier, "comment": reply.body})
        total_comments += 1

        # Either target being reached ends the whole collection run.
        if not (total_comments < TARGET_COMMENTS and len(unique_users) < TARGET_USERS):
            return True

    return False

# Pass 1: walk the subreddit's top submissions until a target is reached.
for top_post in subreddit.top(limit=TOP_POSTS_LIMIT):
    targets_reached = process_submission(top_post)
    if targets_reached:
        break

# Pass 2: only when neither target has been reached, continue with new posts.
if not (total_comments >= TARGET_COMMENTS or len(unique_users) >= TARGET_USERS):
    for new_post in subreddit.new(limit=NEW_POSTS_LIMIT):
        if process_submission(new_post):
            break

# Build the comment table in a single step from the accumulated records.
df = pd.DataFrame(comment_data)

print("\n=== Collection Complete ===")
print(f"Total comments collected: {len(df)}")
print(f"Unique users collected: {len(unique_users)}")
print(f"Total nodes in graph: {g.number_of_nodes()}")
print(f"Total edges in graph: {g.number_of_edges()}")

# Persist both artefacts; later cells read these files back from disk.
nx.write_graphml(g, "reddit_mentalhealth_combined.graphml")
df.to_csv("reddit_mentalhealth_comments.csv", index=False)
print("Graph and comments DataFrame saved.")


**Measuring Influence Based on Degree Centrality, PageRank, and Betweenness Centrality**

In [None]:
top_k = 10
centrality_degree = nx.degree_centrality(g)

# Rank users from highest to lowest score and keep the first `top_k`.
degree_ranking = sorted(
    centrality_degree.items(), key=lambda pair: pair[1], reverse=True
)

print("\nTop 10 users by degree centrality:")
for username, score in degree_ranking[:top_k]:
    # Nodes are keyed by username, so the key itself is the label.
    print(f"{username}: Degree Centrality = {score:.4f}")

In [None]:
# Weighted PageRank: the 'weight' edge attribute (reply counts) is used.
pagerank = nx.pagerank(g, weight='weight')

pagerank_ranking = sorted(pagerank.items(), key=lambda pair: pair[1], reverse=True)

print("\nTop 10 users by PageRank:")
for username, score in pagerank_ranking[:top_k]:
    print(f"{username}: PageRank = {score:.4f}")

In [None]:
# Betweenness is estimated from a random sample of pivot nodes rather than
# computed exactly. Two safeguards:
#   * the sample size is capped at the node count, because sampling more
#     pivots than there are nodes raises an error on a small graph;
#   * the seed is fixed so the ranking is the same on every run.
btw_pivots = min(10, g.number_of_nodes())
btw = nx.betweenness_centrality(g, k=btw_pivots, seed=42)

print("\nTop 10 users by Betweenness:")
for user in sorted(btw, key=btw.get, reverse=True)[:top_k]:
    print(f"{user}: Betweenness = {btw[user]:.4f}")

## Extension: Answering an additional question for our stakeholder
**“What sub-communities of users exist in the subreddit r/mentalhealth based on the semantic patterns in their comments?”**

**aggregate comments per user**

In [None]:
# Read every column as plain text and switch off pandas' default NA parsing.
# Otherwise comment bodies such as "NA", "null" or an empty string come back
# as NaN (a float), which makes the " ".join below raise TypeError, and
# usernames such as "nan" are silently dropped by groupby.
df = pd.read_csv(
    "reddit_mentalhealth_comments.csv",
    dtype=str,
    keep_default_na=False,
)

# Aggregate all comments per user into one space-separated document
user_text = (
    df.groupby("user")["comment"]
      .apply(" ".join)
      .reset_index()
)

print("Users:", len(user_text))


**create semantic embeddings**

In [None]:
from sentence_transformers import SentenceTransformer

# One embedding per user: each user's concatenated comments are encoded as a
# single document, in the same row order as `user_text`.
model = SentenceTransformer("all-MiniLM-L6-v2")

user_documents = user_text["comment"].tolist()
embeddings = model.encode(user_documents, show_progress_bar=True)


**dimensionality reduction (UMAP)**

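We use UMAP to project the high-dimensional comment embeddings down to two dimensions so that users can be plotted; the clustering below is still run on the full embeddings.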

In [None]:
import umap

# UMAP settings kept together; random_state is fixed for a repeatable layout.
umap_settings = {
    "n_neighbors": 15,
    "min_dist": 0.1,
    "n_components": 2,
    "random_state": 42,
}
reducer = umap.UMAP(**umap_settings)

# 2-D coordinates, one row per user, in the same order as `embeddings`.
embedding_2d = reducer.fit_transform(embeddings)

**similarity-based clustering**

The number of clusters (k = 4) was chosen with the elbow method; see the elbow plot at the very bottom of the notebook.

In [None]:
from sklearn.cluster import KMeans

# n_init is set explicitly so this fit uses the same number of restarts as the
# elbow-method cell (n_init=10) and does not depend on the scikit-learn
# version's default.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)

# Clusters are fitted on the full embeddings, not on the 2-D UMAP projection.
clusters = kmeans.fit_predict(embeddings)

# Attach the label and the 2-D plotting coordinates to each user row.
user_text["cluster"] = clusters
user_text["x"] = embedding_2d[:, 0]
user_text["y"] = embedding_2d[:, 1]

In [None]:
# groupby iterates over cluster labels in ascending order.
for cluster_label, members in user_text.groupby("cluster"):
    print(f"\nCluster {cluster_label}")
    # First five usernames of the cluster, in table order.
    print(members["user"].head(5).tolist())

**Name Clusters Meaningfully**

Look at what the users in each cluster talk about

In [None]:
import numpy as np

# Cluster centres of the fitted KMeans model (one row per cluster).
centers = kmeans.cluster_centers_

# Derive the cluster count from the fitted model instead of repeating the
# literal, so it cannot drift from the value passed to KMeans.
n_clusters = len(centers)

def closest_users_to_center(X, labels, center_idx, users, top_n=5, cluster_centers=None):
    """Return the users of one cluster that lie nearest to its centre.

    Args:
        X: 2-D array of points, one row per user.
        labels: Cluster label of each row of ``X``.
        center_idx: Label of the cluster to inspect; also the row of
            ``cluster_centers`` used as that cluster's centre.
        users: pandas Series of usernames aligned row-for-row with ``X``.
        top_n: Maximum number of users to return.
        cluster_centers: Array of cluster centres, one row per cluster.
            Defaults to the module-level ``centers`` for backward
            compatibility.

    Returns:
        A pandas Series with up to ``top_n`` usernames, ordered from the
        closest to the farthest (Euclidean distance to the centre).
    """
    if cluster_centers is None:
        # Original behaviour: use the centres computed at module level.
        cluster_centers = centers

    in_cluster = np.asarray(labels) == center_idx
    cluster_points = np.asarray(X)[in_cluster]
    cluster_users = users[in_cluster]

    distances = np.linalg.norm(cluster_points - cluster_centers[center_idx], axis=1)
    closest_idx = np.argsort(distances)[:top_n]

    # Positional lookup: closest_idx indexes rows within the cluster subset.
    return cluster_users.iloc[closest_idx]

# For every cluster, show the users whose embeddings sit nearest its centre.
for cluster_label in range(n_clusters):
    representatives = closest_users_to_center(
        embeddings, clusters, cluster_label, user_text["user"]
    )
    print(f"\nCluster {cluster_label} representative users:")
    print(representatives)

**Representative Users Per Cluster: Identify Highest PageRank Users**

In [None]:
import pandas as pd

# PageRank scores as a table: one row per user, with the username both as the
# index and as a regular "user" column for merging.
pagerank_df = pd.DataFrame.from_dict(pagerank, orient="index", columns=["pagerank"])
pagerank_df["user"] = pagerank_df.index

# Left join keeps every clustered user, including any without a score.
final_df = user_text.merge(pagerank_df, on="user", how="left")

for cluster_id in final_df["cluster"].unique():
    cluster_members = final_df[final_df["cluster"] == cluster_id]
    ranked_members = cluster_members.sort_values("pagerank", ascending=False)

    print(f"\nCluster {cluster_id} – top influential users:")
    print(ranked_members.head(5)[["user", "pagerank"]])

**Top distinguishing keywords per cluster**

Interpret the clusters and assign label names

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# cluster id -> Series of the 10 highest mean TF-IDF terms in that cluster
cluster_keywords = {}

# Sorted so clusters are reported in ascending order, matching the earlier
# per-cluster printout.
for cluster_id in sorted(final_df["cluster"].unique()):
    # One document per user: that user's concatenated comments.
    cluster_text = final_df[final_df["cluster"] == cluster_id]["comment"].tolist()

    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=1000,
        # A term must appear in at least 3 users' documents. The threshold is
        # clamped to the cluster size because the vectorizer raises ValueError
        # when min_df exceeds the number of documents.
        min_df=min(3, len(cluster_text)),
    )

    X = vectorizer.fit_transform(cluster_text)
    # Mean TF-IDF weight of each term across the cluster's documents.
    tfidf_scores = X.mean(axis=0).A1

    keywords = pd.Series(
        tfidf_scores,
        index=vectorizer.get_feature_names_out()
    ).sort_values(ascending=False).head(10)

    cluster_keywords[cluster_id] = keywords

# Print keywords
for k, words in cluster_keywords.items():
    print(f"\nCluster {k} top keywords:")
    print(words.index.tolist())

**Compare average PageRank across clusters**

In [None]:
# Average PageRank of the users in each cluster, most influential first.
mean_pagerank_by_cluster = final_df.groupby("cluster")["pagerank"].mean()
cluster_influence = mean_pagerank_by_cluster.sort_values(ascending=False)

print(cluster_influence)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Two-column table (cluster, mean_pagerank) for seaborn.
plot_df = cluster_influence.reset_index()
plot_df.columns = ['cluster', 'mean_pagerank']

plt.figure(figsize=(8, 5))
sns.barplot(
    data=plot_df,
    x='cluster',
    y='mean_pagerank',
    palette='viridis',
)

plt.xlabel("Cluster")
plt.ylabel("Mean PageRank")
plt.title("Mean PageRank by Cluster")
plt.show()


In [None]:
import networkx as nx

# Reload the reply graph that was written to disk during collection.
g = nx.read_graphml("reddit_mentalhealth_combined.graphml")

# Copy each user's cluster label and PageRank onto the matching graph node;
# users that are not nodes of the graph are skipped.
for user, cluster_label, pagerank_score in zip(
    final_df["user"], final_df["cluster"], final_df["pagerank"]
):
    if user not in g.nodes:
        continue
    node_attrs = g.nodes[user]
    node_attrs['cluster'] = int(cluster_label)
    node_attrs['pagerank'] = float(pagerank_score)

nx.write_graphml(g, "reddit_mentalhealth_clusters.graphml")
print("Graph saved with clusters and PageRank for Gephi visualization.")


**Visualizations to support the analysis**

In [None]:
# Users per cluster, ordered by cluster label.
cluster_sizes = final_df["cluster"].value_counts().sort_index()

# Labels are converted to strings so the x axis is categorical.
bar_labels = cluster_sizes.index.astype(str)
bar_heights = cluster_sizes.values

plt.figure()
plt.bar(bar_labels, bar_heights)
plt.title("Number of Users per Cluster")
plt.xlabel("Cluster")
plt.ylabel("Number of Users")
plt.show()


In [None]:

plt.figure()
# The cluster label must be passed as `c` (colour). The third positional
# argument of plt.scatter is `s` (marker size), so passing the labels
# positionally drew cluster 0 with size 0 and coloured nothing.
cluster_scatter = plt.scatter(
    user_text["x"],
    user_text["y"],
    c=user_text["cluster"],
    cmap="viridis",
)
# One legend entry per distinct cluster colour.
plt.legend(*cluster_scatter.legend_elements(), title="Cluster")
plt.xlabel("Semantic Dimension 1")
plt.ylabel("Semantic Dimension 2")
plt.title("Semantic Clustering of Users in r/mentalhealth")
plt.show()

**Model Justification for KMeans**

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit KMeans for k = 1..9 and record each model's inertia.
k_range = range(1, 10)
inertias = []

for k in k_range:
    kmeans_model = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
    inertias.append(kmeans_model.inertia_)

# Inertia against k: the bend in this curve is the candidate cluster count.
plt.figure()
plt.plot(list(k_range), inertias, marker='o')
plt.title("Elbow Method for Selecting Number of Clusters")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.show()

In [None]:
import json

def remove_widgets_metadata(notebook_path: str):
    """Strip the top-level ``metadata.widgets`` entry from a notebook file.

    The notebook is parsed as JSON. If it has a top-level ``metadata``
    mapping containing a ``widgets`` key, that key is deleted and the file is
    rewritten in place. If there is nothing to remove, the file is left
    untouched.

    Args:
        notebook_path: Path to the ``.ipynb`` file to clean.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(notebook_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    metadata = nb.get("metadata") if isinstance(nb, dict) else None
    if not isinstance(metadata, dict) or "widgets" not in metadata:
        # Nothing to strip: do not rewrite (and reformat) the file.
        return

    del metadata["widgets"]
    print(f"Removed metadata.widgets from {notebook_path}")

    # ensure_ascii=False keeps non-ASCII text (accents, emoji) readable in the
    # file instead of turning it into \uXXXX escapes.
    with open(notebook_path, "w", encoding="utf-8") as f:
        json.dump(nb, f, indent=2, ensure_ascii=False)

# Example usage: clean this project's notebook file in place.
# NOTE(review): the path is relative to the current working directory, so the
# notebook file must be present there — confirm before running.
remove_widgets_metadata("INST414_Final_Project (1).ipynb")
