In [1]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName("network") \
        .getOrCreate()

24/11/26 13:53:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
import networkx as nx
import pandas as pd

In [3]:
nodes_file = "nodes_network2/nodes.csv"  
edges_file = "edges_network2/edges.csv" 

# Load nodes
nodes_df = pd.read_csv(nodes_file)
nodes = nodes_df["node"].tolist()  # Convert to a list of nodes

In [4]:
# Load edges
edges_df = pd.read_csv(edges_file)

In [5]:
# Create graph
G = nx.Graph()

#Add nodes
G.add_nodes_from(nodes)  

#Add edges
for _, row in edges_df.iterrows():
    G.add_edge(row['node1_norm'], row['node2_norm'], weight=row['weight'])

Basic exploration

In [6]:
nx.number_of_nodes(G)

728

In [7]:
nx.number_of_edges(G)

6424

In [8]:
nx.density(G)

0.0242755868615566

In [10]:
nx.number_connected_components(G)

7

In [11]:
components = list(nx.connected_components(G))
print(components)

[{'main', 'seek', 'face', 'communicate', 'level', 'technique', 'memory', 'loss', 'actually', 'circle', 'head', 'dark', 'beat', 'traits', 'toxic', 'truth', 'community', 'task', 'consider', 'intention', 'option', 'degree', 'friendship', 'decision', 'text', 'fire', 'genuine', 'quit', 'anxious', 'awesome', 'adhd', 'stress', 'broken', 'regret', 'teach', 'wish', 'bear', 'feeling', 'able', 'alcohol', 'journal', 'mentally', 'complain', 'break', 'university', 'sit', 'romantic', 'role', 'show', 'reason', 'stupid', 'real', 'death', 'tv', 'happy', 'aspect', 'build', 'wait', 'pretty', 'rude', 'fear', 'believe', 'mistakes', 'girlfriend', 'waste', 'outcome', 'effect', 'contact', 'brother', 'reaction', 'school', 'choose', 'information', 'system', 'movie', 'behavior', 'pull', 'education', 'deeply', 'app', 'serious', 'inner', 'normal', 'perhaps', 'muscle', 'necessary', 'fun', 'cool', 'list', 'rule', 'rather', 'sad', 'hair', 'girl', 'program', 'stay', 'journey', 'unique', 'word', 'lol', 'play', 'delete',

Great! Less components than in the previous notebook. Exclude last 6 components since they are too small

In [13]:
#Asked ChatGPT: how can I stay with only the largest component in networkx?
largest_component = max(components, key=len) 
G_largest = G.subgraph(largest_component).copy()

In [15]:
nx.number_connected_components(G_largest)

1

Now, run community detection algorithm

In [17]:
communities = nx.community.louvain_communities(G_largest, seed=123)

In [19]:
len(communities)

11

In [20]:
for community in communities:
    print(community)
    print("------------------------------")

{'ton', 'room', 'actually', 'important', 'eat', 'consistent', 'workout', 'motivated', 'useless', 'extremely', 'food', 'fast', 'gym', 'future', 'tough', 'read', 'allow', 'music', 'ever', 'look', 'eating', 'sleep', 'doctor', 'quit', 'spend', 'addicted', 'rid', 'schedule', 'mentally', 'bed', 'zero', 'become', 'version', 'huge', 'motivation', 'activity', 'already', 'extra'}
------------------------------
{'insecure', 'provide', 'forget', 'nobody', 'consider', 'somehow', 'personality', 'check', 'compassion', 'absolutely', 'joke', 'worry', 'idk', 'wish', 'laugh', 'online', 'boyfriend', 'made', 'meet', 'setting', 'abuse', 'date', 'funny', 'loser', 'male', 'ugly', 'stupid', 'seem', 'country', 'horrible', 'awful', 'wait', 'interested', 'pretty', 'cause', 'attract', 'contact', 'nice', 'cry', 'smart', 'insecurities', 'approach', 'beautiful', 'shitty', 'special', 'woman', 'struggle', 'hurts', 'guy', 'give', 'sex', 'bc', 'cool', 'man', 'ignore', 'graduate', 'sad', 'willing', 'accomplish', 'girl', '

Not very informative. Try with other method

In [34]:
from networkx.algorithms.community import girvan_newman
communities = nx.community.girvan_newman(G_largest)

In [35]:
import itertools
for communities in itertools.islice(communities, 2):
    print(tuple(c for c in communities))

({'main', 'seek', 'face', 'communicate', 'level', 'technique', 'memory', 'loss', 'actually', 'circle', 'head', 'dark', 'beat', 'traits', 'toxic', 'truth', 'community', 'task', 'consider', 'intention', 'option', 'degree', 'friendship', 'decision', 'text', 'fire', 'genuine', 'quit', 'anxious', 'awesome', 'adhd', 'stress', 'broken', 'regret', 'teach', 'wish', 'bear', 'feeling', 'able', 'alcohol', 'journal', 'mentally', 'complain', 'break', 'university', 'sit', 'romantic', 'role', 'show', 'reason', 'stupid', 'real', 'death', 'tv', 'happy', 'aspect', 'build', 'wait', 'pretty', 'rude', 'fear', 'believe', 'mistakes', 'girlfriend', 'waste', 'outcome', 'effect', 'contact', 'brother', 'reaction', 'school', 'choose', 'information', 'system', 'movie', 'behavior', 'pull', 'education', 'deeply', 'app', 'serious', 'inner', 'normal', 'perhaps', 'muscle', 'necessary', 'fun', 'cool', 'list', 'rule', 'rather', 'sad', 'hair', 'girl', 'program', 'stay', 'journey', 'unique', 'word', 'lol', 'play', 'delete',

It is observed that using Girvan-Newman algorithm created very unbalanced communities. I'll run it with more iterations to see if eventually it is able to create larger groups, even if some of the first communities remain very small.

Asked ChatGTP:

"communities = nx.community.girvan_newman(G_largest)
for communities in itertools.islice(communities, 10):
    print(tuple(c for c in communities))

how can I edit to print only the last iteration?"

In [43]:
# Get the first 10 iterations using islice
communities = nx.community.girvan_newman(G_largest)
found_communities = list(itertools.islice(communities, 20))

# Print the last iteration
print(tuple(c for c in found_communities[-1]))

({'main', 'seek', 'face', 'communicate', 'level', 'technique', 'memory', 'loss', 'actually', 'circle', 'head', 'dark', 'beat', 'traits', 'toxic', 'truth', 'community', 'task', 'consider', 'intention', 'option', 'degree', 'friendship', 'decision', 'text', 'fire', 'genuine', 'quit', 'anxious', 'awesome', 'adhd', 'stress', 'broken', 'regret', 'teach', 'wish', 'bear', 'feeling', 'able', 'alcohol', 'journal', 'mentally', 'complain', 'break', 'university', 'sit', 'romantic', 'show', 'reason', 'stupid', 'real', 'death', 'tv', 'happy', 'aspect', 'build', 'rude', 'fear', 'believe', 'mistakes', 'girlfriend', 'waste', 'outcome', 'effect', 'contact', 'brother', 'reaction', 'school', 'choose', 'information', 'system', 'movie', 'behavior', 'pull', 'education', 'deeply', 'app', 'serious', 'inner', 'normal', 'perhaps', 'muscle', 'necessary', 'fun', 'cool', 'list', 'rule', 'rather', 'sad', 'hair', 'girl', 'program', 'stay', 'journey', 'unique', 'word', 'lol', 'play', 'delete', 'reward', 'expectation', 