In [7]:
# Import required packages.
from pyspark.sql import SparkSession
import sparknlp
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

# Obtain the Spark session for this notebook under the application name
# "network"; getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("network")
    .getOrCreate()
)

def visualize_graph(G, color):
    """Draw graph ``G`` on a 7x7 inch figure without tick marks or node labels.

    Args:
        G: Graph to draw; passed to ``nx.spring_layout`` and ``nx.draw_networkx``.
        color: Node colours, forwarded as ``node_color`` and mapped through
            the "Set2" colormap.
    """
    # Fixed seed so the spring layout is reproducible between runs.
    layout = nx.spring_layout(G, seed=42)

    plt.figure(figsize=(7, 7))
    # Node positions carry no meaningful scale, so hide both axes' ticks.
    plt.xticks([])
    plt.yticks([])
    nx.draw_networkx(
        G,
        pos=layout,
        with_labels=False,
        node_color=color,
        cmap="Set2",
    )
    plt.show()


def visualize_embedding(h, color, epoch=None, loss=None):
    """Scatter-plot the first two columns of an embedding on a 7x7 inch figure.

    Args:
        h: Embedding object supporting ``.detach().cpu().numpy()`` and yielding
            a 2-D array (presumably a torch tensor; verify against caller).
        color: Per-point colours, mapped through the "Set2" colormap.
        epoch: Optional epoch number shown in the x-axis caption.
        loss: Optional loss value exposing ``.item()``; shown in the caption.

    The caption is drawn only when both ``epoch`` and ``loss`` are given.
    """
    plt.figure(figsize=(7, 7))
    plt.xticks([])
    plt.yticks([])

    # Detach from the autograd graph and move to host memory before plotting.
    points = h.detach().cpu().numpy()
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=140, c=color, cmap="Set2")

    caption_requested = not (epoch is None or loss is None)
    if caption_requested:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)
    plt.show()

24/11/25 16:44:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [8]:
#Asked ChatGPT: How can I create a graph from two csv files? (one with nodes and one with edges)

# File paths (replace with your file paths)
nodes_file = "vertices/vertices.csv"  # CSV with one column, no header
edges_file = "edges/edges.csv"  # CSV with two columns, no header

# Both files are read with dtype=str and keep_default_na=False so that every
# field is kept as the literal text found in the file. With pandas' defaults,
# tokens such as "null", "nan", "NA" or "None" are silently converted to NaN
# and digit-only tokens to integers, which would corrupt node labels or make
# the labels in the two files disagree.

# Load nodes
nodes_df = pd.read_csv(
    nodes_file, header=None, names=["node"], dtype=str, keep_default_na=False
)
nodes = nodes_df["node"].tolist()  # Convert to a list of node labels

# Load edges
edges_df = pd.read_csv(
    edges_file,
    header=None,
    names=["source", "target"],
    dtype=str,
    keep_default_na=False,
)
edges = edges_df.values.tolist()  # Convert to a list of [source, target] pairs

# Create a directed graph; nodes are added explicitly so that nodes which
# appear in no edge are still part of the graph.
G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [17]:
# Confirm that G was built as a directed graph.
print(G.is_directed())

True


In [18]:
# Undirected version of G, used for the connected-component analysis.
G_undirected = G.to_undirected()

# Sanity check: the converted graph must no longer report itself as directed.
print(G_undirected.is_directed())  # expected: False

False


In [9]:
# Total number of nodes in the directed graph.
G.number_of_nodes()

2281

In [10]:
# Total number of directed edges in the graph.
G.number_of_edges()

2466

In [11]:
# Density of the directed graph: edges / (nodes * (nodes - 1)),
# i.e. the fraction of all possible directed edges that are present.
nx.density(G)

0.0004741687625464362

In [19]:
# Materialise the connected components of the undirected graph so they can be
# counted and inspected more than once.
components = [*nx.connected_components(G_undirected)]

In [20]:
# Report how many connected components the undirected graph splits into.
print(f"Number of connected components: {len(components)}")

Number of connected components: 222


In [12]:
# Girvan-Newman community detection on G. The call returns a generator; each
# item it yields is one partition of the nodes (a tuple of node sets).
from networkx.algorithms.community import girvan_newman
communities = girvan_newman(G)

In [14]:
# Displays the generator object itself; no partition is shown until it is iterated.
communities

<generator object girvan_newman at 0x7fb7c3ecdeb0>

In [15]:
import itertools
# Print the first two partitions produced by the Girvan-Newman generator.
# NOTE(review): the loop variable reuses the name `communities`, so after this
# cell the name refers to the last printed partition (a tuple of node sets)
# rather than the generator, and re-running the cell iterates over that tuple
# instead. Left unchanged because later cells may depend on the rebinding --
# confirm, then give the loop variable its own name.
for communities in itertools.islice(communities, 2):
    print(tuple(communities))

({'rselfhelp', 'rgetmotivatedbuddies', 'eta', 'clarification', 'rgetdisciplined', 'rdecidingtobebetter', 'rgoodbyedepression', 'rgetmotivated'}, {'volunteer', 'face', 'circle', 'avoided', 'disability', 'journeys', 'afford', 'academy', 'toxic', 'anyones', 'sexuality', 'beings', 'brief', 'degree', 'fall', 'solved', 'genuine', 'friends', 'luckily', 'looking', 'initiative', 'noticed', 'journal', 'complain', 'cat', 'planet', 'sword', 'dread', 'consistently', 'restaurant', 'pillblack', 'fear', 'mistakes', 'reasons', 'centres', 'compass', 'wisely', 'problems', 'whether', 'included', 'overtime', 'normal', 'islam', 'bl', 'instant', 'workouts', 'beating', 'scheduled', 'looks', 'sad', 'ounce', 'output', 'excited', 'writers', 'shaming', 'experimenting', 'clothes', 'black', 'anyone', 'techniques', 'proud', 'hating', 'elementary', 'intermediate', 'heartedly', 'brain', 'achieving', 'halfway', 'instigating', 'spaghetti', 'tips', 'decency', 'retraining', 'listener', 'kid', 'shows', 'east', 'brutally', 