In [61]:
from graphframes import *
from pyspark.sql import SparkSession
from itertools import combinations
import networkx as nx
import pandas as pd
from pyspark.sql import functions as F

In [1]:
spark = SparkSession \
        .builder \
        .appName("community detection") \
        .getOrCreate()

24/12/11 12:36:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
import time
import heapq

In [14]:
df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
df.withColumnRenamed('age', 'age2').show()

[Stage 0:>                                                          (0 + 1) / 1]

+----+-----+
|age2| name|
+----+-----+
|   2|Alice|
|   5|  Bob|
+----+-----+



                                                                                

In [28]:
# Load nodes
nodes_df = spark.read.csv("../data/network-topics-morality/finalized_nodes_topics_morality_net.csv", 
                         header=True).withColumnRenamed('_c0', 'id')

In [29]:
nodes_df.show(5)

24/12/11 13:07:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , node, category
 Schema: _c0, node, category
Expected: _c0 but found: 
CSV file: file:///home/natashacarpcast/Documents/selfimprovement/data/network-topics-morality/finalized_nodes_topics_morality_net.csv
+---+------------+----------+
| id|        node|  category|
+---+------------+----------+
|  0|      online|    topic3|
|  1|  misogynist|liwc_moral|
|  2|  productive|    topic1|
|  3|      absurd|liwc_moral|
|  4|conversation|    topic7|
+---+------------+----------+
only showing top 5 rows



In [34]:
# Load edges
edges_df = spark.read.csv("../data/network-topics-morality/filtered_edges_topics_morality_net.csv", 
                         header=True).withColumnRenamed("node1_norm", "src")

edges_df = edges_df.withColumnRenamed("node2_norm", "dst")

edges_df

DataFrame[_c0: string, src: string, dst: string, weight: string]

In [35]:
#Create GraphFrame
g = GraphFrame(nodes_df, edges_df)




In [36]:
g

GraphFrame(v:[id: string, node: string ... 1 more field], e:[src: string, dst: string ... 2 more fields])

In [37]:
g.vertices.show()

24/12/11 13:12:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , node, category
 Schema: _c0, node, category
Expected: _c0 but found: 
CSV file: file:///home/natashacarpcast/Documents/selfimprovement/data/network-topics-morality/finalized_nodes_topics_morality_net.csv
+---+------------+----------+
| id|        node|  category|
+---+------------+----------+
|  0|      online|    topic3|
|  1|  misogynist|liwc_moral|
|  2|  productive|    topic1|
|  3|      absurd|liwc_moral|
|  4|conversation|    topic7|
|  5|    pettiest|liwc_moral|
|  6|   character|    topic1|
|  7|  accusation|liwc_moral|
|  8|     dissing|liwc_moral|
|  9|    elitists|liwc_moral|
| 10|    outlawed|liwc_moral|
| 11|   grandiose|liwc_moral|
| 12|        evil|liwc_moral|
| 13| disapproved|liwc_moral|
| 14|    buffoons|liwc_moral|
| 15|  admonishes|liwc_moral|
| 16|   instagram|    topic2|
| 17| absurdities|liwc_moral|
| 18|        lewd|liwc_moral|
| 19|   worthless|liwc_moral|
+---+------

In [38]:
g.edges.show()

24/12/11 13:12:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , node1_norm, node2_norm, weight
 Schema: _c0, node1_norm, node2_norm, weight
Expected: _c0 but found: 
CSV file: file:///home/natashacarpcast/Documents/selfimprovement/data/network-topics-morality/filtered_edges_topics_morality_net.csv
+---+------------+--------+------+
|_c0|         src|     dst|weight|
+---+------------+--------+------+
|  0|  depression|   woman|  2390|
|  1|    generous|   loyal|     8|
|  2|        body|     job|  7010|
|  3|     college|  family|  7990|
|  4|       fault| student|   266|
|  5|       drink| hobbies|  1441|
|  6|     healthy|   voice|   838|
|  7|        read|  worthy|  1136|
|  8|       money|  social|  9445|
|  9|  productive|specific|   660|
| 10|        male|    talk|  3761|
| 11|   happiness|   wrong|  2792|
| 12|     deserve| respect|  1928|
| 13|conversation| emotion|  2952|
| 14|        girl| respect|  3490|
| 15|      family|    high|  8198|
| 16|

In [39]:
#Create undirected edges
undirected_edges = g.edges.union(g.edges.withColumnRenamed("src", "dst").withColumnRenamed("dst", "src"))

In [42]:
undirected_edges.show(5)

24/12/11 13:17:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , node1_norm, node2_norm, weight
 Schema: _c0, node1_norm, node2_norm, weight
Expected: _c0 but found: 
CSV file: file:///home/natashacarpcast/Documents/selfimprovement/data/network-topics-morality/filtered_edges_topics_morality_net.csv
+---+----------+-------+------+
|_c0|       src|    dst|weight|
+---+----------+-------+------+
|  0|depression|  woman|  2390|
|  1|  generous|  loyal|     8|
|  2|      body|    job|  7010|
|  3|   college| family|  7990|
|  4|     fault|student|   266|
+---+----------+-------+------+
only showing top 5 rows



Check it is undirected

In [43]:
edges_df.count()

58001

In [44]:
undirected_edges.count()

116002

Create new graph

In [46]:
g = GraphFrame(g.vertices, undirected_edges)


In [47]:
g

GraphFrame(v:[id: string, node: string ... 1 more field], e:[src: string, dst: string ... 2 more fields])

In [64]:
# Louvain https://github.com/DS4AI-UPB/CommunityDetection-DataframesSpark/blob/main/communitydetection_graphframes.py
def get_graph_vertices_list(g):
    vertices = []
    dataframe_vertices = g.vertices.select("id").collect()
    for row in dataframe_vertices:
        vertices.append(row.id)
    return vertices

def get_graph_edges_list(g):
    edges = []
    dataframe_edges = g.edges.select("src","dst","weight").collect()
    for row in dataframe_edges:
        edges.append((row.src, row.dst,row.weight))
    return edges

#Modified to suit my data
def get_graph_weights_sum(g):
    return g.edges.agg(F.sum('weight')).collect()[0][0]

def get_nodes_degrees(g):
    nodes = get_graph_vertices_list(g)
    in_degree = {}
    out_degree = {}
    idx = 0
    for node in nodes:
        out_degree[node] = g.edges.select("weight").where("src=={}".format(node)).groupBy().sum().collect()[0]["sum(weight)"]
        in_degree[node] = g.edges.select("weight").where("dst=={}".format(node)).groupBy().sum().collect()[0]["sum(weight)"]
        if in_degree[node] == None:
            in_degree[node] = 0
        if out_degree[node] == None:
            out_degree[node] = 0
        idx += 1  
    return in_degree, out_degree

def get_degree_to_adjacent_communities(g, node, community_of):
    degree = {}
    neighbors = list(map(lambda x: (x.dst,x.weight), g.edges.select("dst", "weight").where("src=={}".format(node)).collect()))
    for (neighbor, weight) in neighbors:
        adjacent_community = community_of[neighbor]
        if adjacent_community not in degree:
            degree[adjacent_community] = 0
        degree[adjacent_community] += weight
    return degree

def get_global_community(g, node):
    enclosed_nodes = set(list(g.vertices.select("community").where("id=={}".format(node)).collect()[0].community))
    return enclosed_nodes

def calc_modularity(g, partition, m):
    m = get_graph_weights_sum(g)
    modularity = 0
    in_degree, out_degree = get_nodes_degrees(g)
    edges = get_graph_edges_list(g)
    community_of = {}
    for idx, part in enumerate(partition):
        for node in part:
            community_of[node] = idx
    for edge in edges:
        src, dst = edge[0], edge[1]
        community_src, community_dst = community_of[src], community_of[dst]
        if community_src != community_dst:
            continue
        weight = edge[2]
        modularity += (weight/2*m) - (in_degree[src]*out_degree[dst])/(2*m**2)
    return modularity

def first_phase_louvain(g, global_partition, m, gamma):
    community_of = {node: idx for idx, node in enumerate(get_graph_vertices_list(g))}
    nodes = get_graph_vertices_list(g)
    new_partition = [set() for node in nodes]
    (in_degree, out_degree), (community_in, community_out) = get_nodes_degrees(g), get_nodes_degrees(g)
    while True:
        stop = True
        for node in nodes:
            chosen_comunity = community_of[node]
            max_improvement = 0
            community_in[chosen_comunity] -= in_degree[node]
            community_out[chosen_comunity] -= out_degree[node]
            degree_to_adj_communities = get_degree_to_adjacent_communities(g, node, community_of)
            for (adjacent_community, adjacent_degree) in degree_to_adj_communities.items():
                improvement = (adjacent_degree - gamma * (in_degree[node] * community_out[adjacent_community] + out_degree[node] * community_in[adjacent_community])/m)
                if improvement > max_improvement:
                    max_improvement = improvement
                    chosen_comunity = adjacent_community
            community_in[chosen_comunity] += in_degree[node]
            community_out[chosen_comunity] += out_degree[node]
            if chosen_comunity != community_of[node]: 
                community_of[node] = chosen_comunity
                stop = False
        if stop:
            break
    for node, community in community_of.items():
        new_partition[community].add(node)
        global_community = get_global_community(g, node)               
        global_partition[node].difference_update(global_community)
        global_partition[community].update(global_community)
    new_partition = list(filter(lambda x: x != set(), new_partition))
    global_partition = list(filter(lambda x: x != set(), global_partition))

    return global_partition, new_partition, stop

def second_phase_louvain(g, new_partition):
    community_of = {}
    vertices_columns = ["name", "id", "community"]
    new_vertices = []
    for idx, partition in enumerate(new_partition):
        enclosed_nodes = []
        for node in partition:
            community_of[node] = idx
            sub_nodes = g.vertices.select("community").where("id=={}".format(node)).collect()[0].community
            enclosed_nodes += sub_nodes
        new_vertices.append((str(idx), idx, enclosed_nodes))
    
    edges = get_graph_edges_list(g)
    weights_between_communities = {}
    for edge in edges:
        src, dst, weight = edge[0], edge[1], edge[2]
        community_src, community_dst = community_of[src], community_of[dst]
        if not (community_src, community_dst) in weights_between_communities:
            weights_between_communities[(community_src, community_dst)] = 0
        weights_between_communities[(community_src, community_dst)] += weight
    
    new_edges = []
    edges_columns = ["src", "dst", "relationship", "weight"]
    for k, v in weights_between_communities.items():
        new_edges.append((k[0], k[1], "friend", v))
    
    v = ss.createDataFrame(new_vertices, vertices_columns)
    e = ss.createDataFrame(new_edges, edges_columns)
    new_g = GraphFrame(v, e)
    return new_g

def run_louvain(g, gamma=0.05): 
    m = get_graph_weights_sum(g)
    communities = [{node} for node in get_graph_vertices_list(g)]
    current_modularity = calc_modularity(g, communities, m)
    threshold=0.0000001
    iteration = 0
    while True:
        communities, next_partition, stop = first_phase_louvain(g, communities, m, gamma)
        if stop:
            break

        new_mod = calc_modularity(g, next_partition, m)
        if new_mod - current_modularity <= threshold:
            break

        current_modularity = new_mod
        g = second_phase_louvain(g, next_partition)
        iteration += 1
    print(iteration)
    for community in communities:
        print(sorted(community))
    return communities

In [65]:
run_louvain(g, gamma=0.05)

24/12/11 13:34:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///home/natashacarpcast/Documents/selfimprovement/data/network-topics-morality/finalized_nodes_topics_morality_net.csv
24/12/11 13:34:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///home/natashacarpcast/Documents/selfimprovement/data/network-topics-morality/finalized_nodes_topics_morality_net.csv


ValueError: sum(weight)

In [None]:
def get_graph_weights_sum(g):
    dataframe_weights_sum = g.edges.select('weight').groupBy().sum().collect()
    return dataframe_weights_sum[0]["sum(weight)"]

In [55]:
g.edges.select('weight').groupBy().sum().collect()

[Row()]

In [59]:
g.edges.select('weight').show(5)

+------+
|weight|
+------+
|  2390|
|     8|
|  7010|
|  7990|
|   266|
+------+
only showing top 5 rows



In [62]:
total_weight = g.edges.agg(F.sum('weight')).collect()[0][0]

In [63]:
total_weight

85892490.0