In [1]:
# %load nbheader.py
%reload_ext autoreload
%autoreload 2

from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql.functions import col as S
from pyspark.sql import DataFrame, Row, Window
import os
import sys
import json
import datetime
import re
import pandas as pd
import numpy as np
from graphframes import GraphFrame
# spark = SparkSession.builder.master("local[4]").getOrCreate()
# spark.getActiveSession()
# spark.stop()

! mkdir -p ./tmp/graphframes-checkpoints

In [2]:
# Intialise Spark session for GraphFrame, use equivalent of pyspark --packages io.graphframes:graphframes-spark4_2.13:0.9.2
spark = SparkSession.builder \
    .master("local[4]") \
    .config("spark.jars.packages", "io.graphframes:graphframes-spark4_2.13:0.9.2") \
    .getOrCreate()
spark.sparkContext.setCheckpointDir("./tmp/graphframes-checkpoints")
spark.getActiveSession()
# spark.stop()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/08 17:10:13 WARN Utils: Your hostname, RCBM8368-DIII.local, resolves to a loopback address: 127.0.0.1; using 10.250.32.131 instead (on interface en0)
25/10/08 17:10:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/pmolnar/.base/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/pmolnar/.ivy2.5.2/cache
The jars for the packages stored in: /Users/pmolnar/.ivy2.5.2/jars
io.graphframes#graphframes-spark4_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a6dfc678-3459-4c6a-96da-cf7adb2b2a0c;1.0
	confs: [default]
	found io.graphframes#graphframes-spark4_2.13;0.9.2 in central
:: resolution report :: resolve 47ms :: artifacts dl 1ms
	:: modules in use:
	io.graphframes#graphframes-spark4_2.13;0.9.2 from central in [d

In [3]:
# load social_network_edges.csv and social_network_vertices.csv and create a GraphFrame 
social_network_edges = spark.read.csv("social_network_edges.csv", header=True, inferSchema=True)
social_network_vertices = spark.read.csv("social_network_vertices.csv", header=True, inferSchema=True)
g = GraphFrame(social_network_vertices, social_network_edges)
g.cache()
g.vertices.show(5)
g.edges.show(5)
g.inDegrees.show(5)


+------+-----+---------------+------------+--------+---------+
|    id|music|         sports|     vehicle|    food| vacation|
+------+-----+---------------+------------+--------+---------+
|user_0| Jazz|         tennis|pickup truck| mexican|mountains|
|user_1| Rock|       football|       sedan| mexican|    beach|
|user_2|Blues|         hockey|       sedan| mexican|     city|
|user_3|  R&B|       football|  muscle car|american|   desert|
|user_4|  R&B|track and field|  muscle car|    thai|     city|
+------+-----+---------------+------------+--------+---------+
only showing top 5 rows
+------+------+------------+
|   src|   dst|relationship|
+------+------+------------+
|user_0|user_1|      friend|
|user_0|user_2|      friend|
|user_0|user_3|      friend|
|user_0|user_4|      friend|
|user_0|user_5|      friend|
+------+------+------------+
only showing top 5 rows
+---------+--------+
|       id|inDegree|
+---------+--------+
| user_242|       4|
|user_1048|       4|
|user_1425|       4

In [4]:
# load categories from social_network_categories.json
categories = json.load(open("social_network_categories.json", "r"))
display(categories)


{'music': ['R&B', 'Hip Hop', 'Jazz', 'Blues', 'Rock'],
 'sports': ['track and field',
  'football',
  'baseball',
  'basketball',
  'hockey',
  'soccer',
  'tennis'],
 'vehicle': ['muscle car', 'pickup truck', 'SUV', 'sedan', 'off-road'],
 'food': ['american', 'mexican', 'thai', 'french'],
 'vacation': ['beach', 'mountains', 'city', 'desert']}

In [5]:
# for each attribute (key) in categories, group g.vertices by the attribute and count
for key in categories:
    g.vertices.groupBy(key).count().orderBy(key).show()

+-------+-----+
|  music|count|
+-------+-----+
|  Blues|  404|
|Hip Hop|  394|
|   Jazz|  390|
|    R&B|  390|
|   Rock|  422|
+-------+-----+

+---------------+-----+
|         sports|count|
+---------------+-----+
|       baseball|  236|
|     basketball|  327|
|       football|  310|
|         hockey|  266|
|         soccer|  309|
|         tennis|  321|
|track and field|  231|
+---------------+-----+

+------------+-----+
|     vehicle|count|
+------------+-----+
|         SUV|  368|
|  muscle car|  409|
|    off-road|  330|
|pickup truck|  442|
|       sedan|  451|
+------------+-----+

+--------+-----+
|    food|count|
+--------+-----+
|american|  494|
|  french|  483|
| mexican|  542|
|    thai|  481|
+--------+-----+

+---------+-----+
| vacation|count|
+---------+-----+
|    beach|  504|
|     city|  512|
|   desert|  472|
|mountains|  512|
+---------+-----+



In [6]:
# for each pair of two attributes (key) in categories, group g.vertices by the attributes and count, create a pivot table
for key1 in categories:
    for key2 in categories:
        if key1 != key2:
            df = g.vertices.groupBy(key1, key2).count().orderBy(key1, key2).toPandas()
            df = df.pivot(index=key1, columns=key2, values="count")
            display(df)

sports,baseball,basketball,football,hockey,soccer,tennis,track and field
music,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Blues,55,52,61,67,59,56,54
Hip Hop,42,68,46,57,62,72,47
Jazz,50,70,57,48,66,56,43
R&B,47,70,66,43,57,65,42
Rock,42,67,80,51,65,72,45


vehicle,SUV,muscle car,off-road,pickup truck,sedan
music,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Blues,74,92,81,56,101
Hip Hop,81,68,67,95,83
Jazz,76,75,68,90,81
R&B,72,82,45,92,99
Rock,65,92,69,109,87


food,american,french,mexican,thai
music,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blues,81,86,127,110
Hip Hop,96,102,98,98
Jazz,102,104,99,85
R&B,104,78,107,101
Rock,111,113,111,87


vacation,beach,city,desert,mountains
music,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blues,107,99,86,112
Hip Hop,92,106,101,95
Jazz,97,96,92,105
R&B,90,107,100,93
Rock,118,104,93,107


music,Blues,Hip Hop,Jazz,R&B,Rock
sports,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
baseball,55,42,50,47,42
basketball,52,68,70,70,67
football,61,46,57,66,80
hockey,67,57,48,43,51
soccer,59,62,66,57,65
tennis,56,72,56,65,72
track and field,54,47,43,42,45


vehicle,SUV,muscle car,off-road,pickup truck,sedan
sports,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
baseball,45,42,30,49,70
basketball,67,81,46,70,63
football,50,49,65,66,80
hockey,57,64,38,59,48
soccer,53,59,53,74,70
tennis,54,65,61,67,74
track and field,42,49,37,57,46


food,american,french,mexican,thai
sports,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseball,51,47,74,64
basketball,76,88,93,70
football,80,77,90,63
hockey,63,53,80,70
soccer,72,77,83,77
tennis,97,78,76,70
track and field,55,63,46,67


vacation,beach,city,desert,mountains
sports,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baseball,58,62,47,69
basketball,91,70,76,90
football,74,82,85,69
hockey,72,67,55,72
soccer,76,82,77,74
tennis,83,80,76,82
track and field,50,69,56,56


music,Blues,Hip Hop,Jazz,R&B,Rock
vehicle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SUV,74,81,76,72,65
muscle car,92,68,75,82,92
off-road,81,67,68,45,69
pickup truck,56,95,90,92,109
sedan,101,83,81,99,87


sports,baseball,basketball,football,hockey,soccer,tennis,track and field
vehicle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SUV,45,67,50,57,53,54,42
muscle car,42,81,49,64,59,65,49
off-road,30,46,65,38,53,61,37
pickup truck,49,70,66,59,74,67,57
sedan,70,63,80,48,70,74,46


food,american,french,mexican,thai
vehicle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SUV,81,82,116,89
muscle car,100,103,105,101
off-road,87,73,103,67
pickup truck,111,110,111,110
sedan,115,115,107,114


vacation,beach,city,desert,mountains
vehicle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SUV,88,102,79,99
muscle car,94,109,96,110
off-road,92,77,89,72
pickup truck,114,109,105,114
sedan,116,115,103,117


music,Blues,Hip Hop,Jazz,R&B,Rock
food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
american,81,96,102,104,111
french,86,102,104,78,113
mexican,127,98,99,107,111
thai,110,98,85,101,87


sports,baseball,basketball,football,hockey,soccer,tennis,track and field
food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
american,51,76,80,63,72,97,55
french,47,88,77,53,77,78,63
mexican,74,93,90,80,83,76,46
thai,64,70,63,70,77,70,67


vehicle,SUV,muscle car,off-road,pickup truck,sedan
food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
american,81,100,87,111,115
french,82,103,73,110,115
mexican,116,105,103,111,107
thai,89,101,67,110,114


vacation,beach,city,desert,mountains
food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
american,120,116,124,134
french,126,113,120,124
mexican,138,160,120,124
thai,120,123,108,130


music,Blues,Hip Hop,Jazz,R&B,Rock
vacation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
beach,107,92,97,90,118
city,99,106,96,107,104
desert,86,101,92,100,93
mountains,112,95,105,93,107


sports,baseball,basketball,football,hockey,soccer,tennis,track and field
vacation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
beach,58,91,74,72,76,83,50
city,62,70,82,67,82,80,69
desert,47,76,85,55,77,76,56
mountains,69,90,69,72,74,82,56


vehicle,SUV,muscle car,off-road,pickup truck,sedan
vacation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
beach,88,94,92,114,116
city,102,109,77,109,115
desert,79,96,89,105,103
mountains,99,110,72,114,117


food,american,french,mexican,thai
vacation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
beach,120,126,138,120
city,116,113,160,123
desert,124,120,120,108
mountains,134,124,124,130


In [7]:
communities = g.labelPropagation(maxIter=10)

# Show results
communities.groupBy("label").count().show()

25/10/08 17:10:42 WARN BlockManager: Block rdd_448_0 already exists on this machine; not re-adding it


+------------+-----+
|       label|count|
+------------+-----+
| 25769803782|  220|
|163208757254| 1780|
+------------+-----+



In [19]:
communities.printSchema()

root
 |-- id: string (nullable = true)
 |-- music: string (nullable = true)
 |-- sports: string (nullable = true)
 |-- vehicle: string (nullable = true)
 |-- food: string (nullable = true)
 |-- vacation: string (nullable = true)
 |-- label: long (nullable = true)



In [10]:
communities.show(5)

+------+-----+---------------+------------+--------+---------+------------+
|    id|music|         sports|     vehicle|    food| vacation|       label|
+------+-----+---------------+------------+--------+---------+------------+
|user_0| Jazz|         tennis|pickup truck| mexican|mountains|163208757254|
|user_1| Rock|       football|       sedan| mexican|    beach|163208757254|
|user_2|Blues|         hockey|       sedan| mexican|     city|163208757254|
|user_3|  R&B|       football|  muscle car|american|   desert|163208757254|
|user_4|  R&B|track and field|  muscle car|    thai|     city|163208757254|
+------+-----+---------------+------------+--------+---------+------------+
only showing top 5 rows


In [9]:
# run label propagation algorithms (LPA) for each key and see how the pairwise distribution with the other attributes changes
for attribute in categories:
    # Build similarity graph just for that attribute (e.g., user-user for both liking classical music)
    g_attr = GraphFrame(g.vertices, edges_for_attribute(attribute))
    lpa_result = g_attr.labelPropagation(maxIter=5)
    lpa_result.withColumnRenamed('label', f'{attribute}_community').persist()


NameError: name 'edges_for_attribute' is not defined