In [2]:
# %load nbheader.py
%reload_ext autoreload
%autoreload 2

from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql.functions import col as S
from pyspark.sql import DataFrame, Row, Window
import os
import sys
import json
import datetime
import re
import pandas as pd
import numpy as np

In [3]:
# Intialise Spark session for GraphFrame, use equivalent of pyspark --packages io.graphframes:graphframes-spark4_2.13:0.9.2
spark = SparkSession.builder \
    .master("local[4]") \
    .config("spark.jars.packages", "io.graphframes:graphframes-spark4_2.13:0.9.2") \
    .getOrCreate()
# spark = SparkSession.builder.master("local[4]").getOrCreate()
spark.getActiveSession()
# spark.stop()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/01 07:03:33 WARN Utils: Your hostname, RCBM8368-DIII.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.54 instead (on interface en0)
25/10/01 07:03:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/pmolnar/.base/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/pmolnar/.ivy2.5.2/cache
The jars for the packages stored in: /Users/pmolnar/.ivy2.5.2/jars
io.graphframes#graphframes-spark4_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cafc7b87-1b1d-402a-a434-b0e787352e98;1.0
	confs: [default]
	found io.graphframes#graphframes-spark4_2.13;0.9.2 in central
downloading https://repo1.maven.org/maven2/io/graphframes/graphframes-spark4_2.13/0.9.2/graphframes-spark4_2.13-0.9.2.jar ...
	[SUCCESSFUL ] i

In [10]:
! mkdir -p ./tmp/graphframes-checkpoints

In [11]:
from graphframes import GraphFrame

spark.sparkContext.setCheckpointDir("./tmp/graphframes-checkpoints")

In [7]:
# Install graphframes-py and GraphFrames jar if running in Colab or new environment
# !pip install graphframes-py

# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, lit


# 1. Prepare vertices and edges DataFrames
vertices = spark.createDataFrame([
    ("a", "Alice"),
    ("b", "Bob"),
    ("c", "Charlie"),
    ("d", "David")
], ["id", "name"])

edges = spark.createDataFrame([
    ("a", "b", "knows"),
    ("b", "c", "knows"),
    ("c", "a", "knows"),
    ("a", "d", "knows")
], ["src", "dst", "relationship"])

# 2. Create GraphFrame
g = GraphFrame(vertices, edges)

# 3. Run PageRank example
results = g.pageRank(resetProbability=0.15, maxIter=5)
results.vertices.select("id", "name", "pagerank").orderBy(S("pagerank").desc()).show()

+---+-------+------------------+
| id|   name|          pagerank|
+---+-------+------------------+
|  a|  Alice| 1.313776242780802|
|  c|Charlie|1.0007283492186585|
|  d|  David|0.8427477040002699|
|  b|    Bob|0.8427477040002699|
+---+-------+------------------+



In [12]:
# 4. Find Connected Components
## fix this: components = g.connectedComponents()
components = g.connectedComponents()
components.select("id", "component").show()

                                                                                

+---+------------+
| id|   component|
+---+------------+
|  a|807453851648|
|  b|807453851648|
|  c|807453851648|
|  d|807453851648|
+---+------------+



25/10/01 07:24:19 WARN ConnectedComponents$: The DataFrame returned by ConnectedComponents is persisted and loaded.


In [13]:
# 5. Find triangle counts (clustering)
triangles = g.triangleCount()
triangles.select("id", "count").show()

# 6. Motif finding: look for mutual relationships
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()

+---+-----+
| id|count|
+---+-----+
|  a|    1|
|  b|    1|
|  c|    1|
|  d|    0|
+---+-----+

+---+---+---+---+
|  a|  e|  b| e2|
+---+---+---+---+
+---+---+---+---+



In [15]:

# Create vertices DataFrame: user id, name, and age
vertices = spark.createDataFrame([
    ("a", "Alice", 34),
    ("b", "Bob", 28),
    ("c", "Charlie", 31),
    ("d", "David", 22)
], ["id", "name", "age"])

# Create edges DataFrame: who liked whose post
edges = spark.createDataFrame([
    ("a", "b", "likes"),
    ("b", "a", "likes"),
    ("a", "c", "likes"),
    ("c", "a", "likes"),
    ("b", "c", "likes"),
    ("c", "b", "likes"),
    ("d", "a", "likes")  # David likes Alice only (non-mutual)
], ["src", "dst", "relationship"])

# Build the graph
g = GraphFrame(vertices, edges)

# Motif query: look for mutual 'likes' relationships
motifs = g.find("(u1)-[e1]->(u2); (u2)-[e2]->(u1)")

# Filter for cases where either user is older than 30
filtered = motifs.filter("u1.age > 30 or u2.age > 30")

filtered.show()



+----------------+-------------+----------------+-------------+
|              u1|           e1|              u2|           e2|
+----------------+-------------+----------------+-------------+
|    {b, Bob, 28}|{b, a, likes}|  {a, Alice, 34}|{a, b, likes}|
|{c, Charlie, 31}|{c, a, likes}|  {a, Alice, 34}|{a, c, likes}|
|  {a, Alice, 34}|{a, b, likes}|    {b, Bob, 28}|{b, a, likes}|
|{c, Charlie, 31}|{c, b, likes}|    {b, Bob, 28}|{b, c, likes}|
|  {a, Alice, 34}|{a, c, likes}|{c, Charlie, 31}|{c, a, likes}|
|    {b, Bob, 28}|{b, c, likes}|{c, Charlie, 31}|{c, b, likes}|
+----------------+-------------+----------------+-------------+



In [16]:
filtered.printSchema()

root
 |-- u1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: long (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- relationship: string (nullable = true)
 |-- u2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: long (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- relationship: string (nullable = true)



In [17]:
filtered.select("u1.name", "u1.age", "u2.name", "u2.age").show()


+-------+---+-------+---+
|   name|age|   name|age|
+-------+---+-------+---+
|    Bob| 28|  Alice| 34|
|Charlie| 31|  Alice| 34|
|  Alice| 34|    Bob| 28|
|Charlie| 31|    Bob| 28|
|  Alice| 34|Charlie| 31|
|    Bob| 28|Charlie| 31|
+-------+---+-------+---+



In [14]:
# 7. Run BFS example
paths = g.bfs("name = 'Alice'", "name = 'Charlie'")

In [18]:
g

GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

In [19]:
import networkx as nx
# from yfiles_jupyter_graphs import GraphWidget

ModuleNotFoundError: No module named 'yfiles_jupyter_graphs'

In [22]:
spark.stop()