In [None]:
# df.repartition(2).select(F.spark_partition_id().alias("pid")).collect()

# Preparation

In [None]:
import sys

from collections import namedtuple
from dataclasses import dataclass
from datetime import timedelta
from enum import StrEnum, verify, UNIQUE
from random import Random, randrange
from time import perf_counter
from typing import NamedTuple, Callable, TypeVar

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.dataframe import DataFrame, Column

## Spark Configuration

In [None]:
# Single Spark session used by every cell below; `local[4]` is the master URL passed to the builder.
spark = (
    SparkSession
        .builder
        .appName("graphs")
        .master("local[4]")
        .config("spark.sql.warehouse.dir", "/tmp/warehouse")
        .getOrCreate()
)
sc = spark.sparkContext
# The iterative algorithms below call `DataFrame.checkpoint()`, so a checkpoint directory is configured up front.
sc.setCheckpointDir("/tmp/plan/checkpoint")

## Miscellaneous Graph Functions and Classes

In [None]:
class Graph(NamedTuple):
    """The `Graph` class is the central class to express graphs.

    It is a two-field named tuple, so it can be unpacked positionally as
    `edges_df, vertices_df = graph`.
    """
    # Edge DataFrame; the generators in this notebook build it with columns `src`, `dst`, `weight`.
    edges: DataFrame
    # Vertex DataFrame; the generators in this notebook build it with a single `id` column.
    vertices: DataFrame

In [None]:
@dataclass
class Scope:
    """The `Scope` class abstarcts away inward, outward and all neighbours of a vertex"""
    @verify(UNIQUE)
    class Direction(StrEnum):
        IN = "IN"
        OUT = "OUT"
        INOUT = "INOUT"

    inward: DataFrame
    outward: DataFrame
    all_nbr: DataFrame

    def __init__(self, edges: DataFrame):
        def __wrapper(id: str, nbr: str, dir: 'Scope.Direction'):
            return (
                edges_df
                    .withColumnRenamed(id, "id")
                    .withColumnRenamed(nbr, "nbr")
                    .withColumn("dir", F.lit(dir))
                    .select("id", "nbr", "weight", "dir")
            )
        self.inward = __wrapper("dst", "src", Scope.Direction.IN)
        self.outward = __wrapper("src", "dst", Scope.Direction.OUT)
        self.all_nbr = self.outward.union(self.inward)

    def __call__(self, dir: 'Scope.Direction') -> DataFrame:
        match dir:
            case Scope.Direction.IN:
                return self.inward
            case Scope.Direction.OUT:
                return self.outward
            case Scope.Direction.INOUT:
                return self.all_nbr
            case _:
                raise ValueError(f"Unknown {dir}")

    def __iter__(self):
        yield self.inward
        yield self.outward
        yield self.all_nbr
        

In [None]:
def graph(disjoint_sets_number: int = 1000, disjoint_set_vertices: int = 500, disjoint_set_edges: int = 1000, seed: int | None = None) -> Graph:
    """Generates a graph made of disjoint sets of vertices

        Every disjoint set consists of a chain `v -> v + 1` over all of its vertices
        plus `disjoint_set_edges` random edges between vertices of the same set.
        All edges get a random integer weight in `[0, 10)`.

        Parameters:
            disjoint_sets_number (int) : total number of disjoint sets in the graph
            disjoint_set_vertices (int) : total number of vertices in a disjoint set in the graph
            disjoint_set_edges (int) : number of random edges added to a disjoint set on top of its chain
            seed (int | None) : seed of a private random generator for reproducible graphs;
                `None` (default) draws from the module-level `random` state

        Returns:
            Graph (Graph): a graph according to the specifications
    """
    # A private generator makes seeded runs reproducible without touching the global random state.
    pick = randrange if seed is None else Random(seed).randrange

    # Consecutive sets start `disjoint_set_step` ids apart, so their id ranges never overlap.
    disjoint_set_step = max(disjoint_set_vertices, 1000)
    edges = []
    for set_index in range(disjoint_sets_number):
        first = set_index * disjoint_set_step
        last = first + disjoint_set_vertices - 1
        # The chain guarantees that the set is connected.
        edges += [(vertex, vertex + 1, pick(10)) for vertex in range(first, last)]
        # `randrange` excludes its stop value, hence `last + 1` to make the last vertex eligible.
        edges += [
            (pick(first, last + 1), pick(first, last + 1), pick(10))
            for _ in range(disjoint_set_edges)
        ]

    edges_df = (
        spark
            .createDataFrame(edges)
            .toDF("src", "dst", "weight")
    )

    # Vertices are the endpoints of the generated edges; sorted for a stable row order.
    vertices = sorted({vertex for src, dst, _ in edges for vertex in (src, dst)})

    vertices_df = (
        spark
            .createDataFrame([(vertex,) for vertex in vertices])
            .toDF("id")
    )
    return Graph(edges_df, vertices_df)

In [None]:
def small_graph() -> Graph:
    """Builds the fixed 6-vertex test graph.

    Vertices are `1..6`; the edges are `(src, dst, weight)` triples and include
    the self-loop `6 -> 6`.
    """
    vertex_rows = [(vertex_id,) for vertex_id in range(1, 7)]
    vertices_df = spark.createDataFrame(vertex_rows).toDF("id")

    edge_rows = [(1, 2, 1), (1, 3, 5), (2, 3, 1), (3, 4, 1), (4, 5, 1), (6, 6, 1)]
    edges_df = spark.createDataFrame(edge_rows).toDF("src", "dst", "weight")

    return Graph(edges_df, vertices_df)

In [None]:
def dag(graph: Graph) -> Graph:
    """Keeps only the edges with `src < dst`; the vertices are passed through unchanged.

    Dropping every edge that does not point from a smaller to a larger id
    removes self-loops and back edges.
    """
    all_edges, all_vertices = graph
    forward_edges = all_edges.where("src < dst")
    return Graph(forward_edges, all_vertices)

In [None]:
def get_min_dist_init_df(values: DataFrame, start: int = 1) -> DataFrame:
    """Adds the initial `dist` column for the shortest-path algorithms.

    The vertex whose `id` equals `start` gets distance 0; every other vertex
    gets the sentinel `sys.maxsize // 2`.
    """
    is_source = col("id") == F.lit(start)
    unreached = F.lit(sys.maxsize // 2)
    initial_dist = F.when(is_source, F.lit(0)).otherwise(unreached)
    return values.withColumn("dist", initial_dist)

In [None]:
from datetime import datetime

V = TypeVar("V")
def timer(handler: Callable[[], V]) -> V:
    """Runs `handler`, prints how long the call took and returns its result.

    The duration is measured with the monotonic `perf_counter` clock, so it is
    not affected by wall-clock adjustments, and is printed as a `timedelta`
    (`H:MM:SS.ffffff`).
    """
    started = perf_counter()
    result = handler()
    print(timedelta(seconds=perf_counter() - started))
    return result

# Connected Components

## Pregel

### Implementation

In [None]:
def pregel_cc(edges: DataFrame, values: DataFrame, steps: int):
    """Label propagation for connected components, Pregel style.

    `values` needs the columns `id` and `value`. In every step each vertex
    sends its `value` along its outgoing edges (`src -> dst` only); a vertex
    keeps the least of its own value and the minimum received message.
    The result is checkpointed after steps 0, 5, 10, ...
    """
    labels_df = values
    for step in range(steps):
        # Messages: the sender's label, addressed to the edge's destination.
        messages_df = (
            edges.join(labels_df, col("src") == col("id"))
                .select(col("dst").alias("id"), col("value").alias("message"))
        )
        # Combine: one (minimal) message per receiving vertex.
        inbox_df = messages_df.groupby(col("id")).agg(F.min("message").alias("message"))
        # Compute: the right join keeps vertices that received nothing.
        labels_df = (
            inbox_df.join(labels_df, "id", "right")
                .select("id", F.least("message", "value").alias("value"))
        )
        if step % 5 == 0:
            labels_df = labels_df.checkpoint()

    return labels_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = small_graph()

In [None]:
edges_df.printSchema()

In [None]:
vertices_df.printSchema()

In [None]:
pregel_cc_df = vertices_df.withColumn("value", col("id"))
pregel_cc_df = pregel_cc(edges_df, pregel_cc_df, 5)

In [None]:
pregel_cc_df.show()

#### Large Graph

In [None]:
edges_df, vertices_df = graph()

In [None]:
edges_df.printSchema()

In [None]:
vertices_df.printSchema()

In [None]:
pregel_cc_df = vertices_df.withColumn("value", col("id"))

In [None]:
pregel_cc_df = timer(lambda: pregel_cc(edges_df, pregel_cc_df, 15))

The `pregel_cc_df` DataFrame is large, so only the total number of connected components is displayed.

In [None]:
pregel_cc_df.groupBy("value").count().count()

## GraphLab

### Implementation

In [None]:
def graphlab_cc(scope: DataFrame, values: DataFrame, steps: int):
    """Label propagation for connected components, GraphLab style.

    `scope` is a neighbourhood view with the columns `id` and `nbr`; `values`
    needs the columns `id` and `cc`. In every step a vertex takes the least of
    its own `cc` and the minimum `cc` among its neighbours in `scope`.
    The result is checkpointed after steps 0, 5, 10, ...
    """
    labels_df = values
    for step in range(steps):
        # Minimum label over each vertex's neighbourhood.
        neighbourhood = scope.alias("s").join(
            labels_df.alias("r"), col("s.nbr") == col("r.id")
        )
        nbr_min_df = neighbourhood.groupBy(col("s.id")).agg(F.min("cc").alias("acc_cc"))

        # The right join keeps vertices without any neighbour in `scope`.
        merged = nbr_min_df.alias("a").join(labels_df.alias("r"), "id", "right")
        labels_df = merged.select("id", F.least(col("r.cc"), col("a.acc_cc")).alias("cc"))

        if step % 5 == 0:
            labels_df = labels_df.checkpoint()
    return labels_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = small_graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope_df = Scope(edges_df)

In [None]:
gl_cc_result_df = vertices_df.withColumn("cc", col("id"))

In [None]:
gl_cc_result_df = graphlab_cc(scope_df.all_nbr, gl_cc_result_df, 5)

In [None]:
gl_cc_result_df.show()

In [None]:
gl_cc_result_df.groupBy("cc").count().count()

#### Large Graph

In [None]:
edges_df, vertices_df = graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope_df = Scope(edges_df)

In [None]:
gl_cc_result_df = vertices_df.withColumn("cc", col("id"))

In [None]:
gl_cc_result_df = timer(lambda: graphlab_cc(scope_df.all_nbr, gl_cc_result_df, 8))

In [None]:
gl_cc_result_df.groupBy("cc").count().count()

## PowerGraph

### Implementation

In [None]:
def powergraph_cc(scope: DataFrame, values: DataFrame, steps: int):
    """Label propagation for connected components as gather / sum / apply / scatter.

    `scope` is a neighbourhood view with the columns `id` and `nbr`; `values`
    needs the columns `id` and `cc`. Self-loops (`id == nbr`) are ignored.
    The result is checkpointed after steps 0, 5, 10, ...
    """
    labels_df = values

    for step in range(steps):
        # Gather: the label of every neighbour.
        loop_free_scope = scope.where("id != nbr").alias("s")
        gather_df = (
            loop_free_scope
                .join(labels_df.alias("r"), col("r.id") == col("s.nbr"))
                .select(col("s.id"), col("cc").alias("nbr_cc"))
        )
        # Sum: the minimal neighbour label per vertex.
        sum_df = gather_df.groupBy("id").agg(F.min("nbr_cc").alias("acc_cc"))
        # Apply: the right join keeps vertices that gathered nothing.
        apply_df = (
            sum_df.join(labels_df, "id", "right")
                .select(col("id"), F.least("acc_cc", "cc").alias("cc"))
        )
        # Scatter: nothing to do, the applied values are the new state.
        labels_df = apply_df
        if step % 5 == 0:
            labels_df = labels_df.checkpoint()

    return labels_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = small_graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
pg_cc_result_df = vertices_df.select(col("id"), col("id").alias("cc"))
pg_cc_result_df = powergraph_cc(scope.all_nbr, pg_cc_result_df, 5)

In [None]:
pg_cc_result_df.show()

#### Large Graph

In [None]:
edges_df, vertices_df = graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
pg_cc_result_df = vertices_df.select(col("id"), col("id").alias("cc"))
pg_cc_result_df = timer(lambda: powergraph_cc(scope.all_nbr, pg_cc_result_df, 8))

In [None]:
pg_cc_result_df.groupBy("cc").count().count()

# Single Source Shortest Path

## Pregel

### Implementation

In [None]:
def pregel_min_dist(edges: DataFrame, values: DataFrame, steps: int):
    """Single source shortest path, Pregel style.

    `edges` needs the columns `src`, `dst`, `weight`; `values` needs `id` and
    `dist`. In every step each vertex sends `dist + weight` along its outgoing
    edges and a vertex keeps the least of its own `dist` and the minimum
    received message. The result is checkpointed after steps 0, 5, 10, ...
    """
    dist_df = values
    for step in range(steps):
        # Messages: the tentative distance through the sender.
        messages_df = (
            edges.join(dist_df, col("src") == col("id"))
                .select(col("dst").alias("id"), F.expr("dist + weight").alias("message"))
        )
        # Combine: the best offer per receiving vertex.
        inbox_df = messages_df.groupby(col("id")).agg(F.min("message").alias("message"))
        # Compute: the right join keeps vertices that received nothing.
        dist_df = (
            inbox_df.join(dist_df, "id", "right")
                .select("id", F.least("message", "dist").alias("dist"))
        )
        if step % 5 == 0:
            dist_df = dist_df.checkpoint()

    return dist_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = small_graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
min_dist_df = get_min_dist_init_df(vertices_df, start=1)
min_dist_df = pregel_min_dist(edges_df, min_dist_df, 5)

In [None]:
min_dist_df.show()

#### Large Graph

In [None]:
disjoint_set_vertices = 500
# Passed by keyword: the first positional parameter of `graph` is `disjoint_sets_number`.
edges_df, vertices_df = graph(disjoint_set_vertices=disjoint_set_vertices)

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
min_dist_df = get_min_dist_init_df(vertices_df, start=1)
min_dist_df = timer(lambda: pregel_min_dist(edges_df, min_dist_df, 12))

All vertices from the same disjoint set as the vertex `1` should have a distance less than `INF` (`sys.maxsize // 2`, the sentinel assigned by `get_min_dist_init_df`)

In [None]:
# Unreached vertices carry the sentinel `sys.maxsize // 2` (see `get_min_dist_init_df`),
# so that is the bound to compare against; `sys.maxsize` would let every row through.
df = min_dist_df.where(col("id") < disjoint_set_vertices).where(col("dist") < sys.maxsize // 2)
timer(lambda: df.count())

All vertices from the disjoint sets other than the one containing the vertex `1` should have a minimum distance of exactly `INF` (`sys.maxsize // 2`, the sentinel assigned by `get_min_dist_init_df`)

In [None]:
df = min_dist_df.where(col("id") > disjoint_set_vertices).groupby("dist").count()
timer(lambda: df.show())

## GraphLab

### Implementation

In [None]:
def graphlab_min_dist(scope: DataFrame, values: DataFrame, steps: int) -> DataFrame:
    """Single source shortest path, GraphLab style.

    `scope` is a neighbourhood view with the columns `id`, `nbr`, `weight`;
    `values` needs `id` and `dist`. In every step a vertex takes the least of
    its own `dist` and the minimum of `dist + weight` over its neighbours in
    `scope`. The result is checkpointed after steps 0, 5, 10, ...
    """
    dist_df = values

    for step in range(steps):
        # Best tentative distance offered by the neighbourhood.
        neighbourhood = scope.alias("s").join(
            dist_df.alias("r"), col("s.nbr") == col("r.id")
        )
        nbr_best_df = (
            neighbourhood
                .groupBy(col("s.id"))
                .agg(F.min(F.expr("dist + weight")).alias("acc_min_dist"))
        )

        # The right join keeps vertices without any neighbour in `scope`.
        dist_df = (
            nbr_best_df.join(dist_df, "id", "right")
                .select("id", F.least(col("dist"), col("acc_min_dist")).alias("dist"))
        )
        if step % 5 == 0:
            dist_df = dist_df.checkpoint()

    return dist_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = small_graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
min_dist_df = get_min_dist_init_df(vertices_df, start=1)
min_dist_df = graphlab_min_dist(scope.inward, min_dist_df, steps=5)

In [None]:
min_dist_df.show()

#### Large Graph

In [None]:
disjoint_set_vertices = 500
# Passed by keyword: the first positional parameter of `graph` is `disjoint_sets_number`.
edges_df, vertices_df = graph(disjoint_set_vertices=disjoint_set_vertices)

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
min_dist_df = get_min_dist_init_df(vertices_df, start=1)
min_dist_df = timer(lambda: graphlab_min_dist(scope.inward, min_dist_df, steps=12))

All vertices from the same disjoint set as the vertex `1` should have a distance less than `INF` (`sys.maxsize // 2`, the sentinel assigned by `get_min_dist_init_df`)

In [None]:
# Unreached vertices carry the sentinel `sys.maxsize // 2` (see `get_min_dist_init_df`),
# so that is the bound to compare against; `sys.maxsize` would let every row through.
df = min_dist_df.where(col("id") < disjoint_set_vertices).where(col("dist") < sys.maxsize // 2)
timer(lambda: df.count())

All vertices from the disjoint sets other than the one containing the vertex `1` should have a minimum distance of exactly `INF` (`sys.maxsize // 2`, the sentinel assigned by `get_min_dist_init_df`)

In [None]:
df = min_dist_df.where(col("id") > disjoint_set_vertices).groupby("dist").count()
timer(lambda: df.show())

## PowerGraph

### Implementation

In [None]:
def powergraph_min_dist(scope: DataFrame, values: DataFrame, steps: int):
    """Single source shortest path as gather / sum / apply / scatter.

    `scope` is a neighbourhood view with the columns `id`, `nbr`, `weight`;
    `values` needs `id` and `dist`. Self-loops (`id == nbr`) are ignored.
    The result is checkpointed after steps 0, 5, 10, ...
    """
    dist_df = values

    for step in range(steps):
        # Gather: the tentative distance through every neighbour.
        loop_free_scope = scope.where("id != nbr").alias("s")
        gather_df = (
            loop_free_scope
                .join(dist_df.alias("r"), col("r.id") == col("s.nbr"))
                .select(col("s.id"), F.expr("r.dist + s.weight").alias("dist"))
        )
        # Sum: the best tentative distance per vertex.
        sum_df = gather_df.groupBy("id").agg(F.min("dist").alias("acc_dist"))
        # Apply: the right join keeps vertices that gathered nothing.
        apply_df = (
            sum_df.join(dist_df, "id", "right")
                .select(col("id"), F.least("acc_dist", "dist").alias("dist"))
        )
        # Scatter: nothing to do, the applied values are the new state.
        dist_df = apply_df
        if step % 5 == 0:
            dist_df = dist_df.checkpoint()

    return dist_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = small_graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
pg_min_dist_result_df = get_min_dist_init_df(vertices_df, start=1)
pg_min_dist_result_df = powergraph_min_dist(scope.inward, pg_min_dist_result_df, steps=5)

In [None]:
pg_min_dist_result_df.show()

#### Large Graph

In [None]:
disjoint_set_vertices = 500
# Passed by keyword: the first positional parameter of `graph` is `disjoint_sets_number`.
edges_df, vertices_df = graph(disjoint_set_vertices=disjoint_set_vertices)

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
pg_min_dist_result_df = get_min_dist_init_df(vertices_df, start=1)
pg_min_dist_result_df = timer(lambda: powergraph_min_dist(scope.inward, pg_min_dist_result_df, steps=12))

All vertices from the same disjoint set as the vertex `1` should have a distance less than `INF` (`sys.maxsize // 2`, the sentinel assigned by `get_min_dist_init_df`)

In [None]:
# Unreached vertices carry the sentinel `sys.maxsize // 2` (see `get_min_dist_init_df`),
# so that is the bound to compare against; `sys.maxsize` would let every row through.
df = pg_min_dist_result_df.where(col("id") < disjoint_set_vertices).where(col("dist") < sys.maxsize // 2)
timer(lambda: df.count())

All vertices from the disjoint sets other than the one containing the vertex `1` should have a minimum distance of exactly `INF` (`sys.maxsize // 2`, the sentinel assigned by `get_min_dist_init_df`)

In [None]:
df = pg_min_dist_result_df.where(col("id") > disjoint_set_vertices).groupby("dist").count()
timer(lambda: df.show())

# Topological Sort

## Pregel

### Implementation

In [None]:
def pregel_topological_sort(edges: DataFrame, values: DataFrame, steps: int):
    """Topological ordering, Pregel style.

    `edges` needs the columns `src` and `dst`; `values` needs `id` and `ord`.
    Self-loops are ignored. In every step each vertex sends its `ord` along its
    outgoing edges and a vertex keeps the greatest of its own `ord` and the
    maximum received message plus one.
    The result is checkpointed after steps 0, 5, 10, ...
    """
    order_df = values
    for step in range(steps):
        # Messages: the sender's position, addressed to the edge's destination.
        messages_df = (
            edges.where("src != dst")
                .join(order_df, col("src") == col("id"))
                .select(col("dst").alias("id"), col("ord").alias("message"))
        )
        # Combine: the latest predecessor per receiving vertex.
        inbox_df = messages_df.groupby(col("id")).agg(F.max("message").alias("message"))
        # Compute: the right join keeps vertices that received nothing.
        order_df = (
            inbox_df.join(order_df, "id", "right")
                .select("id", F.greatest(F.expr("message + 1"), "ord").alias("ord"))
        )
        if step % 5 == 0:
            order_df = order_df.checkpoint()

    return order_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = small_graph()

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))
sorted_df = pregel_topological_sort(edges_df, sorted_df, 5)

In [None]:
sorted_df.orderBy("ord").show()

#### Large Graph

In [None]:
disjoint_set_vertices=50
edges_df, vertices_df = graph(disjoint_set_vertices=disjoint_set_vertices)

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
edges_dag_df = edges_df.where("src < dst").cache()

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))
sorted_df = timer(lambda: pregel_topological_sort(edges_dag_df, sorted_df, disjoint_set_vertices))

The total number of disjoint sets is equal to 1000 (default) and the total number of vertices in each disjoint set is equal to `disjoint_set_vertices` (50), so there should be:
- exactly `1000` vertices for each `ord` number,
- exactly `disjoint_set_vertices` (`50`) unique `ord` numbers.

In [None]:
df = sorted_df.groupBy("ord").count().orderBy(col("count").desc(), col("ord"))
timer(lambda: df.show(disjoint_set_vertices))

In [None]:
edges_dag_df.unpersist()

## GraphLab

### Implementation

In [None]:
def graphlab_topological_sort(scope: DataFrame, values: DataFrame, steps: int):
    """Topological ordering, GraphLab style.

    `scope` is a neighbourhood view with the columns `id` and `nbr`; `values`
    needs `id` and `ord`. Self-loops (`id == nbr`) are ignored. In every step a
    vertex takes the greatest of its own `ord` and the maximum `ord` among its
    neighbours in `scope` plus one.
    The result is checkpointed after steps 0, 5, 10, ...
    """
    order_df = values
    for step in range(steps):
        # Latest position found in each vertex's neighbourhood.
        neighbourhood = (
            scope.where("id != nbr").alias("s")
                .join(order_df.alias("r"), col("s.nbr") == col("r.id"))
        )
        nbr_max_df = neighbourhood.groupBy(col("s.id")).agg(F.max("ord").alias("acc_ord"))

        # The right join keeps vertices without any neighbour in `scope`.
        order_df = (
            nbr_max_df.join(order_df, "id", "right")
                .select("id", F.greatest(F.expr("acc_ord + 1"), col("ord")).alias("ord"))
        )

        if step % 5 == 0:
            order_df = order_df.checkpoint()
    return order_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = dag(small_graph())

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))
sorted_df = graphlab_topological_sort(scope.inward, sorted_df, 5)

In [None]:
sorted_df.orderBy("ord").show()

#### Large Graph

In [None]:
disjoint_set_vertices=50
edges_df, vertices_df = dag(graph(disjoint_set_vertices=disjoint_set_vertices))

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))
sorted_df = timer(lambda: graphlab_topological_sort(scope.inward, sorted_df, steps=disjoint_set_vertices))

The total number of disjoint sets is equal to 1000 (default) and the total number of vertices in each disjoint set is equal to `disjoint_set_vertices` (50), so there should be:
- exactly `1000` vertices for each `ord` number,
- exactly `disjoint_set_vertices` (`50`) unique `ord` numbers.

In [None]:
df = sorted_df.groupBy("ord").count().orderBy(col("count").desc(), col("ord"))
timer(lambda: df.show(disjoint_set_vertices))

## PowerGraph

### Implementation

In [None]:
def powergraph_topological_sort(scope: DataFrame, values: DataFrame, steps: int):
    """Topological ordering as gather / sum / apply / scatter.

    `scope` is a neighbourhood view with the columns `id` and `nbr`; `values`
    needs `id` and `ord`. Self-loops (`id == nbr`) are ignored.
    The result is checkpointed after steps 0, 5, 10, ...
    """
    order_df = values

    for step in range(steps):
        # Gather: the position of every neighbour.
        loop_free_scope = scope.where("id != nbr").alias("s")
        gather_df = (
            loop_free_scope
                .join(order_df.alias("r"), col("r.id") == col("s.nbr"))
                .select(col("s.id"), col("ord"))
        )
        # Sum: the latest neighbour position per vertex.
        sum_df = gather_df.groupBy("id").agg(F.max("ord").alias("acc_ord"))
        # Apply: the right join keeps vertices that gathered nothing.
        apply_df = (
            sum_df.join(order_df, "id", "right")
                .select(col("id"), F.greatest(F.expr("acc_ord + 1"), "ord").alias("ord"))
        )
        # Scatter: nothing to do, the applied values are the new state.
        order_df = apply_df
        if step % 5 == 0:
            order_df = order_df.checkpoint()

    return order_df

### Testing

#### Small Graph

In [None]:
edges_df, vertices_df = dag(small_graph())

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))
sorted_df = powergraph_topological_sort(scope.inward, sorted_df, 5)

In [None]:
sorted_df.orderBy("ord").show()

#### Large Graph

In [None]:
disjoint_set_vertices=50
edges_df, vertices_df = dag(graph(disjoint_set_vertices=disjoint_set_vertices))

In [None]:
disjoint_set_vertices=50
edges_df, vertices_df = graph(disjoint_set_vertices=disjoint_set_vertices, disjoint_sets_number=10000, disjoint_set_edges=0)

In [None]:
vertices_df.printSchema()

In [None]:
edges_df.printSchema()

In [None]:
scope = Scope(edges_df)

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))
sorted_df = timer(lambda: powergraph_topological_sort(scope.inward, sorted_df, steps=disjoint_set_vertices))

The graph above is generated with `disjoint_sets_number=10000` and the total number of vertices in each disjoint set is equal to `disjoint_set_vertices` (50), so there should be:
- exactly `10000` vertices for each `ord` number,
- exactly `disjoint_set_vertices` (`50`) unique `ord` numbers.

In [None]:
df = sorted_df.groupBy("ord").count().orderBy(col("count").desc(), col("ord"))
timer(lambda: df.show(disjoint_set_vertices))

# General Algorithms

## Pregel

In [None]:
def pregel_superstep(edges: DataFrame, values: DataFrame, message: Column, combiner: Callable[[Column], Column], computer: Column, **columns: Column) -> DataFrame:
    """Runs one generic Pregel superstep and returns the new vertex values.

    Parameters:
        edges (DataFrame) : edge list with the columns `src` and `dst`; self-loops are ignored
        values (DataFrame) : current vertex state, keyed by `id`
        message (Column) : expression evaluated on the edge joined with its source vertex
        combiner (Callable) : aggregate applied to the `message` column per receiving vertex
        computer (Column) : expression producing the new state; it is emitted as `value`
        columns (Column) : extra columns to carry over into the result

    Returns:
        DataFrame with `id`, `value` and the carried columns; the right join
        keeps vertices that received no message.
    """
    loop_free_edges = edges.where(col("src") != col("dst"))
    message_box_df = (
        loop_free_edges
            .join(values, col("src") == col("id"))
            .select(col("dst").alias("id"), message.alias("message"))
    )
    accumulator_df = message_box_df.groupby(col("id")).agg(
        combiner(col("message")).alias("message")
    )
    carried = list(columns.values())
    return (
        accumulator_df
            .join(values, "id", "right")
            .select("id", computer.alias("value"), *carried)
    )

In [None]:
edges_df, vertices_df = small_graph()

### Connected Components

In [None]:
cc_df = vertices_df.withColumn("value", col("id"))

for _ in range(5):
    cc_df = pregel_superstep(
        edges=edges_df,
        values=cc_df,
        message=col("value"),
        combiner=F.min,
        computer=F.least(col("value"), col("message"))
    )

cc_df.show()

### Single Source Shortest Path

In [None]:
pregel_min_dist_df = get_min_dist_init_df(vertices_df, start=1).withColumnRenamed("dist", "value")

for _ in range(5):
    pregel_min_dist_df = pregel_superstep(
        edges=edges_df,
        values=pregel_min_dist_df,
        message=F.expr("value + weight"),
        combiner=F.min,
        computer=F.least("message", "value")
    )

pregel_min_dist_df.show()

### Topological Sort

In [None]:
pregel_sort_df = vertices_df.withColumn("value", F.lit(1))

for _ in range(5):
    pregel_sort_df = pregel_superstep(
        edges=edges_df,
        values=pregel_sort_df,
        message=col("value"),
        combiner=F.max,
        computer=F.greatest(F.expr("message + 1"), "value")
    )

pregel_sort_df.orderBy("value").show()

### PageRank

In [None]:
in_deg = edges_df.groupBy("dst").agg(F.count(F.lit(1)).alias("in_deg")).withColumnRenamed("dst", "id")
out_deg = edges_df.groupBy("src").agg(F.count(F.lit(1)).alias("out_deg")).withColumnRenamed("src", "id")
deg_vert_df = vertices_df.join(in_deg, "id", "left").join(out_deg, "id", "left").select("id", F.coalesce("in_deg", F.lit(1)).alias("in_deg"), F.coalesce("out_deg", F.lit(1)).alias("out_deg"))

In [None]:
page_rank_result_df = deg_vert_df.withColumn("value", F.lit(1))

In [None]:
for i in range(10):
    page_rank_result_df = pregel_superstep(
        edges=edges_df,
        values=page_rank_result_df,
        message=col("value"),
        combiner=F.sum,
        computer=F.expr("(0.15 + 0.85 * nvl(message, value)) / out_deg"),
        out_deg=col("out_deg")
    )
    if i % 5 == 0:
        page_rank_result_df = page_rank_result_df.checkpoint()

page_rank_result_df.show()

## GraphLab

In [None]:
def graphlab_step(scope: DataFrame, values: DataFrame, nbr_combiner: Column, result: Column, **columns: Column):
    """Runs one generic GraphLab update and returns the new vertex values.

    Parameters:
        scope (DataFrame) : neighbourhood view with the columns `id` and `nbr`; self-loops are ignored
        values (DataFrame) : current vertex state, keyed by `id`
        nbr_combiner (Column) : aggregate over each vertex's neighbours
        result (Column) : expression producing the new state from the vertex and the aggregate
        columns (Column) : extra columns to carry over into the result

    Returns:
        DataFrame with `id`, the `result` column and the carried columns; the
        right join keeps vertices without any neighbour in `scope`.
    """
    neighbourhood = (
        scope.where("id != nbr").alias("s")
            .join(values.alias("r"), col("s.nbr") == col("r.id"))
    )
    values_nbr_df = neighbourhood.groupBy(col("s.id")).agg(nbr_combiner)
    carried = list(columns.values())
    return values_nbr_df.join(values, "id", "right").select("id", result, *carried)

In [None]:
edges_df, vertices_df = small_graph()
nbrs = Scope(edges_df)

### Connected Components

In [None]:
cc_df = vertices_df.withColumn("cc", col("id"))

for _ in range(5):
    cc_df = graphlab_step(
        scope=nbrs(Scope.Direction.INOUT),
        values=cc_df,
        nbr_combiner=F.min("cc").alias("acc_cc"),
        result=F.least(col("cc"), col("acc_cc")).alias("cc")
    )

cc_df.show()

### Single Source Shortest Path

In [None]:
min_dist_df = get_min_dist_init_df(vertices_df, start=1)

for _ in range(5):
    min_dist_df = graphlab_step(
        scope=nbrs(Scope.Direction.IN),
        values=min_dist_df,
        nbr_combiner=F.min(F.expr("dist + weight")).alias("nbr_min_dist"),
        result=F.least(col("dist"), col("nbr_min_dist")).alias("dist")
    )

min_dist_df.show()

### Topological Sort

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))

for _ in range(5):
    sorted_df = graphlab_step(
        scope=nbrs(Scope.Direction.IN),
        values=sorted_df,
        nbr_combiner=F.max("ord").alias("acc_ord"),
        result=F.greatest(col("ord"), F.expr("acc_ord + 1")).alias("ord")
    )

sorted_df.orderBy("ord").show()

### Page Rank

In [None]:
in_deg = edges_df.groupBy("dst").agg(F.count(F.lit(1)).alias("in_deg")).withColumnRenamed("dst", "id")
out_deg = edges_df.groupBy("src").agg(F.count(F.lit(1)).alias("out_deg")).withColumnRenamed("src", "id")
deg_vert_df = vertices_df.join(in_deg, "id", "left").join(out_deg, "id", "left").select("id", F.expr("nvl(in_deg, 1)").alias("in_deg"), F.expr("nvl(out_deg, 1)").alias("out_deg"))

In [None]:
page_rank_result_df = deg_vert_df.withColumn("value", F.lit(1))

for i in range(10):
    page_rank_result_df = graphlab_step(
        scope=nbrs(Scope.Direction.IN),
        values=page_rank_result_df,
        nbr_combiner=F.sum("value").alias("nbr_value"),
        result=F.expr("(0.15 + 0.85 * nvl(nbr_value, value)) / out_deg").alias("value"),
        out_deg=col("out_deg")
    )    
    if i % 5 == 0:
        page_rank_result_df = page_rank_result_df.checkpoint()

page_rank_result_df.show()

## PowerGraph

In [None]:
def powergraph_superstep(scope: DataFrame, values: DataFrame, gather: Column, combiner: Column, apply: Column, **columns: Column):
    """Runs one generic gather / sum / apply / scatter superstep.

    Parameters:
        scope (DataFrame) : neighbourhood view with the columns `id` and `nbr`; self-loops are ignored
        values (DataFrame) : current vertex state, keyed by `id`
        gather (Column) : expression collected from every neighbour
        combiner (Column) : aggregate over the gathered values per vertex
        apply (Column) : expression producing the new state from the vertex and the aggregate
        columns (Column) : extra columns to carry over into the result

    Returns:
        DataFrame with `id`, the `apply` column and the carried columns; the
        right join keeps vertices that gathered nothing.
    """
    # Gather
    loop_free_scope = scope.where("id != nbr").alias("s")
    gather_df = (
        loop_free_scope
            .join(values.alias("r"), col("r.id") == col("s.nbr"))
            .select(col("s.id"), gather)
    )
    # Sum
    sum_df = gather_df.groupBy("id").agg(combiner)
    # Apply
    carried = list(columns.values())
    apply_df = sum_df.join(values, "id", "right").select(col("id"), apply, *carried)
    # Scatter: nothing to do, the applied values are the new state.
    return apply_df

In [None]:
edges_df, vertices_df = small_graph()
nbrs = Scope(edges_df)

### Connected Components

In [None]:
cc_df = vertices_df.withColumn("cc", col("id"))

for _ in range(5):
    cc_df = powergraph_superstep(
        scope=nbrs(Scope.Direction.INOUT),
        values=cc_df,
        gather=col("cc"),
        combiner=F.min("cc").alias("acc_cc"),
        apply=F.least("cc", "acc_cc").alias("cc")
    )

cc_df.show()

### Single Source Shortest Path

In [None]:
min_dist_df = get_min_dist_init_df(vertices_df, start=1)

for _ in range(5):
    min_dist_df = powergraph_superstep(
        scope=nbrs(Scope.Direction.IN),
        values=min_dist_df,
        gather=F.expr("dist + weight").alias("dist"),
        combiner=F.min("dist").alias("acc_dist"),
        apply=F.least("acc_dist", "dist").alias("dist")
    )

min_dist_df.show()

### Topological Sort

In [None]:
sorted_df = vertices_df.withColumn("ord", F.lit(1))

for _ in range(5):
    sorted_df = powergraph_superstep(
        scope=nbrs(Scope.Direction.IN),
        values=sorted_df,
        gather=col("ord"),
        combiner=F.max("ord").alias("acc_ord"),
        apply=F.greatest(F.expr("acc_ord + 1"), "ord").alias("ord")
    )

sorted_df.orderBy("ord").show()

### Page Rank

In [None]:
in_deg = edges_df.groupBy("dst").agg(F.count(F.lit(1)).alias("in_deg")).withColumnRenamed("dst", "id")
out_deg = edges_df.groupBy("src").agg(F.count(F.lit(1)).alias("out_deg")).withColumnRenamed("src", "id")
deg_vert_df = vertices_df.join(in_deg, "id", "left").join(out_deg, "id", "left").select("id", F.expr("nvl(in_deg, 1)").alias("in_deg"), F.expr("nvl(out_deg, 1)").alias("out_deg"))

In [None]:
page_rank_result_df = deg_vert_df.withColumn("value", F.lit(1))

for i in range(10):
    page_rank_result_df = powergraph_superstep(
        scope=nbrs(Scope.Direction.IN),
        values=page_rank_result_df,
        gather=col("value"),
        combiner=F.sum("value").alias("in_value"),
        apply=F.expr("(0.15 + 0.85 * nvl(in_value, value)) / out_deg").alias("value"),
        out_deg=col("out_deg")
    )    
    if i % 5 == 0:
        page_rank_result_df = page_rank_result_df.checkpoint()

page_rank_result_df.show()

# Distributed Graph

In [None]:
edges_df, _ = graph()

In [None]:
edges_df_with_part = edges_df.withColumn("partition_id", F.spark_partition_id())

In [None]:
"""
select *
  from t1
     , t1 t2
 where t1.part_id != t2.part_id
   and t1.src == t2.src
"""
# Self-join of the partition-tagged edges: keep the pairs of rows that share
# the same `src` but carry different `partition_id` values.
df = (
    edges_df_with_part.alias("t1")
        .crossJoin(edges_df_with_part.alias("t2"))
        .where("t1.partition_id != t2.partition_id")
        .where("t1.src == t2.src")
        .select("t1.src", "t1.partition_id", "t2.src", "t2.partition_id")
)
# Preview the first 5 matching pairs.
df.show(5)