In [1]:
import sys

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col
from pyspark.sql import functions as F

In [2]:
spark = (
    SparkSession
        .builder
        .appName("Pregel")
        .master("local[4]")
        .getOrCreate()
)

## Компоненты связности графа

In [None]:
verties = [ (x, x) for x in range(1, 7) ]
verties_df = (
    spark
        .createDataFrame(verties)
        .toDF("id", "value")
)

In [None]:
edges = [ (1, 2), (1,3), (2, 3), (3, 4), (4,5), (6, 6) ]
edges_df = (
    spark
        .createDataFrame(edges)
        .toDF("src", "dst")
)

In [None]:
for _ in range(5):
    verties_df = (
        edges_df.join(verties_df, col("src") == col("id"))
          .select(col("dst").alias("id"), col("value"))
          .groupby(col("id")).agg(F.min("value").alias("message"))
          .join(verties_df, "id", "right")
          .select("id", F.coalesce("message", "value").alias("message"), "value")
          .select("id", F.least("message", "value").alias("value"))
    )
verties_df.show()

## Минимальное расстояние

In [None]:
start = 1

In [None]:
verties = [ (x, 0 if x == start else sys.maxsize // 2) for x in range(1, 6) ]
verties_df = (
    spark
        .createDataFrame(verties)
        .toDF("id", "value")
)

In [None]:
edges = [ (1, 2, 1), (1,3, 3), (2, 3, 1), (3, 4, 3), (4,5, 4) ]
edges_df = (
    spark
        .createDataFrame(edges)
        .toDF("src", "dst", "weight")
)

In [None]:
for _ in range(5):
    verties_df = (
        edges_df
            .join(verties_df, col("src") == col("id"))
            .select(col("dst").alias("id"), F.expr("value + weight").alias("value"))
            .groupby("id").agg(F.min("value").alias("message"))
            .join(verties_df, "id", "right")
            .select("id", F.coalesce("message", "value").alias("message"), "value")
            .select("id", F.least("message", "value").alias("value"))

    )
verties_df.show()

## PageRank

In [3]:
vertices = [ (chr(ord('A') + x), 1) for x in range(11) ]
verties_df = (
    spark
        .createDataFrame(vertices)
        .toDF("id", "value")
)

In [4]:
edges = [
    ('D', 'A'),
    ('D', 'B'), ('E', 'B'), ('F', 'B'), ('C', 'B'), ('G', 'B'), ('H', 'B'), ('I', 'B'),
    ('E', 'D'), ('E', 'F'), ('F', 'E'),
    ('G', 'E'), ('H', 'E'), ('I', 'E'), ('J', 'E'), ('K', 'E'),
    ('B', 'C')
]

edges_df = (
    spark
        .createDataFrame(edges)
        .toDF("src", "dst")
)

In [5]:
verties_df1 = verties_df

In [156]:
verties_df1 = (
    edges_df
        .join(verties_df, col("src") == col("id"))
        .select(col("dst").alias("id"), col("value"))
        .groupby("id").agg(F.sum(col("value")).alias("message"))
        .join(verties_df, "id", "right")
        .select("id", F.coalesce("message", "value").alias("message"), "value")
        .join(edges_df, col("id") == col("src"), "right")
        .groupby("id", "message", "value").count()
        .select("id", F.expr("(0.15 + 0.85 * message) / count").alias("value"))
)
verties_df1.show()

+---+------------------+
| id|             value|
+---+------------------+
|  J|1.0000000000000000|
|  B|6.1000000000000000|
|  H|0.5000000000000000|
|  I|0.5000000000000000|
|  F|0.5000000000000000|
|  K|1.0000000000000000|
|  C|1.0000000000000000|
|  D|0.5000000000000000|
|  E|1.7500000000000000|
|  G|0.5000000000000000|
+---+------------------+



In [8]:
spark.sparkContext.setCheckpointDir("plan/checkpoint")
for _ in range(10):
    verties_df1 = (
        edges_df
            .join(verties_df1, col("src") == col("id"))
            .select(col("dst").alias("id"), col("value"))
            .groupby("id").agg(F.sum(col("value")).alias("message"))
            .join(verties_df1, "id", "right")
            .select("id", F.coalesce("message", "value").alias("message"), "value")
            .join(edges_df, col("id") == col("src"), "right")
            .groupby("id", "message", "value").count()
            .select("id", F.expr("(0.15 + 0.85 * message) / count").alias("value"))
    ).checkpoint()

In [9]:
verties_df1.orderBy("id").show()

+---+--------+
| id|   value|
+---+--------+
|  B|7.620551|
|  C|6.312955|
|  D|0.436815|
|  E|0.851311|
|  F|0.436815|
|  G|0.130438|
|  H|0.130438|
|  I|0.130438|
|  J|1.000000|
|  K|1.000000|
+---+--------+

