### Init

In [None]:
from pyspark.sql import SparkSession
from pathlib import Path

In [None]:
# Alternative session with explicit tuning, kept for reference (disabled):
# spark = (
#     SparkSession.builder
#     .config("spark.driver.memory", "6g")
#     .config("spark.sql.adaptive.enabled", "True")
#     .master("local")
#     .getOrCreate()
# )

# Create (or reuse) a local session using the "local[*]" master, then keep a
# handle to its SparkContext for the RDD-based examples.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

### Basic RDD example

In [None]:
# Distribute the integers 0 through 100 (inclusive) as an RDD and add them up.
rdd = sc.parallelize(range(0, 101))
rdd.sum()
# Expected result: 5050

### Join Example

In [None]:
# First table: people and their numeric IDs
data1 = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
df1 = spark.createDataFrame(data1, schema=["Name", "ID"])

# Second table: people and their professions
data2 = [("Alice", "Engineer"), ("Bob", "Doctor"), ("David", "Lawyer")]
df2 = spark.createDataFrame(data2, schema=["Name", "Profession"])

# Inner join on the shared "Name" column; only names present in both
# tables (Alice and Bob) survive the join.
joined_df = df1.join(df2, on="Name", how="inner")

# Print the joined rows
joined_df.show()

### Demo - PageRank: shuffling data across executors on every iteration

In [None]:
def _split_rank(entry):
    """Spread a node's rank evenly over the nodes it links to.

    `entry` is one record of `links.join(ranks)`, i.e.
    (node, (outgoing_links, rank)). Returns a list of
    (destination, share) pairs, one per outgoing link.
    """
    _, (neighbors, rank) = entry
    # The division stays inside the comprehension so an empty adjacency
    # list simply yields no contributions.
    return [(dest, rank / len(neighbors)) for dest in neighbors]


# Toy graph as (node, [outgoing links]) pairs, hash-partitioned into 4 partitions
links = spark.sparkContext.parallelize(
    [
        ("A", ["B", "C"]),
        ("B", ["C"]),
        ("C", ["A"]),
        ("D", ["A", "C"]),
    ]
).partitionBy(4)

# Every node starts with a rank of 1.0
ranks = links.keys().map(lambda node: (node, 1.0))

# Run 10 PageRank iterations; each one joins the graph with the current
# ranks and then re-aggregates the contributions by destination node.
# NOTE: "D" has no inbound links, so it receives no contributions and is
# absent from `ranks` (and from the inner join) after the first iteration.
for _ in range(10):
    # What each node hands out to every node it links to
    contributions = links.join(ranks).flatMap(_split_rank)

    # Sum the incoming contributions per node and apply the 0.85 damping factor
    ranks = (
        contributions
        .reduceByKey(lambda left, right: left + right)
        .mapValues(lambda total: 0.15 + 0.85 * total)
    )

# Bring the final ranks back to the driver and display them
final_ranks = ranks.collect()
print(final_ranks)