In [38]:
import findspark

# findspark.init() has to run BEFORE anything from pyspark is imported: its job
# is to locate the Spark installation and make its Python libraries importable.
# Calling it after the pyspark imports (as before) means a kernel that does not
# already have pyspark on sys.path fails on the import and never reaches init().
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse, if this kernel already has one) a local Spark session;
# "local[*]" is the master URL passed to the builder.
spark = (
    SparkSession.builder
    .appName("JupyterLocalSpark")
    .master("local[*]")
    .getOrCreate()
)

# Low-level RDD entry point used by the cells below.
sc: SparkContext = spark.sparkContext

In [39]:
# Read the CSV as an RDD of raw text lines (one string per row), then preview
# the first five rows.
lines = sc.textFile(name="data/fakefriends.csv")
lines.take(5)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21']

In [40]:
def split_lines(x: str):
    """Parse one comma-separated row into an ``(age, friends_count)`` pair.

    The row is split on ``","``; the third field (index 2) is converted to an
    int and returned as the age, the fourth field (index 3) is converted to an
    int and returned as the friend count. Any other fields are ignored.

    Raises:
        IndexError: if the row has fewer than four comma-separated fields.
        ValueError: if either of the two fields is not a valid integer.
    """
    fields = x.split(",")
    # Age is converted first, then the friend count, so a malformed age is
    # reported before a malformed friend count.
    return int(fields[2]), int(fields[3])

# Turn every raw text line into an (age, friends_count) pair via split_lines,
# then preview the first five pairs.
rdd = lines.map(f=split_lines)
rdd.take(5)

[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]

In [41]:
# Wrap each value as (value, 1) so that a later per-key sum can accumulate a
# running total and a row count side by side.
# Note: `rdd` is rebound here, so re-running this cell alone wraps the values
# a second time.
rdd = rdd.mapValues(lambda friends_count: (friends_count, 1))
rdd.take(5)

[(33, (385, 1)), (26, (2, 1)), (55, (221, 1)), (40, (465, 1)), (68, (21, 1))]

In [42]:
# Combine the 2-tuples of each key element-wise: first elements are summed
# together and second elements are summed together.
rdd = rdd.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
rdd.take(5)

[(26, (4115, 17)),
 (40, (4264, 17)),
 (68, (2696, 10)),
 (54, (3615, 13)),
 (38, (2903, 15))]

In [43]:
# Replace each 2-tuple value with its first element divided by its second
# (true division, so the result is a float).
# Note: `rdd` is rebound here, so this cell expects tuple values and cannot be
# re-run on its own output.
rdd = rdd.mapValues(lambda sum_and_count: sum_and_count[0] / sum_and_count[1])
rdd.take(5)

[(26, 242.05882352941177),
 (40, 250.8235294117647),
 (68, 269.6),
 (54, 278.0769230769231),
 (38, 193.53333333333333)]

In [44]:
# Order the pairs by key, smallest key first, and show the first ten.
rdd = rdd.sortByKey(ascending=True)
rdd.take(10)

[(18, 343.375),
 (19, 213.27272727272728),
 (20, 165.0),
 (21, 350.875),
 (22, 206.42857142857142),
 (23, 246.3),
 (24, 233.8),
 (25, 197.45454545454547),
 (26, 242.05882352941177),
 (27, 228.125)]