In [34]:
import findspark

# findspark.init() has to run BEFORE anything from pyspark is imported: its job
# is to locate the Spark installation and put pyspark on sys.path. Calling it
# after the pyspark imports (as this cell previously did) is either a no-op or
# too late, because the imports would already have failed.
findspark.init()

from pyspark import SparkContext  # noqa: E402  (must follow findspark.init())
from pyspark.sql import SparkSession  # noqa: E402
from pyspark.sql import functions as func  # noqa: E402

# Build (or reuse, if one already exists in this kernel) a local Spark session.
# "local[*]" runs Spark in-process using all available cores.
spark: SparkSession = (
    SparkSession.builder
    .appName("JupyterLocalSpark")
    .master("local[*]")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session created above.
sc: SparkContext = spark.sparkContext

In [35]:
# Read the CSV, treating the first line as column names and letting Spark
# infer each column's type from the data.
df = spark.read.csv(
    "data/fakefriends-header.csv",
    header=True,
    inferSchema=True,
)

# Peek at the first five rows.
df.take(5)

[Row(userID=0, name='Will', age=33, friends=385),
 Row(userID=1, name='Jean-Luc', age=26, friends=2),
 Row(userID=2, name='Hugh', age=55, friends=221),
 Row(userID=3, name='Deanna', age=40, friends=465),
 Row(userID=4, name='Quark', age=68, friends=21)]

In [36]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [37]:
# Narrow the frame down to the "age" and "friends" columns, then print the
# resulting schema to confirm only those two remain.
df = df.select(["age", "friends"])
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [None]:
# Group rows by age and compute the mean of "friends" per group, rounded to
# two decimal places and exposed as the "friends_avg" column.
df = (
    df.groupBy("age")
    .agg(func.round(func.avg("friends"), 2).alias("friends_avg"))
)

# Peek at five of the aggregated rows.
df.take(5)

[Row(age=31, friends_avg=267.25),
 Row(age=65, friends_avg=298.2),
 Row(age=53, friends_avg=222.86),
 Row(age=34, friends_avg=245.5),
 Row(age=28, friends_avg=209.1)]

In [39]:
# Order the rows by ascending age and look at the first five.
df = df.orderBy("age")
df.take(5)

[Row(age=18, friends_avg=343.38),
 Row(age=19, friends_avg=213.27),
 Row(age=20, friends_avg=165.0),
 Row(age=21, friends_avg=350.88),
 Row(age=22, friends_avg=206.43)]

In [41]:
# Render the first five rows as a formatted text table.
df.show(n=5)

+---+-----------+
|age|friends_avg|
+---+-----------+
| 18|     343.38|
| 19|     213.27|
| 20|      165.0|
| 21|     350.88|
| 22|     206.43|
+---+-----------+
only showing top 5 rows

