In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window

In [3]:
spark = SparkSession.builder\
      .config("spark.sql.shuffle.partitions", 64)\
      .master("local[*]")\
      .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/15 09:35:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
races = spark.read.format("csv")\
    .option("header", "true")\
    .option("sep", ",")\
    .load("../data/races.csv")\
    .select("raceId", "year")

In [5]:
seasonWindow = Window.partitionBy("year")
driverRaceWindow = Window.partitionBy("driverId", "raceId")
raceDriverLapWindow = Window.partitionBy("driverId", "raceId").orderBy("lap")

In [11]:
overtakes2 = spark.read.format("csv")\
    .option("header", "true")\
    .option("sep", ",")\
    .load("../data/lap_times.csv")\
    .withColumn("position", F.col("position").cast(T.IntegerType()))\
    .withColumn("lap", F.col("lap").cast(T.IntegerType()))\
    .join(races, "raceId")\
    .withColumn("positionNextLap", F.lead(F.col("position"), 1).over(raceDriverLapWindow))\
    .withColumn("positionsGainedLap", F.when(F.col("positionNextLap") < F.col("position") , F.abs(F.col("position") - F.col("positionNextLap"))).otherwise(0))\
    .groupBy("year")\
    .agg(F.sum(F.col("positionsGainedLap")).alias("positionsGainedSeason"))\
    .withColumn("rankPositionsGained", F.rank().over(Window.orderBy(F.col("positionsGainedSeason").desc())))

In [9]:
leadersTroughoutSeason = spark.read.format("csv")\
    .option("header", "true")\
    .option("sep", ",")\
    .load("../data/driver_standings.csv")\
    .join(races, "raceId")\
    .where(F.col("position") == 1)\
    .dropDuplicates(["driverId", "position", "year"])\
    .groupBy("year")\
    .agg(F.approx_count_distinct(F.col("driverId")).alias("distinctLeaders"))\
    .withColumn("rankDistinctLeaders", F.rank().over(Window.orderBy(F.col("distinctLeaders").desc())))

In [10]:
winnersTroughoutSeason = spark.read.format("csv")\
    .option("header", "true")\
    .option("sep", ",")\
    .load("../data/results.csv")\
    .join(races, "raceId")\
    .where(F.col("position") == 1)\
    .dropDuplicates(["driverId", "position", "year"])\
    .groupBy("year")\
    .agg(F.approx_count_distinct(F.col("driverId")).alias("distinctWinners"))\
    .withColumn("rankDistinctWinners", F.rank().over(Window.orderBy(F.col("distinctWinners").desc())))

In [12]:
def averageRank(cols):
    return sum(cols) / len(cols)

In [13]:
averageRank = F.udf(averageRank, T.IntegerType())

Usar la UDF definida arriba da resultados erróneos. Por ello se suman las columnas y se divide entre 3. Esto es menos flexible ya que si quisiera hacer la media del contenido de 6 columnas tendría que hacerlo a mano. Para solucionarlo también se puede castear la columna avgRank a Integer. Por defecto debe ser que las UDF devuelven un String en Python. Es importante establecer el tipo del output de la función.

In [15]:
overtakes\
    .join(leadersTroughoutSeason, "year", "inner")\
    .join(winnersTroughoutSeason, "year", "inner")\
    .withColumn("avgRank", averageRank(F.array(F.col("rankDistinctWinners"), F.col("rankDistinctLeaders"), F.col("rankPositionsGained"))))\
    .withColumn("overallRank", F.rank().over(Window.orderBy("avgRank")))\
    .drop("rankDistinctWinners", "rankDistinctLeaders", "rankPositionsGained", "avgRank")\
    .sort("overallRank")\
    .show()

22/02/13 16:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/13 16:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/13 16:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/13 16:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/13 16:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/13 16:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/13 1

+----+---------------------+---------------+---------------+-----------+
|year|positionsGainedSeason|distinctLeaders|distinctWinners|overallRank|
+----+---------------------+---------------+---------------+-----------+
|2012|                 5077|              4|              8|          1|
|2013|                 4697|              2|              5|          1|
|2011|                 4627|              1|              5|          1|
|2016|                 4613|              2|              4|          1|
|2014|                 3873|              2|              3|          1|
|2019|                 3201|              2|              5|          1|
|2004|                 3194|              1|              5|          1|
|2008|                 3123|              4|              7|          1|
|2007|                 2974|              3|              4|          1|
|2003|                 2955|              3|              8|          1|
|2021|                 2901|              2|       

22/02/13 16:16:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
