In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window

In [2]:
# Local Spark session: master "local[4]" with the shuffle partition count
# lowered to 4.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", 4)
    .master("local[4]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/04 21:23:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Race -> season lookup: keep only the race id and the year it belongs to.
races = (
    spark.read
    .parquet("../../data/parquet/races.parquet")
    .select("raceId", "year")
)

In [4]:
# Driver id plus a display name built as "<forename> <surname>".
driverInfo = (
    spark.read
    .parquet("../../data/parquet/drivers.parquet")
    .select(
        "driverId",
        F.concat(F.col("forename"), F.lit(" "), F.col("surname")).alias("fullName"),
    )
)

In [5]:
# Rows of one driver within one race, ordered by lap.
raceDriverLapWindow = (
    Window
    .partitionBy("driverId", "raceId")
    .orderBy("lap")
)

# All rows belonging to one driver (unordered).
driverWindow = Window.partitionBy("driverId")

# All rows belonging to one season (unordered).
seasonWindow = Window.partitionBy("year")

In [6]:
# Last race of every season: the race whose (integer) round number equals the
# highest round number seen for that year.
lastRaces = (
    spark.read
    .parquet("../../data/parquet/races.parquet")
    .withColumn("round", F.col("round").cast(T.IntegerType()))
    .withColumn("max", F.max("round").over(Window.partitionBy("year")))
    .filter(F.col("round") == F.col("max"))
    .select("raceId", "year")
)

In [7]:
# Loaded as-is; joined below on ("driverId", "year") and expected to provide
# "constructorId" (schema not visible here -- verify against the parquet file).
driverConstSeasonMap = spark.read.parquet(
    "../../data/parquet/drivers_constr_season.parquet"
)

In [8]:
# Rows sharing the same season and constructor.
teammateWindow = (
    Window
    .partitionBy("year", "constructorId")
)

In [9]:
# Per-season teammate domination based on the standings at each season's last
# race. The right join keeps every row of lastRaces; the left join then
# attaches whatever driverConstSeasonMap holds for (driverId, year).
driverDomination = (
    spark.read
    .parquet("../../data/parquet/driver_standings.parquet")
    .join(lastRaces, on=["raceId"], how="right")
    .join(driverConstSeasonMap, on=["driverId", "year"], how="left")
    # Share of the (year, constructorId) points total held by this row.
    .withColumn(
        "teamPointsPerc",
        F.col("points") / F.sum("points").over(teammateWindow),
    )
    # Highest share inside the same (year, constructorId) group.
    .withColumn("bestOfTeam", F.max("teamPointsPerc").over(teammateWindow))
    # 1 when this row holds the highest share of its group, else 0.
    .withColumn(
        "dominatedTeammate",
        F.when(F.col("teamPointsPerc") == F.col("bestOfTeam"), 1).otherwise(0),
    )
    # Percentage of the driver's rows flagged above, rounded to 2 decimals.
    .withColumn(
        "dominationPerc",
        F.round(
            F.sum("dominatedTeammate").over(driverWindow)
            / F.count("year").over(driverWindow)
            * 100,
            2,
        ),
    )
    .select("year", "driverId", "dominatedTeammate", "dominationPerc")
)

                                                                                

In [10]:
# Ids of drivers with fewer than 5 results whose statusId equals 1.
# Used further down as the right side of "leftanti" joins, i.e. these drivers
# are removed from the analysis.
driverFilter = (
    spark.read
    .parquet("../../data/parquet/results.parquet")
    .groupBy("driverId")
    .agg(
        F.sum(F.when(F.col("statusId") == 1, 1).otherwise(0)).alias("numberOfFinishes")
    )
    .filter(F.col("numberOfFinishes") < 5)
    .select("driverId")
)

                                                                                

In [11]:
# Rows of one constructor within one race.
raceConstructorWindow = (
    Window
    .partitionBy("raceId", "constructorId")
)

# Rows of one constructor within one season.
seasonConstructorWindow = (
    Window
    .partitionBy("year", "constructorId")
)

# Rows of one driver within one season.
driverSeasonWindow = (
    Window
    .partitionBy("driverId", "year")
)

In [12]:
# Per-driver averages describing how often the driver was the best-placed car
# of their constructor, both on the grid and at the finish.
teammateComparison = (
    spark.read
    .parquet("../../data/parquet/results.parquet")
    .join(driverFilter, on=["driverId"], how="leftanti")
    .join(races, on="raceId")
    .withColumn("position", F.col("position").cast(T.IntegerType()))
    .withColumn("grid", F.col("grid").cast(T.IntegerType()))
    # Missing position/grid becomes the sentinel 100 so it never wins a min().
    .fillna({"position": 100, "grid": 100})
    # grid == 0 gets the same sentinel (presumably a non-regular start -- verify).
    .withColumn("grid", F.when(F.col("grid") == 0, 100).otherwise(F.col("grid")))

    # --- finishing position ---
    # Best finishing position of the constructor in this race.
    .withColumn("topPos", F.min("position").over(raceConstructorWindow))
    .withColumn(
        "constructorBestPos",
        F.when(F.col("topPos") == F.col("position"), 1).otherwise(0),
    )
    # Percentage of the driver's races in the season where they matched it.
    .withColumn(
        "topPosPerc",
        F.sum("constructorBestPos").over(driverSeasonWindow)
        / F.count("raceId").over(driverSeasonWindow)
        * 100,
    )
    # Highest such percentage within the constructor for the season.
    .withColumn("constTopPosPerc", F.max("topPosPerc").over(seasonConstructorWindow))
    .withColumn(
        "driverDomConstPos",
        F.when(F.col("constTopPosPerc") == F.col("topPosPerc"), 1).otherwise(0),
    )

    # --- grid position (same steps as above, on "grid") ---
    .withColumn("topGrid", F.min("grid").over(raceConstructorWindow))
    .withColumn(
        "constructorBestGridPos",
        F.when(F.col("topGrid") == F.col("grid"), 1).otherwise(0),
    )
    .withColumn(
        "topGridPerc",
        F.sum("constructorBestGridPos").over(driverSeasonWindow)
        / F.count("raceId").over(driverSeasonWindow)
        * 100,
    )
    .withColumn("constTopGridPerc", F.max("topGridPerc").over(seasonConstructorWindow))
    .withColumn(
        "driverDomConstGrid",
        F.when(F.col("constTopGridPerc") == F.col("topGridPerc"), 1).otherwise(0),
    )

    # Collapse to one row per (driver, season), then average over seasons.
    .dropDuplicates(["driverId", "year"])
    .withColumn("avgTopPosPerc", F.avg("topPosPerc").over(driverWindow))
    .withColumn("avgTopGridPerc", F.avg("topGridPerc").over(driverWindow))
    .withColumn("avgPosDom", F.avg("driverDomConstPos").over(driverWindow))
    .withColumn("avgGridDom", F.avg("driverDomConstGrid").over(driverWindow))

    # Collapse to one row per driver.
    .dropDuplicates(["driverId"])
    .select("driverId", "avgTopPosPerc", "avgTopGridPerc", "avgPosDom", "avgGridDom")
)

                                                                                

In [13]:
def averageRank(cols):
    """Return the arithmetic mean of the numbers in ``cols``.

    Parameters
    ----------
    cols : sequence of numbers, or None
        The values to average (for example the elements of a Spark array
        column handed to a Python UDF).

    Returns
    -------
    float or None
        The mean as a Python ``float``. ``None`` is returned when ``cols`` is
        ``None`` or empty, instead of raising, so a UDF built on this function
        yields a null rather than failing the whole job.
    """
    if cols is None or len(cols) == 0:
        return None
    # Explicit float: a UDF declared with DoubleType needs a Python float back.
    return float(sum(cols) / len(cols))

In [14]:
# Wrap the plain Python function as a Spark UDF that returns a double.
# NOTE: this rebinds the name `averageRank`, so after this cell runs the name
# refers to the UDF and no longer to the original Python function.
averageRank = F.udf(averageRank, returnType=T.DoubleType())

Usar la UDF definida arriba da resultados erróneos. Por ello se suman las columnas y se divide entre 3. Esto es menos flexible, ya que si quisiera calcular la media del contenido de 6 columnas tendría que hacerlo a mano. Para solucionarlo también se puede castear la columna avgRank a Double. El motivo es que, por defecto, las UDF de PySpark devuelven un String si no se indica el tipo de retorno; por eso es importante establecer explícitamente el tipo del output de la función.

In [15]:
# Per-driver career statistics, one rank per statistic, the mean of those ranks
# ("stats") and the final ranking ("rank"), joined with the driver's full name.
#
# Changes compared with the previous version of this cell:
#   * dominationPerc is joined per driver (on driverId only, de-duplicated).
#     It is computed over a per-driver window in driverDomination, so it is the
#     same on every row of a driver. Joining on (driverId, year) could
#     duplicate race rows when driverDomination holds several rows for one
#     driver-season, and left null values on unmatched seasons that the later
#     dropDuplicates(["driverId"]) could pick arbitrarily.
#   * the first "poleChance" column was immediately overwritten by the second
#     definition; only the surviving definition is kept.
#   * the final sort now runs after the join with driverInfo, because row order
#     is not guaranteed to survive a join.
#
# NOTE(review): percSeasonsWithPole / percSeasonsWithWins divide a sum over the
# driver's race rows by the number of race rows, so seasons with more races
# weigh more. If a true "share of seasons" is intended, count distinct seasons
# instead -- confirm the intent before changing.
results = (
    spark.read.parquet("../../data/parquet/results.parquet")
    # Drop the drivers listed in driverFilter and attach the season of each race.
    .join(driverFilter, ["driverId"], "leftanti")
    .join(races, "raceId")
    .withColumn("grid", F.col("grid").cast(T.IntegerType()))
    .withColumn("position", F.col("position").cast(T.IntegerType()))

    # Front-row starts (grid 1 or 2).
    .withColumn("firstRowStart", F.when((F.col("grid") == 1) | (F.col("grid") == 2), 1).otherwise(0))
    .withColumn("firstRowChance", F.round(F.sum(F.col("firstRowStart")).over(driverWindow) / F.count(F.col("firstRowStart")).over(driverWindow), 4) * 100)

    # Per-driver metrics computed in earlier cells.
    .join(
        driverDomination.select("driverId", "dominationPerc").dropDuplicates(["driverId"]),
        ["driverId"],
        "left",
    )
    .join(teammateComparison, ["driverId"], "left")

    # Average grid slot and finishing position.
    .withColumn("avgGridStart", F.round(F.avg(F.col("grid")).over(driverWindow), 2))
    .withColumn("avgFinish", F.round(F.avg(F.col("position")).over(driverWindow), 2))

    # Pole positions.
    .withColumn("pole", F.when(F.col("grid") == 1, 1).otherwise(0))
    .withColumn("totalPolePositions", F.sum(F.col("pole")).over(driverWindow))
    .withColumn("polesPerSeason", F.sum(F.col("pole")).over(driverSeasonWindow))
    .withColumn("poleChance", F.round(F.col("totalPolePositions") / F.count(F.col("raceId")).over(driverWindow) * 100, 2))
    .withColumn("hasPoleThisSeason", F.when(F.col("polesPerSeason") > 0, 1).otherwise(0))
    .withColumn("percSeasonsWithPole", F.round(F.sum(F.col("hasPoleThisSeason")).over(driverWindow) / F.count(F.col("year")).over(driverWindow), 4) * 100)

    # Victories.
    .withColumn("win", F.when(F.col("position") == 1, 1).otherwise(0))
    .withColumn("totalVictories", F.sum(F.col("win")).over(driverWindow))
    .withColumn("victoryChance", F.round(F.col("totalVictories") / F.count(F.col("win")).over(driverWindow), 4) * 100)
    .withColumn("winsPerSeason", F.sum(F.col("win")).over(driverSeasonWindow))
    .withColumn("hasWonThisSeason", F.when(F.col("winsPerSeason") > 0, 1).otherwise(0))
    .withColumn("percSeasonsWithWins", F.round(F.sum(F.col("hasWonThisSeason")).over(driverWindow) / F.count(F.col("year")).over(driverWindow), 4) * 100)

    # Podiums (positions 1 to 3).
    .withColumn("podium", F.when((F.col("position") == 1) | (F.col("position") == 2) | (F.col("position") == 3), 1).otherwise(0))
    .withColumn("podiumChance", F.round(F.sum(F.col("podium")).over(driverWindow) / F.count(F.col("podium")).over(driverWindow), 4) * 100)

    # Every selected column is constant per driver at this point, so keep one row each.
    .dropDuplicates(["driverId"])
    .select("driverId", "firstRowChance", "avgGridStart", "avgFinish",
            "totalPolePositions", "poleChance", "percSeasonsWithPole",
            "percSeasonsWithWins", "podiumChance", "dominationPerc",
            "avgTopPosPerc", "avgTopGridPerc", "avgPosDom", "avgGridDom")

    # One global rank per statistic. These windows have no partitionBy, so Spark
    # moves all rows to a single partition and logs a WindowExec warning for each.
    .withColumn("rankFRC", F.rank().over(Window.orderBy(F.col("firstRowChance").desc())))
    .withColumn("rankAGS", F.rank().over(Window.orderBy(F.col("avgGridStart").asc())))
    .withColumn("rankAF", F.rank().over(Window.orderBy(F.col("avgFinish").asc())))
    .withColumn("rankTPP", F.rank().over(Window.orderBy(F.col("totalPolePositions").desc())))
    .withColumn("rankPSWP", F.rank().over(Window.orderBy(F.col("percSeasonsWithPole").desc())))
    .withColumn("rankPSWW", F.rank().over(Window.orderBy(F.col("percSeasonsWithWins").desc())))
    .withColumn("rankPC", F.rank().over(Window.orderBy(F.col("podiumChance").desc())))
    .withColumn("rankDom", F.rank().over(Window.orderBy(F.col("dominationPerc").desc())))
    .withColumn("rankPoleC", F.rank().over(Window.orderBy(F.col("poleChance").desc())))
    .withColumn("rankPosPerc", F.rank().over(Window.orderBy(F.col("avgTopPosPerc").desc())))
    .withColumn("rankGridPerc", F.rank().over(Window.orderBy(F.col("avgTopGridPerc").desc())))
    .withColumn("rankPosDom", F.rank().over(Window.orderBy(F.col("avgPosDom").desc())))
    .withColumn("rankGridDom", F.rank().over(Window.orderBy(F.col("avgGridDom").desc())))

    # Mean of the 13 ranks via the averageRank UDF; a lower value is better.
    .withColumn("stats", averageRank(
        F.array(F.col("rankFRC"),
            F.col("rankAGS"),
            F.col("rankAF"),
            F.col("rankTPP"),
            F.col("rankPSWP"),
            F.col("rankPSWW"),
            F.col("rankPC"),
            F.col("rankDom"),
            F.col("rankPoleC"),
            F.col("rankPosPerc"),
            F.col("rankGridPerc"),
            F.col("rankPosDom"),
            F.col("rankGridDom")
           )
    ))
    .withColumn("rank", F.rank().over(Window.orderBy(F.col("stats").asc())))

    # Attach the display name first, then sort.
    .join(driverInfo, "driverId")
    .sort(F.col("rank").asc())
)

                                                                                

In [16]:
# results.collect()

In [17]:
import time
def current_milli_time():
    """Return the current wall-clock time as whole milliseconds since the epoch."""
    seconds_since_epoch = time.time()
    return round(seconds_since_epoch * 1000)

def run():
    """Collect ``results`` once and return the elapsed time in milliseconds.

    Returns
    -------
    int
        Elapsed time of ``results.collect()`` rounded to whole milliseconds.
    """
    # perf_counter() is monotonic, so the measurement cannot be skewed (or go
    # negative) if the system clock is adjusted while the job is running.
    start = time.perf_counter()
    results.collect()
    return round((time.perf_counter() - start) * 1000)

def average(l):
    """Return the arithmetic mean of the numbers in ``l``."""
    total = 0
    for value in l:
        total += value
    return total / len(l)
    
def time_test(runs=1):
    """Execute ``run()`` ``runs`` times and return the mean elapsed time.

    Parameters
    ----------
    runs : int, optional
        Number of timed executions. Defaults to 1, which matches the previous
        hard-coded behaviour.

    Returns
    -------
    float
        Mean of the per-run timings returned by ``run()``.

    Raises
    ------
    ValueError
        If ``runs`` is smaller than 1 (there would be nothing to average).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    timings = [run() for _ in range(runs)]
    return average(timings)

# Run the benchmark when the cell executes and print the mean elapsed time
# reported by time_test() (milliseconds, as measured in run()).
res = time_test()
print(res)

22/05/04 21:23:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 2

22/05/04 21:23:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 2

22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 2

22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 21:23:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/04 2

4850.0


22/05/04 21:23:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [18]:
# x = input()