In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lit, current_timestamp, concat, desc, asc, rank, when, count, sum
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
from config_file import transformed_path

In [2]:
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

## Read Transformed DataFrame

In [3]:
# Load the Race_Results parquet dataset stored under `transformed_path`.
# The backslash is written as "\\" because "\R" is not a valid escape sequence
# and newer Python versions emit a warning for it; the resulting path string
# is exactly the same as before.
# NOTE(review): a backslash separator is Windows-style — confirm whether this
# notebook also needs to run on other platforms.
race_results_df = spark.read.parquet(f"{transformed_path}\\Race_Results")
race_results_df.show()

+---------+--------------------+-------------------+----------------+--------------------+-------------+------------------+--------+----+-----------+-----------+------+--------+--------------------+
|race_Year|           race_Name|          race_Date|circuit_Location|         driver_Name|driver_Number|driver_Nationality|    team|grid|  race_Time|fastest_Lap|points|position|        created_Date|
+---------+--------------------+-------------------+----------------+--------------------+-------------+------------------+--------+----+-----------+-----------+------+--------+--------------------+
|     2005|Australian Grand ...|2005-03-06 14:00:00|       Melbourne|   Christijan Albers|            0|             Dutch| Minardi|  17|         \N|         11|   0.0|    null|2022-06-13 18:28:...|
|     2005|Australian Grand ...|2005-03-06 14:00:00|       Melbourne|       Nick Heidfeld|            0|            German|Williams|   7|         \N|         38|   0.0|    null|2022-06-13 18:28:...|
|    

In [4]:
# Replace missing values in the "points" column with 0 so the season totals
# below are computed over a fully populated column.
race_results_df = race_results_df.fillna(0, subset=["points"])

## Driver Standings

In [5]:
# Per-season driver totals: summed points and number of race wins.
# `sum` here is pyspark.sql.functions.sum (imported at the top of the notebook),
# not the Python builtin.
# NOTE(review): 'team' is one of the grouping keys, so a driver who raced for
# two teams in the same season gets one row per team — confirm this is intended.
driver_standing_df = (
    race_results_df
    .groupBy('race_Year', 'driver_Name', 'driver_Nationality', 'team')
    .agg(
        sum('points').alias('total_Points'),
        # when() without otherwise() yields null for non-winning rows,
        # and count() ignores nulls, so only position == 1 rows are counted.
        count(when(col('position') == 1, True)).alias('wins'),
    )
)

In [6]:
# Preview the 2020 season totals, highest points first.
(
    driver_standing_df
    .filter("race_Year==2020")
    .orderBy(col('total_Points').desc())
    .show()
)

+---------+------------------+------------------+------------+------------+----+
|race_Year|       driver_Name|driver_Nationality|        team|total_Points|wins|
+---------+------------------+------------------+------------+------------+----+
|     2020|    Lewis Hamilton|           British|    Mercedes|       347.0|  11|
|     2020|   Valtteri Bottas|           Finnish|    Mercedes|       223.0|   2|
|     2020|    Max Verstappen|             Dutch|    Red Bull|       214.0|   2|
|     2020|      Sergio Pérez|           Mexican|Racing Point|       125.0|   1|
|     2020|  Daniel Ricciardo|        Australian|     Renault|       119.0|   0|
|     2020|      Carlos Sainz|           Spanish|     McLaren|       105.0|   0|
|     2020|   Alexander Albon|              Thai|    Red Bull|       105.0|   0|
|     2020|   Charles Leclerc|        Monegasque|     Ferrari|        98.0|   0|
|     2020|      Lando Norris|           British|     McLaren|        97.0|   0|
|     2020|      Pierre Gasl

In [7]:
# Rank drivers within each season: most points first, ties broken by wins.
# The ordering column is spelled 'total_Points' to match the alias created in
# the aggregation cell; the previous lowercase 'total_points' only resolved
# because Spark's column resolution is case-insensitive by default.
window_cmd = Window.partitionBy('race_Year').orderBy(desc('total_Points'), desc('wins'))

# rank() gives tied rows the same rank and leaves a gap afterwards
# (e.g. 6, 6, 8 in the 2020 output below).
driver_standing_df = driver_standing_df.withColumn('rank', rank().over(window_cmd))

In [9]:
# Inspect the 2020 standings with the newly added rank column.
(
    driver_standing_df
    .filter('race_Year==2020')
    .show()
)

+---------+------------------+------------------+------------+------------+----+----+
|race_Year|       driver_Name|driver_Nationality|        team|total_Points|wins|rank|
+---------+------------------+------------------+------------+------------+----+----+
|     2020|    Lewis Hamilton|           British|    Mercedes|       347.0|  11|   1|
|     2020|   Valtteri Bottas|           Finnish|    Mercedes|       223.0|   2|   2|
|     2020|    Max Verstappen|             Dutch|    Red Bull|       214.0|   2|   3|
|     2020|      Sergio Pérez|           Mexican|Racing Point|       125.0|   1|   4|
|     2020|  Daniel Ricciardo|        Australian|     Renault|       119.0|   0|   5|
|     2020|      Carlos Sainz|           Spanish|     McLaren|       105.0|   0|   6|
|     2020|   Alexander Albon|              Thai|    Red Bull|       105.0|   0|   6|
|     2020|   Charles Leclerc|        Monegasque|     Ferrari|        98.0|   0|   8|
|     2020|      Lando Norris|           British|     

In [8]:
# Persist the ranked standings as parquet under `transformed_path`, replacing
# any existing output at that location (mode 'overwrite').
# The backslash is written as "\\" because "\D" is not a valid escape sequence
# and newer Python versions emit a warning for it; the resulting path string
# is exactly the same as before.
driver_standing_df.write.mode('overwrite').parquet(f"{transformed_path}\\Driver_Standings")