In [1]:
import os

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lit, current_timestamp, concat, desc, asc, rank, when, count, sum
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
from config_file import transformed_path

In [2]:
# Obtain the Spark session for this notebook under the application name 'ETL'.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

### Read Transformed DataFrame

In [3]:
# Load the transformed race results written by the earlier ETL stage.
# The path is built with os.path.join instead of embedding "\Race_Results" in the
# f-string: "\R" is not a valid string escape (Python warns about it and plans to
# reject it), and a hard-coded backslash only works as a separator on Windows.
race_results_path = os.path.join(transformed_path, "Race_Results")
race_results_df = spark.read.parquet(race_results_path)
# Preview the first rows to confirm the schema and contents.
race_results_df.show()

+---------+--------------------+-------------------+----------------+--------------------+-------------+------------------+--------+----+-----------+-----------+------+--------+--------------------+
|race_Year|           race_Name|          race_Date|circuit_Location|         driver_Name|driver_Number|driver_Nationality|    team|grid|  race_Time|fastest_Lap|points|position|        created_Date|
+---------+--------------------+-------------------+----------------+--------------------+-------------+------------------+--------+----+-----------+-----------+------+--------+--------------------+
|     2005|Australian Grand ...|2005-03-06 14:00:00|       Melbourne|   Christijan Albers|            0|             Dutch| Minardi|  17|         \N|         11|   0.0|    null|2022-06-13 18:28:...|
|     2005|Australian Grand ...|2005-03-06 14:00:00|       Melbourne|       Nick Heidfeld|            0|            German|Williams|   7|         \N|         38|   0.0|    null|2022-06-13 18:28:...|
|    

In [4]:
# Treat missing points as zero; only the "points" column is touched.
race_results_df = race_results_df.fillna(0, subset=["points"])

## Constructor Standings

In [5]:
# One row per (season, team): total points scored and number of race wins.
# `sum` and `count` are the pyspark.sql.functions aggregates imported at the top,
# not the Python builtins. `when` without an otherwise() yields null for
# non-winning rows, so `count` only tallies rows where position is 1.
constructor_standing_df = (
    race_results_df
    .groupBy('race_Year', 'team')
    .agg(
        sum('points').alias('total_points'),
        count(when(col('position') == 1, True)).alias('wins'),
    )
)

In [6]:
# Spot-check a single season: 2020 standings, highest points total first.
(
    constructor_standing_df
    .where(col('race_Year') == 2020)
    .sort(desc('total_points'))
    .show()
)

+---------+------------+------------+----+
|race_Year|        team|total_points|wins|
+---------+------------+------------+----+
|     2020|    Mercedes|       573.0|  13|
|     2020|    Red Bull|       319.0|   2|
|     2020|Racing Point|       210.0|   1|
|     2020|     McLaren|       202.0|   0|
|     2020|     Renault|       181.0|   0|
|     2020|     Ferrari|       131.0|   0|
|     2020|  AlphaTauri|       107.0|   1|
|     2020|  Alfa Romeo|         8.0|   0|
|     2020|Haas F1 Team|         3.0|   0|
|     2020|    Williams|         0.0|   0|
+---------+------------+------------+----+



In [8]:
# Rank teams within each season: most points first, with wins as the tie-breaker.
# rank() gives tied teams the same rank and leaves a gap after them.
window_cmd = (
    Window
    .partitionBy('race_Year')
    .orderBy(col('total_points').desc(), col('wins').desc())
)
constructor_standing_df = constructor_standing_df.withColumn(
    'rank',
    rank().over(window_cmd),
)

In [9]:
# Preview the ranked standings (first 20 rows, long cell values truncated).
constructor_standing_df.show(n=20, truncate=True)

+---------+---------------+------------+----+----+
|race_Year|           team|total_points|wins|rank|
+---------+---------------+------------+----+----+
|     1959|  Cooper-Climax|        97.5|   5|   1|
|     1959|        Ferrari|        67.0|   2|   2|
|     1959|            BRM|        22.5|   1|   3|
|     1959|         Watson|        14.0|   1|   4|
|     1959|       Lesovsky|         5.0|   0|   5|
|     1959|     Team Lotus|         5.0|   0|   5|
|     1959|        Epperly|         5.0|   0|   5|
|     1959|         Sutton|         0.0|   0|   8|
|     1959|            Fry|         0.0|   0|   8|
|     1959|       Maserati|         0.0|   0|   8|
|     1959|          Elder|         0.0|   0|   8|
|     1959|   Kurtis Kraft|         0.0|   0|   8|
|     1959|        Tec-Mec|         0.0|   0|   8|
|     1959|        Porsche|         0.0|   0|   8|
|     1959|      Connaught|         0.0|   0|   8|
|     1959|Cooper-Borgward|         0.0|   0|   8|
|     1959|    Cooper-OSCA|    

In [10]:
# Persist the ranked standings as parquet, replacing any previous output.
# The path is built with os.path.join instead of embedding "\Constructor_Standings"
# in the f-string: "\C" is not a valid string escape (Python warns about it and
# plans to reject it), and a hard-coded backslash only works as a separator on Windows.
constructor_standings_path = os.path.join(transformed_path, "Constructor_Standings")
constructor_standing_df.write.mode('overwrite').parquet(constructor_standings_path)