##### Produce constructor standings

In [0]:
# Notebook parameter: the "p_file_date" text widget (default "2021-03-28") selects
# which file date is processed; its current value is read into v_file_date.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Run the shared configuration notebook inline so the names it defines become
# available in this notebook.
# presumably this is where presentation_folder_path (used below) is defined —
# verify in ../includes/configuration.
# NOTE(review): Databricks documents that %run must be in a cell by itself —
# confirm that leading comment lines in this cell are accepted.
%run "../includes/configuration"

In [0]:
# Run the shared helper notebook inline so its functions can be called from
# this notebook.
# presumably this provides df_column_to_list and merge_delta_data (both used
# below) — verify in ../includes/common_functions.
%run "../includes/common_functions"

Find race years for which the data is to be reprocessed

In [0]:
from pyspark.sql.functions import col

# Read the race_results Delta table and keep only the rows that belong to the
# file date being processed (v_file_date, taken from the p_file_date widget).
# The predicate is built as a Column expression rather than by interpolating
# the widget value into a SQL string, so a value containing quotes or other
# SQL syntax cannot break or alter the filter.
race_results_df = (
    spark.read.format("delta")
    .load(f"{presentation_folder_path}/race_results")
    .filter(col("file_date") == v_file_date)
)

In [0]:
# Turn the 'race_year' column of the file-date-filtered results into a Python
# list via the shared helper; the list drives the reload in the next cell.
# NOTE(review): whether the helper de-duplicates values is not visible here.
race_year_list = df_column_to_list(
    race_results_df,
    'race_year',
)

In [0]:
from pyspark.sql.functions import col

# Re-read the race_results Delta table, this time keeping every row whose
# race_year appears in race_year_list (not only the rows for the current file
# date). This replaces the earlier, file-date-filtered race_results_df.
race_results_df = (
    spark.read
    .format("delta")
    .load(f"{presentation_folder_path}/race_results")
    .where(col("race_year").isin(race_year_list))
)

In [0]:
# Import the functions module under an alias instead of importing `sum` and
# `count` by name: a bare `from pyspark.sql.functions import sum` replaces
# Python's builtin sum() for every later cell in the notebook namespace.
from pyspark.sql import functions as F

# Aggregate race results per (race_year, team):
#   total_points - sum of the "points" column
#   wins         - number of rows where position == 1; when() without an
#                  otherwise() yields NULL for non-matching rows, and count()
#                  skips NULLs, so only first-place rows are counted
constructor_standings_df = (
    race_results_df
    .groupBy("race_year", "team")
    .agg(
        F.sum("points").alias("total_points"),
        F.count(F.when(F.col("position") == 1, True)).alias("wins"),
    )
)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

# Ranking window: one partition per race year, ordered by total points and
# then by wins, both descending.
constructor_rank_spec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"), desc("wins"))
)

# Attach each team's rank within its race year as a new "rank" column.
final_df = constructor_standings_df.withColumn(
    "rank",
    rank().over(constructor_rank_spec),
)

In [0]:
# A target row (tgt) matches a source row (src) when both team and race_year
# are equal.
merge_condition = "tgt.team = src.team AND tgt.race_year = src.race_year"

# Merge final_df into the Delta table using the shared helper. Positional
# arguments, in order: source DataFrame, database name, table name, path to
# the Delta table, merge condition, partition column.
merge_delta_data(
    final_df,
    'f1_presentation',
    'constructor_standings',
    presentation_folder_path,
    merge_condition,
    'race_year',
)

In [0]:
%sql
-- Sanity check: every column of the constructor standings for race year 2021
SELECT *
FROM f1_presentation.constructor_standings
WHERE race_year = 2021;

In [0]:
%sql
-- Sanity check: number of standings rows per race year, most recent year first
SELECT race_year, COUNT(1)
FROM f1_presentation.constructor_standings
GROUP BY race_year
ORDER BY race_year DESC;