## Ingest races.csv file
### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
# Text widget "p_data_source" (empty by default); its value is later written
# to the "data_source" column.
DATA_SOURCE_WIDGET = "p_data_source"
dbutils.widgets.text(DATA_SOURCE_WIDGET, "")
v_data_source = dbutils.widgets.get(DATA_SOURCE_WIDGET)

In [0]:
# Text widget "p_file_date" (defaults to "2021-03-21"); its value selects the
# dated sub-folder that races.csv is read from.
FILE_DATE_WIDGET = "p_file_date"
dbutils.widgets.text(FILE_DATE_WIDGET, "2021-03-21")
v_file_date = dbutils.widgets.get(FILE_DATE_WIDGET)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType, StructField, TimestampType, DateType

In [0]:
# Explicit schema for races.csv, so Spark does not have to infer column types.
# Each entry is (column name, type, nullable).
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Location of this run's races.csv: <raw folder>/<file date>/races.csv
races_csv_path = f"{raw_folder_path}/{v_file_date}/races.csv"

# Read the CSV with a header row, applying the explicit schema defined above.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv(races_csv_path)
)

In [0]:
# Preview the raw races data as read from the CSV.
display(races_df)

In [0]:
# Print the column names and types of the loaded DataFrame.
races_df.printSchema()

### Step 2 - Select only the required columns

In [0]:
# Column selection by plain column name (all eight columns, including "url").
# NOTE(review): the next cell reassigns races_selected_df from races_df without
# "url", so this result is overwritten - this cell looks like a leftover variant.
races_selected_df = races_df.select(
    "raceId",
    "year",
    "round",
    "circuitId",
    "name",
    "date",
    "time",
    "url",
)

In [0]:
from pyspark.sql.functions import col

# Keep every column except "url", selecting through Column objects.
required_columns = ["raceId", "year", "round", "circuitId", "name", "date", "time"]
races_selected_df = races_df.select(*[col(column_name) for column_name in required_columns])

In [0]:
# Preview the DataFrame after column selection.
display(races_selected_df)

### Step 3 - Rename the Columns as required

In [0]:
# Rename the camelCase source columns to snake_case; "year" becomes "race_year".
races_renamed_df = (
    races_selected_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)
                

In [0]:
# Preview the DataFrame after the column renames.
display(races_renamed_df)

### Step 4 - Add ingestion date and race_timestamp to the dataframe

In [0]:

from pyspark.sql.functions import lit, to_timestamp, concat,current_timestamp

In [0]:
# Build "<date> <time>" as text, then parse it into a single timestamp column.
race_datetime_text = concat(col("date"), lit(" "), col("time"))

races_final_df = (
    races_renamed_df
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"))
    # Tag every row with the widget-supplied data source and file date.
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)


In [0]:
# add_ingestion_date is not defined in this notebook - presumably it comes from
# the included common_functions notebook (verify). It is expected to add the
# "ingestion_date" column that the final select below relies on.
races_final_df = add_ingestion_date(races_final_df)

In [0]:
# Preview the DataFrame with the timestamp, audit and ingestion-date columns.
display(races_final_df)

In [0]:
# Columns (and their order) that are written to the processed table.
# NOTE(review): "data_source" and "file_date" were added earlier in this notebook
# but are not selected here, so they never reach the table - confirm this is
# intentional before adding them (doing so changes the table schema).
final_columns = [
    "race_id",
    "race_year",
    "round",
    "circuit_id",
    "name",
    "ingestion_date",
    "race_timestamp",
]
races_final_df = races_final_df.select(*[col(column_name) for column_name in final_columns])

### Check the mounted storage before writing to the data lake

In [0]:
# Show the storage mounts currently available to the workspace (sanity check only).
display(dbutils.fs.mounts())

### Step 5 - Write data to the data lake as a Delta table

In [0]:
# Disabled earlier variant: wrote parquet files directly to the processed folder path.
#races_final_df.write.mode("overwrite").partitionBy("race_year").parquet(f"{processed_folder_path}/races")

In [0]:
# Disabled earlier variant: saved f1_processed.races as a parquet table.
#races_final_df.write.mode("overwrite").format("parquet").partitionBy("race_year").saveAsTable("f1_processed.races")

In [0]:
# Overwrite the f1_processed.races Delta table, partitioned by race_year.
(
    races_final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("race_year")
    .saveAsTable("f1_processed.races")
)

In [0]:
# List the files behind the races output. This notebook writes races, so the
# races folder is listed (not circuits), and the path is built from
# processed_folder_path instead of a hard-coded mount point.
# Assumes f1_processed.races is stored at <processed_folder_path>/races - TODO confirm.
display(dbutils.fs.ls(f"{processed_folder_path}/races"))

In [0]:
# Read the data back through the same table name it was saved under, so this
# verification does not depend on where the f1_processed database stores its files.
df = spark.read.table("f1_processed.races")

In [0]:
# Preview the data read back from the processed layer to verify the write.
display(df)

In [0]:
# End the notebook run, handing "Success" back as its exit value.
dbutils.notebook.exit("Success")