In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Notebook parameter 'p_data_source' (default: empty string). Its value is
# written verbatim into the 'data_source' column of the output table.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Notebook parameter 'p_file_date' (default: '2021-03-21'). It selects the
# sub-folder of the raw folder that races.csv is read from and is also
# stored in the 'file_date' column of the output table.
dbutils.widgets.text('p_file_date', '2021-03-21')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Explicit schema for races.csv, declared as (column name, Spark type, nullable)
# triples in file order. Only raceId and circuitId are declared non-nullable.
races_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ('raceId', IntegerType(), False),
        ('year', IntegerType(), True),
        ('round', IntegerType(), True),
        ('circuitId', IntegerType(), False),
        ('name', StringType(), True),
        ('date', DateType(), True),
        ('time', StringType(), True),
        ('url', StringType(), True),
    )
])

In [0]:
# Ingest races.csv for the requested file date and publish it as the
# f1_processed.races table (parquet, partitioned by race_year, overwritten
# on every run).
# NOTE(review): raw_folder_path is not defined in this notebook; presumably it
# comes from the ../includes/configuration notebook -- confirm.
(
    spark.read
    .option('header', True)
    .schema(races_schema)
    .csv(f'{raw_folder_path}/{v_file_date}/races.csv')
    # Keep the columns we need (url is dropped) and rename the camelCase
    # identifiers to snake_case.
    .select(
        col('raceId').alias('race_id'),
        col('year').alias('race_year'),
        col('round'),
        col('circuitId').alias('circuit_id'),
        col('name'),
        col('date'),
        col('time'),
    )
    # Merge the separate date and time columns into a single timestamp,
    # then discard the two source columns.
    .withColumn(
        'race_timestamp',
        to_timestamp(
            concat(col('date'), lit(' '), col('time')),
            'yyyy-MM-dd HH:mm:ss',
        ),
    )
    .drop('date', 'time')
    # Audit columns: load time plus the two notebook parameters.
    .withColumn('ingestion_date', current_timestamp())
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
    .write
    .mode('overwrite')
    .partitionBy('race_year')
    .format('parquet')
    .saveAsTable('f1_processed.races')
)

In [0]:
# Finish the notebook and hand the string 'Success' back as its exit value.
# NOTE(review): the %sql check in the following cell sits after this exit, so
# it presumably does not run when the notebook is executed end-to-end -- confirm
# that it is meant for manual inspection only.
dbutils.notebook.exit('Success')

In [0]:
%sql
-- Manual check: show the contents of the table written by the ingestion cell.
select * from f1_processed.races