In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Notebook parameter `p_data_source` (default: empty string). Its value is written
# onto every row as the `data_source` column further down.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Notebook parameter `p_file_date` (default: '2021-03-21'). It selects the dated
# subfolder under the raw path that is read below, and is also written onto every
# row as the `file_date` column.
dbutils.widgets.text('p_file_date', '2021-03-21')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Explicit schema for circuits.csv so column types are declared up front.
# Only circuitId is declared non-nullable; every other field allows nulls.
circuits_schema = StructType(
    [
        StructField("circuitId", IntegerType(), False),
        StructField("circuitRef", StringType(), True),
        StructField("name", StringType(), True),
        StructField("location", StringType(), True),
        StructField("country", StringType(), True),
        StructField("lat", DoubleType(), True),
        StructField("lng", DoubleType(), True),
        StructField("alt", IntegerType(), True),
        StructField("url", StringType(), True),
    ]
)

In [0]:
# Load the circuits CSV for the requested file date, treating the first row as a
# header and applying the explicit schema defined above.
circuits_df = (
    spark.read
    .option('header', True)
    .schema(circuits_schema)
    .csv(f'{raw_folder_path}/{v_file_date}/circuits.csv')
)

In [0]:
# Keep every column except `url`, which is not carried forward.
circuits_selected_df = circuits_df.select(
    col('circuitId'),
    col('circuitRef'),
    col('name'),
    col('location'),
    col('country'),
    col('lat'),
    col('lng'),
    col('alt'),
)

In [0]:
# Rename the source columns to snake_case / descriptive names and stamp each row
# with the data source and file date passed in through the notebook widgets.
#
# The rename sources must match the schema names exactly ('circuitId',
# 'circuitRef'): withColumnRenamed silently does nothing when the source column
# does not exist, so a misspelled name leaves the original column in place.
circuit_renamed_df = (
    circuits_selected_df
    .withColumnRenamed('circuitId', 'circuit_id')
    .withColumnRenamed('circuitRef', 'circuit_ref')
    .withColumnRenamed('lat', 'latitude')
    .withColumnRenamed('lng', 'longitude')
    .withColumnRenamed('alt', 'altitude')
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
)

In [0]:
# add_ingestion_date is not defined in this notebook — presumably it comes from the
# %run'd ../includes/common_functions notebook and appends an ingestion timestamp
# column; verify against that notebook.
circuits_final_df = add_ingestion_date(circuit_renamed_df)

In [0]:
# Persist the result as the parquet-backed table f1_processed.circuits,
# replacing any data already stored there.
(
    circuits_final_df.write
    .mode('overwrite')
    .format('parquet')
    .saveAsTable('f1_processed.circuits')
)

In [0]:
# Read the parquet files back from the processed folder as a sanity check.
# NOTE(review): this assumes the f1_processed database stores its tables under
# processed_folder_path, so that this path is where saveAsTable wrote — confirm.
df = spark.read.parquet(f'{processed_folder_path}/circuits')

In [0]:
# Render the data that was read back, for visual inspection.
display(df)

In [0]:
%sql
describe extended f1_processed.circuits

In [0]:
# End the notebook run and hand the string 'Success' back as its exit value,
# which a calling notebook or job can inspect.
dbutils.notebook.exit('Success')