In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Notebook parameter `p_data_source`: declared as a text widget with an empty
# default, then read into `v_data_source`. The value is stamped onto every
# ingested row as the `data_source` column further down.
dbutils.widgets.text('p_data_source', '')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
# Notebook parameter `p_file_date` (default '2021-03-21'), read into
# `v_file_date`. It selects the dated sub-folder of the raw input path and is
# also written to every row as the `file_date` column.
dbutils.widgets.text('p_file_date', '2021-03-21')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Explicit read schema for the qualifying JSON files, so Spark does not have to
# infer types. Each entry is (field name, Spark type, nullable); the four id
# fields are declared non-nullable, everything else nullable.
_qualifying_fields = [
    ('qualifyId', IntegerType(), False),
    ('raceId', IntegerType(), False),
    ('driverId', IntegerType(), False),
    ('constructorId', IntegerType(), False),
    ('number', IntegerType(), True),
    ('position', IntegerType(), True),
    # q1/q2/q3 are kept as plain strings.
    ('q1', StringType(), True),
    ('q2', StringType(), True),
    ('q3', StringType(), True),
]

qualifying_schema = StructType(
    [
        StructField(field_name, field_type, is_nullable)
        for field_name, field_type, is_nullable in _qualifying_fields
    ]
)

In [0]:
# Read every multi-line JSON split file for the requested file date, applying
# the explicit `qualifying_schema`, then normalise the column names to
# snake_case and append audit columns.
#
# `raw_folder_path` is not defined in this notebook; presumably it comes from
# the ../includes/configuration notebook that is %run above.
qualify_df = (
    spark.read
    .schema(qualifying_schema)
    .option('multiline', True)
    .json(f'{raw_folder_path}/{v_file_date}/qualifying/qualifying_split*.json')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('raceId', 'race_id')
    # The schema names this field 'qualifyId'. The previous rename targeted a
    # non-existent 'qualifyingId' column, which withColumnRenamed silently
    # ignores, so the id was never renamed. The table load below uses
    # `SELECT *`, which maps columns by position, so it is unaffected by the
    # column name.
    .withColumnRenamed('qualifyId', 'qualifying_id')
    .withColumnRenamed('constructorId', 'constructor_id')
    # Audit columns: load timestamp plus the two notebook parameters.
    .withColumn('ingestion_date', current_timestamp())
    .withColumn('data_source', lit(v_data_source))
    .withColumn('file_date', lit(v_file_date))
)

In [0]:
# Pass the DataFrame through `re_arrange_partition_column` with 'race_id' as
# the partition column. The helper is not defined in this notebook (presumably
# it comes from ../includes/common_functions). Judging by its name it moves the
# partition column to the last position, which the positional
# `INSERT ... PARTITION (race_id) SELECT *` further down would rely on —
# TODO confirm against the helper's implementation.
final_df = re_arrange_partition_column(qualify_df, 'race_id')

In [0]:
# Expose the prepared DataFrame to the %sql cells as the temp view
# `v_qualifying_final`; the INSERT OVERWRITE cell selects from it.
final_df.createOrReplaceTempView('v_qualifying_final')

In [0]:
%sql
-- Switch Hive dynamic partitioning to 'nonstrict' for this session.
-- The INSERT further down lists race_id in its PARTITION clause without a
-- static value; presumably this setting is what permits an all-dynamic
-- partition spec — confirm against the Spark/Hive docs for the runtime in use.

SET hive.exec.dynamic.partition.mode=nonstrict;

In [0]:
%sql
-- Create the processed qualifying table on first run (no-op when it already
-- exists). It is stored as Parquet and partitioned by race_id.
-- The DROP below is left commented out; un-comment it only to rebuild the
-- table from scratch.
---DROP Table IF EXISTS f1_processed.qualifying;
CREATE TABLE IF NOT EXISTS f1_processed.qualifying(
    -- NOTE(review): this is the only id column still in camelCase; the others
    -- are snake_case. Renaming it would change the table's schema for
    -- downstream readers, so it is only flagged here.
    qualifyId INT,
    race_id INT,
    driver_id INT,
    constructor_id INT,
    number INT,
    position INT,
    q1 STRING,
    q2 STRING,
    q3 STRING,
    -- NOTE(review): the notebook fills ingestion_date with current_timestamp(),
    -- yet the column is declared STRING, so the value is stored as text.
    -- Confirm whether TIMESTAMP was intended.
    ingestion_date STRING,
    data_source STRING,
    file_date STRING
)
PARTITIONED BY (race_id)
STORED AS PARQUET
-- Disabled CTAS variant of this statement, kept for reference:
--AS
--SELECT * FROM v_qualifying_final;

In [0]:
%sql
-- Load the prepared rows from the temp view into the processed table.
-- race_id is a dynamic partition: no static value is given, so it is taken
-- from the data. `SELECT *` maps columns by position, not by name, so the
-- view's column order must match the table's, with the partition column last.
-- NOTE(review): which existing partitions INSERT OVERWRITE replaces depends on
-- the runtime's partition-overwrite settings — confirm it only replaces the
-- race_id partitions present in this load.

INSERT OVERWRITE f1_processed.qualifying 
PARTITION (race_id)
SELECT * FROM v_qualifying_final

In [0]:
# End the notebook run and hand the string 'Success' back to whatever invoked
# it. NOTE(review): the %sql cell that follows sits after this exit, so it is
# presumably skipped on Run All / job runs and only executed by hand —
# confirm that is intended.
dbutils.notebook.exit('Success')

In [0]:
%sql
-- Ad-hoc check: number of rows in the processed table per file_date.
select file_date, count(*)
from f1_processed.qualifying 
GROUP BY file_date