###### `configure the file`

In [0]:
%run "playground/includes/configuration/"

In [0]:
%run "playground/includes/common_functions"

#### `Data Ingestion: Multiple Files lap_times.csv, qualifying.json`

In [0]:
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/mnt/databrickssa1/raw,abfss://raw@databrickssa1.dfs.core.windows.net/,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/databrickssa1/processed,abfss://processed@databrickssa1.dfs.core.windows.net/,
/,DatabricksRoot,


In [0]:
%fs
ls /mnt/databrickssa1/raw/

path,name,size,modificationTime
dbfs:/mnt/databrickssa1/raw/lap_times/.DS_Store,.DS_Store,6148,1652650462000
dbfs:/mnt/databrickssa1/raw/lap_times/lap_times_split_1.csv,lap_times_split_1.csv,3016498,1652650469000
dbfs:/mnt/databrickssa1/raw/lap_times/lap_times_split_2.csv,lap_times_split_2.csv,2959610,1652650469000
dbfs:/mnt/databrickssa1/raw/lap_times/lap_times_split_3.csv,lap_times_split_3.csv,2880491,1652650468000
dbfs:/mnt/databrickssa1/raw/lap_times/lap_times_split_4.csv,lap_times_split_4.csv,2882624,1652650469000
dbfs:/mnt/databrickssa1/raw/lap_times/lap_times_split_5.csv,lap_times_split_5.csv,2806321,1652650468000


In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

lap_times_schema = StructType(fields=[StructField("race_id", IntegerType(), False),
                                     StructField("driver_id", IntegerType(), True),
                                     StructField("lap", IntegerType(), True),
                                     StructField("position", IntegerType(), True),
                                     StructField("time", StringType(), True),
                                     StructField("milliseconds", IntegerType(), True)])

# specify the whole folder
# path = "dbfs:/mnt/databrickssa1/raw/lap_times/"
# or wild card option lap_times_split*.csv
path = f"{raw_folder_path}/{v_file_date}/lap_times/lap_times_split*.csv/"
lap_times_df = spark.read.csv(path=path, schema=lap_times_schema, header=True)

lap_times_df = lap_times_df.withColumn("file_date", lit(v_file_date))
lap_times_df = add_ingestion_date(lap_times_df)

lap_times_df.display()

race_id,driver_id,lap,position,time,milliseconds,ingestion_date
841,20,2,1,1:33.006,93006,2022-05-30T06:04:45.919+0000
841,20,3,1,1:32.713,92713,2022-05-30T06:04:45.919+0000
841,20,4,1,1:32.803,92803,2022-05-30T06:04:45.919+0000
841,20,5,1,1:32.342,92342,2022-05-30T06:04:45.919+0000
841,20,6,1,1:32.605,92605,2022-05-30T06:04:45.919+0000
841,20,7,1,1:32.502,92502,2022-05-30T06:04:45.919+0000
841,20,8,1,1:32.537,92537,2022-05-30T06:04:45.919+0000
841,20,9,1,1:33.240,93240,2022-05-30T06:04:45.919+0000
841,20,10,1,1:32.572,92572,2022-05-30T06:04:45.919+0000
841,20,11,1,1:32.669,92669,2022-05-30T06:04:45.919+0000


In [0]:
path = f"{processed_folder_path}/lap_times/"
lap_times_df.write.parquet(path=path, mode="overwrite")

##### `Multiple Files qualifying.json`

In [0]:
%fs
ls /mnt/databrickssa1/raw/

path,name,size,modificationTime
dbfs:/mnt/databrickssa1/raw/qualifying/.DS_Store,.DS_Store,6148,1652650536000
dbfs:/mnt/databrickssa1/raw/qualifying/qualifying_split_1.json,qualifying_split_1.json,948426,1652650537000
dbfs:/mnt/databrickssa1/raw/qualifying/qualifying_split_2.json,qualifying_split_2.json,718351,1652650537000


In [0]:
qualifying_schema = StructType(fields=[StructField("qualifyId", IntegerType(), False),
                                      StructField("raceId", IntegerType(), False),
                                      StructField("driverId", IntegerType(), False),
                                      StructField("constructorId", IntegerType(), False),
                                      StructField("number", IntegerType(), True),
                                      StructField("position", IntegerType(), True),
                                      StructField("q1", StringType(), True),
                                      StructField("q2", StringType(), True),
                                      StructField("q3", StringType(), True)
                                      ])

path = f"{raw_folder_path}/{v_file_date}/qualifying/qualifying_split_*.json/"
qualifying_df = spark.read.json(path=path, schema=qualifying_schema, multiLine=True)
qualifying_df.display()

qualifyId,raceId,driverId,constructorId,number,position,q1,q2,q3
1,18,1,1,22,1,1:26.572,1:25.187,1:26.714
2,18,9,2,4,2,1:26.103,1:25.315,1:26.869
3,18,5,1,23,3,1:25.664,1:25.452,1:27.079
4,18,13,6,2,4,1:25.994,1:25.691,1:27.178
5,18,2,2,3,5,1:25.960,1:25.518,1:27.236
6,18,15,7,11,6,1:26.427,1:26.101,1:28.527
7,18,3,3,7,7,1:26.295,1:26.059,1:28.687
8,18,14,9,9,8,1:26.381,1:26.063,1:29.041
9,18,10,7,12,9,1:26.919,1:26.164,1:29.593
10,18,20,5,15,10,1:26.702,1:25.842,\N


In [0]:
qualifying_final_df = (qualifying_df\
                      .withColumnRenamed("qualifyId", "qualify_id")\
                      .withColumnRenamed("raceId", "race_id")\
                      .withColumnRenamed("driverId", "driver_id")\
                      .withColumnRenamed("constructorId", "constructor_id"))\
                      .withColumn("file_date", lit(v_file_date))

qualifying_final_df = add_ingestion_date(qualifying_final_df)

qualifying_final_df.display()

qualify_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,ingestion_date
1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,2022-05-30T06:04:53.249+0000
2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,2022-05-30T06:04:53.249+0000
3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,2022-05-30T06:04:53.249+0000
4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,2022-05-30T06:04:53.249+0000
5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,2022-05-30T06:04:53.249+0000
6,18,15,7,11,6,1:26.427,1:26.101,1:28.527,2022-05-30T06:04:53.249+0000
7,18,3,3,7,7,1:26.295,1:26.059,1:28.687,2022-05-30T06:04:53.249+0000
8,18,14,9,9,8,1:26.381,1:26.063,1:29.041,2022-05-30T06:04:53.249+0000
9,18,10,7,12,9,1:26.919,1:26.164,1:29.593,2022-05-30T06:04:53.249+0000
10,18,20,5,15,10,1:26.702,1:25.842,\N,2022-05-30T06:04:53.249+0000


In [0]:
path = f"{processed_folder_path}/qualifying/"
qualifying_final_df.write.parquet(path=path, mode="overwrite")

In [0]:
df = spark.read.parquet(f"{processed_folder_path}/qualifying/")
df.display()

qualify_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,ingestion_date
1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,2022-05-30T06:04:53.865+0000
2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,2022-05-30T06:04:53.865+0000
3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,2022-05-30T06:04:53.865+0000
4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,2022-05-30T06:04:53.865+0000
5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,2022-05-30T06:04:53.865+0000
6,18,15,7,11,6,1:26.427,1:26.101,1:28.527,2022-05-30T06:04:53.865+0000
7,18,3,3,7,7,1:26.295,1:26.059,1:28.687,2022-05-30T06:04:53.865+0000
8,18,14,9,9,8,1:26.381,1:26.063,1:29.041,2022-05-30T06:04:53.865+0000
9,18,10,7,12,9,1:26.919,1:26.164,1:29.593,2022-05-30T06:04:53.865+0000
10,18,20,5,15,10,1:26.702,1:25.842,\N,2022-05-30T06:04:53.865+0000


In [0]:
dbutils.notebook.exit("Data Ingestion - Multiple Files")

Data Ingestion - Multiple Files