# Read Lap Times CSV Files into a Spark DataFrame

In [0]:
# Fetch the service-principal credentials from the Databricks secret scope so
# nothing sensitive is hard-coded in the notebook.
_secret_scope = "formula1-scope"
client_id = dbutils.secrets.get(scope=_secret_scope, key="client-id")
tenant_id = dbutils.secrets.get(scope=_secret_scope, key="tenant-id")
client_secret = dbutils.secrets.get(scope=_secret_scope, key="client-secret")

# Session-level OAuth (client credentials) settings for the storage account.
# Each key is suffixed with the account host name when applied below.
_storage_host = "databrickspracticesa.dfs.core.windows.net"
_oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for _setting, _value in _oauth_settings.items():
    spark.conf.set(f"{_setting}.{_storage_host}", _value)

In [0]:
# List the contents of the raw lap_times folder before reading it.
display(
    dbutils.fs.ls(
        "abfss://raw@databrickspracticesa.dfs.core.windows.net/lap_times"
    )
)

In [0]:
# Construct the schema for the lap times files
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# NOTE(review): the CSV reader matches columns to an explicit schema by
# position, not by name. `position` typed as a string next to `time` looks
# unusual -- confirm this field order matches the column order in the files.
laptimes_schema = StructType(fields=[
    StructField("raceId", IntegerType(), True),
    StructField("driverId", IntegerType(), True),
    StructField("lap", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("position", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
])

# Read the whole lap_times folder as CSV using the explicit schema
# (no schema inference).
laptimes_df = (
    spark.read
    .schema(laptimes_schema)
    .csv("abfss://raw@databrickspracticesa.dfs.core.windows.net/lap_times")
)

# Preview a handful of rows to sanity-check the parse.
display(laptimes_df.limit(10))



In [0]:
# Print the column names and types of the DataFrame that was just read.
laptimes_df.printSchema()


In [0]:
from pyspark.sql.functions import lit;
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import concat
from pyspark.sql.functions import current_timestamp

In [0]:
# Output columns: rename the id and millisecond columns to snake_case, keep
# lap/time/position unchanged, and append a constant "env" tag plus an
# ingestion timestamp.
_output_columns = [
    laptimes_df["driverId"].alias("driver_id"),
    laptimes_df["raceId"].alias("race_id"),
    laptimes_df["lap"],
    laptimes_df["time"],
    laptimes_df["milliseconds"].alias("milli_seconds"),
    laptimes_df["position"],
    lit("production").alias("env"),
    current_timestamp().alias("ingestion_date"),
]
laptimes_select_df = laptimes_df.select(*_output_columns)

# Preview the transformed rows.
display(laptimes_select_df.limit(10))


# Write data to parquet

In [0]:
# Write the transformed lap times to the processed container as Parquet,
# replacing any existing output at that path (no partition columns are set).
(
    laptimes_select_df.write
    .mode("overwrite")
    .parquet("abfss://processed@databrickspracticesa.dfs.core.windows.net/lap_times")
)

In [0]:
# Read the Parquet output back and display it to verify the write.
display(
    spark.read.parquet(
        "abfss://processed@databrickspracticesa.dfs.core.windows.net/lap_times"
    )
)