# Read the Results JSON File Using the Spark DataFrame Reader

In [0]:
# Fetch the OAuth client credentials from the Databricks secret scope.
secret_scope = "formula1-scope"
client_id = dbutils.secrets.get(scope=secret_scope, key="client-id")
tenant_id = dbutils.secrets.get(scope=secret_scope, key="tenant-id")
client_secret = dbutils.secrets.get(scope=secret_scope, key="client-secret")

# Every setting is keyed by "<setting prefix>.<storage account host>".
account_host = "databrickspracticesa.dfs.core.windows.net"
oauth_settings = [
    ("fs.azure.account.auth.type", "OAuth"),
    (
        "fs.azure.account.oauth.provider.type",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    ("fs.azure.account.oauth2.client.id", client_id),
    ("fs.azure.account.oauth2.client.secret", client_secret),
    (
        "fs.azure.account.oauth2.client.endpoint",
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    ),
]

# Apply the settings to the Spark session in the same order as listed above.
for setting_prefix, setting_value in oauth_settings:
    spark.conf.set(f"{setting_prefix}.{account_host}", setting_value)

In [0]:
# List the contents of the raw container root.
raw_container_root = "abfss://raw@databrickspracticesa.dfs.core.windows.net"
display(dbutils.fs.ls(raw_container_root))

In [0]:
# Construct the schema for the results file
from pyspark.sql.types import StructType,StructField,IntegerType,StringType


# Explicit schema applied when reading results.json, so column types are
# declared here rather than inferred from the data.
# NOTE(review): "points" is declared as IntegerType -- confirm the source data
# never contains fractional values, otherwise a floating-point type is needed.
results_schema = StructType(fields=[
    StructField("resultId", IntegerType(), False),
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), False),
    StructField("constructorId", IntegerType(), True),
    StructField("number", IntegerType(), True),
    StructField("grid", IntegerType(), True),
    StructField("laps", IntegerType(), True),
    StructField("points", IntegerType(), True),
    StructField("positionOrder", IntegerType(), True),
    StructField("statusId", IntegerType(), True),
    StructField("rank", IntegerType(), True),
    StructField("fastestLap", StringType(), True),
    StructField("fastestLapSpeed", StringType(), True),
    StructField("fastestLapTime", StringType(), True),
    StructField("milliseconds", StringType(), True),
    StructField("positionText", StringType(), True),
    StructField("time", StringType(), True),
    StructField("position", StringType(), True),
])

# Read the results JSON data from the raw container using the schema above
results_df = spark.read.schema(results_schema).json(
    "abfss://raw@databrickspracticesa.dfs.core.windows.net/results.json"
)
# printSchema is a method: without the call parentheses the statement only
# references the bound method and prints nothing.
results_df.printSchema()



In [0]:
# Print the schema of results_df to verify the column names and types.
results_df.printSchema()


In [0]:
from pyspark.sql.functions import lit;
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import concat
from pyspark.sql.functions import current_timestamp

In [0]:
# Column projection plan, in output order: (source column, new name).
# A new name of None keeps the source column name unchanged.
column_plan = [
    ("resultId", "result_id"),
    ("raceId", "race_id"),
    ("driverId", "driver_id"),
    ("constructorId", "constructor_id"),
    ("number", None),
    ("grid", None),
    ("laps", None),
    ("points", None),
    ("positionOrder", "position_order"),
    ("statusId", "status_id"),
    ("fastestLap", "fastest_lap"),
    ("fastestLapSpeed", "fastest_lap_speed"),
    ("rank", None),
    ("fastestLapTime", "fastest_lap_time"),
    ("milliseconds", "milli_seconds"),
    ("positionText", "position_text"),
    ("time", None),
    ("position", None),
]

projected_columns = [
    results_df[source_name]
    if new_name is None
    else results_df[source_name].alias(new_name)
    for source_name, new_name in column_plan
]

# Append a constant "env" column and the ingestion timestamp after the
# projected source columns.
results_select_df = results_df.select(
    *projected_columns,
    lit("env").alias("env"),
    current_timestamp().alias("ingestion_date"),
)

# Preview the first 10 rows of the projected DataFrame.
display(results_select_df.limit(10))




In [0]:
# Remove the status_id column from the selected columns.
dropped_column = "status_id"
results_final_df = results_select_df.drop(dropped_column)


In [0]:
# Preview the first 10 rows of the final DataFrame.
final_preview_df = results_final_df.limit(10)
display(final_preview_df)

# Write data to parquet

In [0]:
# Write the final DataFrame as Parquet, partitioned by race_id.
# "overwrite" mode replaces any data already present at the target path.
processed_results_path = "abfss://processed@databrickspracticesa.dfs.core.windows.net/results"
partitioned_writer = results_final_df.write.mode("overwrite").partitionBy("race_id")
partitioned_writer.parquet(processed_results_path)

In [0]:
# Read the processed results back and display the rows for a single race
from pyspark.sql.functions import col

processed_results_df = spark.read.parquet(
    "abfss://processed@databrickspracticesa.dfs.core.windows.net/results"
)
# race_id originates from an IntegerType field, so compare it with an integer
# literal instead of relying on an implicit cast of the string '800'.
display(processed_results_df.filter(col("race_id") == 800))
