# Ingest the circuits.csv file from the source folder

In [0]:
# Fetch the client id, tenant id and client secret from the Databricks
# secret scope "formula1-scope" (looked up in that order).
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope="formula1-scope", key=secret_key)
    for secret_key in ("client-id", "tenant-id", "client-secret")
)

In [0]:
# Register the OAuth client-credentials settings for the
# databrickspracticesa storage account on the Spark session, one key at a time.
for _conf_key, _conf_value in (
    ("fs.azure.account.auth.type.databrickspracticesa.dfs.core.windows.net", "OAuth"),
    (
        "fs.azure.account.oauth.provider.type.databrickspracticesa.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    ("fs.azure.account.oauth2.client.id.databrickspracticesa.dfs.core.windows.net", client_id),
    ("fs.azure.account.oauth2.client.secret.databrickspracticesa.dfs.core.windows.net", client_secret),
    (
        "fs.azure.account.oauth2.client.endpoint.databrickspracticesa.dfs.core.windows.net",
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    ),
):
    spark.conf.set(_conf_key, _conf_value)

In [0]:
# List the contents of the "raw" container to confirm the storage account is reachable.
raw_container_listing = dbutils.fs.ls("abfss://raw@databrickspracticesa.dfs.core.windows.net")
display(raw_container_listing)

In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DoubleType


In [0]:
# Explicit schema for circuits.csv: one StructField per column, in file order.
# Only circuitId is declared non-nullable.
circuit_schema = StructType(
    [
        StructField("circuitId", IntegerType(), nullable=False),
        StructField("circuitRef", StringType(), nullable=True),
        StructField("name", StringType(), nullable=True),
        StructField("location", StringType(), nullable=True),
        StructField("country", StringType(), nullable=True),
        StructField("lat", DoubleType(), nullable=True),
        StructField("lng", DoubleType(), nullable=True),
        StructField("alt", IntegerType(), nullable=True),
        StructField("url", StringType(), nullable=True),
    ]
)

In [0]:

# Read circuits.csv from the raw container, treating the first line as a
# header and applying the explicit schema instead of inferring one.
circuit_df = spark.read.csv(
    path="abfss://raw@databrickspracticesa.dfs.core.windows.net/circuits.csv",
    schema=circuit_schema,
    header=True,
)



In [0]:
# Render the raw circuits data as a table.
circuit_df.display()
# describe() only builds a lazy summary DataFrame; without an action nothing
# is computed or shown, so materialise it with show().
circuit_df.describe().show()
# Print column names, types and nullability.
circuit_df.printSchema()

In [0]:
# A bare expression is only echoed when it is the last statement of a cell,
# so print the type explicitly to make it visible.
print(type(circuit_df))
# Print the first rows as plain text.
circuit_df.show()

# Select only the required columns


In [0]:
# Variant 1: pick columns by their string names (everything except "url").
circuit_select_df = circuit_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)
circuit_select_df.show()

In [0]:
# Variant 2: pick the same columns using attribute access on the DataFrame.
circuit_select_df = circuit_df.select(
    circuit_df.circuitId,
    circuit_df.circuitRef,
    circuit_df.name,
    circuit_df.location,
    circuit_df.country,
    circuit_df.lat,
    circuit_df.lng,
    circuit_df.alt,
)
circuit_select_df.show()

In [0]:
# Render the selected columns as a notebook table.
circuit_select_df.display()

In [0]:
# Variant 3: pick the same columns using item access on the DataFrame.
circuit_select_df = circuit_df.select(
    circuit_df["circuitId"],
    circuit_df["circuitRef"],
    circuit_df["name"],
    circuit_df["location"],
    circuit_df["country"],
    circuit_df["lat"],
    circuit_df["lng"],
    circuit_df["alt"],
)
# Show the result both as plain text and as a notebook table.
circuit_select_df.show()
circuit_select_df.display()

In [0]:
from pyspark.sql.functions import col;
from pyspark.sql.functions import current_timestamp;

In [0]:
# Variant 4: pick the columns with col(). This is the selection that feeds the
# rest of the notebook, so keep the lat, lng, alt order used by the other
# select variants to give a consistent column layout.
circuit_select_df = circuit_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)
display(circuit_select_df)

# Rename circuitId, circuitRef, lat, lng and alt to more meaningful names

In [0]:
# Rename the camelCase and abbreviated source columns to snake_case,
# fully spelled-out names.
circuit_renamed_df = (
    circuit_select_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)
circuit_renamed_df.show()


In [0]:
# Render the renamed columns as a notebook table.
circuit_renamed_df.display()

# Add new columns: ingestion_date (current timestamp) and environment

In [0]:

# current_timestamp is a function that returns a Column expression; passing
# the bare function object to display() shows no timestamp. Evaluate it in a
# one-row DataFrame so the actual value is rendered.
display(spark.range(1).select(current_timestamp().alias("current_timestamp")))

In [0]:
from pyspark.sql.functions import lit;

In [0]:
# Append two audit columns: the ingestion timestamp and a constant
# environment tag.
circuit_final_df = (
    circuit_renamed_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("environment", lit("ITG"))
)
circuit_final_df.show()

# Write data to parquet

In [0]:
# Overwrite any previous output so the notebook can be re-run end to end;
# the default save mode raises an error when the target path already exists.
circuit_final_df.write.mode("overwrite").parquet("abfss://processed@databrickspracticesa.dfs.core.windows.net/circuits")

In [0]:
# Read the written parquet back (capped at 100 rows) to verify the output.
circuit_processed_df = (
    spark.read
    .parquet("abfss://processed@databrickspracticesa.dfs.core.windows.net/circuits")
    .limit(100)
)
circuit_processed_df.display()