### Ingest circuits.csv file

In [0]:
# Declare a text widget named 'p_data_source' (the second argument, '', is
# presumably its default value — confirm against the dbutils.widgets docs).
dbutils.widgets.text('p_data_source','')
# Read the widget's current value; it is attached to every row as the
# `data_source` column in the rename step further down.
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DoubleType

In [0]:
# Explicit schema for circuits.csv, declared as (column name, Spark type,
# nullable) triples and expanded into StructFields. Only `circuitid` is
# declared non-nullable.
circuits_schema = StructType(fields=[
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ('circuitid', IntegerType(), False),
        ('circuitRef', StringType(), True),
        ('name', StringType(), True),
        ('location', StringType(), True),
        ('country', StringType(), True),
        ('lat', DoubleType(), True),
        ('lng', DoubleType(), True),
        ('alt', IntegerType(), True),
        ('url', StringType(), True),
    )
])

In [0]:
# Load circuits.csv from the raw folder with the explicit schema; the first
# line of the file is treated as a header row.
circuits_df = (
    spark.read
    .schema(circuits_schema)
    .option('header', True)
    .csv(f"{raw_folder_path}/circuits.csv")
)


In [0]:
# Sanity check on the loaded data. NOTE(review): describe() only returns a new
# DataFrame of summary statistics — as the recorded cell output shows, just the
# DataFrame repr is rendered here, not the statistics themselves. Use
# .show() / display(...) on the result if the numbers are actually wanted.
circuits_df.describe()

Out[7]: DataFrame[summary: string, circuitid: string, circuitRef: string, name: string, location: string, country: string, lat: string, lng: string, alt: string, url: string]

##### Step 2 - Select only the required columns

In [0]:
from pyspark.sql.functions import lit,col

In [0]:
# Keep every column declared in the schema except `url`.
circuits_selected = circuits_df.select(
    *(
        col(column_name)
        for column_name in (
            'circuitid', 'circuitRef', 'name', 'location',
            'country', 'lat', 'lng', 'alt',
        )
    )
)

##### Step 3 - Rename the columns as required and add the data source column

In [0]:
# Rename the source columns to snake_case / descriptive names and tag every
# row with the data source passed in through the `p_data_source` widget.
#
# The id column is referenced as 'circuitid', exactly as it is spelled in the
# schema and the select above. withColumnRenamed is a silent no-op when the
# given name does not match an existing column, so the previous 'circuitId'
# spelling only worked while Spark's column resolution was case-insensitive
# (spark.sql.caseSensitive=false, the default) and would have left the column
# un-renamed otherwise.
circuits_renamed_df = (
    circuits_selected
    .withColumnRenamed('circuitid', 'circuit_id')
    .withColumnRenamed('circuitRef', 'circuit_ref')
    .withColumnRenamed('lat', 'latitude')
    .withColumnRenamed('lng', 'longitude')
    .withColumnRenamed('alt', 'altitude')
    .withColumn('data_source', lit(v_data_source))
)

##### Step 4 - Add ingestion date to the DataFrame

In [0]:
# add_ingestion_date is not defined in this notebook — presumably it comes from
# the "../includes/common_functions" notebook pulled in via %run and returns the
# DataFrame with an ingestion-date column added (TODO confirm against the helper).
circuits_final_df = add_ingestion_date(circuits_renamed_df)

##### Step 5 - Write the output to the processed container as a parquet file

In [0]:
# Persist the final DataFrame as parquet under the processed folder, replacing
# any output already present at that path.
circuits_final_df.write.parquet(
    f"{processed_folder_path}/circuits",
    mode='overwrite',
)

In [0]:
# End the notebook run, handing the string 'Success' back as its exit value
# (presumably read by a calling/orchestrating notebook — confirm with the caller).
dbutils.notebook.exit('Success')