###Ingest circuits.csv file

In [0]:
dbutils.widgets.text('p_data_source','')
v_data_source = dbutils.widgets.get('p_data_source')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#####Step 1 - Read the CSV file using the spark dataframe reader

In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DoubleType

In [0]:
circuits_schema=StructType(fields=[StructField('circuitid',IntegerType(),False),
                                    StructField('circuitRef',StringType(),True),
                                    StructField('name',StringType(),True),
                                    StructField('location',StringType(),True),
                                    StructField('country',StringType(),True),
                                    StructField('lat',DoubleType(),True),
                                    StructField('lng',DoubleType(),True),
                                    StructField('alt',IntegerType(),True),
                                    StructField('url',StringType(),True)]
                            )

In [0]:
circuits_df = spark.read.csv(f"{raw_folder_path}/circuits.csv",header=True,schema=circuits_schema)


In [0]:
circuits_df.describe()

Out[20]: DataFrame[summary: string, circuitid: string, circuitRef: string, name: string, location: string, country: string, lat: string, lng: string, alt: string, url: string]

In [0]:
circuits_df.show(10)

+---------+--------------+--------------------+------------+---------+--------+--------+---+--------------------+
|circuitid|    circuitRef|                name|    location|  country|     lat|     lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+--------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497| 144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083| 101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325| 50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57| 2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|  29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 7.42056|

#####Step 2 -Select only the required columns

In [0]:
from pyspark.sql.functions import lit,col

In [0]:
circuits_selected = circuits_df.select(col('circuitid'),col('circuitRef'),
                                       col('name'),col('location'),
                                       col('country'),col('lat'),
                                       col('lng'),col('alt'))

In [0]:
circuits_selected.show(10)

+---------+--------------+--------------------+------------+---------+--------+--------+---+
|circuitid|    circuitRef|                name|    location|  country|     lat|     lng|alt|
+---------+--------------+--------------------+------------+---------+--------+--------+---+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497| 144.968| 10|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083| 101.738| 18|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325| 50.5106|  7|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57| 2.26111|109|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|  29.405|130|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 7.42056|  7|
|        7|    villeneuve|Circuit Gilles Vi...|    Montreal|   Canada|    45.5|-73.5228| 13|
|        8|   magny_cours|Circuit de Nevers...| Magny Cours|   France|

#####Step 3 - Rename the columns as required

In [0]:
circuits_renamed_df = circuits_selected.withColumnRenamed('circuitId','circuit_id') \
    .withColumnRenamed('circuitRef','circuit_ref') \
    .withColumnRenamed('lat','latitude') \
    .withColumnRenamed('lng','longitude') \
    .withColumnRenamed('alt','altitude') \
    .withColumn('data_source',lit(v_data_source))

In [0]:
circuits_renamed_df.show(10)

+----------+--------------+--------------------+------------+---------+--------+---------+--------+-----------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|data_source|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+-----------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10| Ergast API|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18| Ergast API|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7| Ergast API|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109| Ergast API|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130| Ergast API|
|         6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347|  7.42056|       7| Erga

#####Step 4 - Add ingestion date to the dataframe

In [0]:
circuits_final_df = add_ingestion_date(circuits_renamed_df)

In [0]:
circuits_final_df.show(10)

+----------+--------------+--------------------+------------+---------+--------+---------+--------+-----------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|data_source|      ingestion date|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+-----------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10| Ergast API|2023-09-30 17:55:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18| Ergast API|2023-09-30 17:55:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7| Ergast API|2023-09-30 17:55:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109| Ergast API|2023-09-30 17:55:...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turke

##### Step 4 - Write the output to processed container in parquet file

In [0]:
circuits_final_df.write.mode('overwrite').parquet(f"{processed_folder_path}/circuits")

In [0]:
dbutils.notebook.exit('Success')