# Ingest Circuits CSV File
1. Read Data
2. Transform Data
3. Write Data

In [None]:
# Import Modules
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [None]:
%run "../01-Setup/09-Global-Variables"

In [None]:
%run "../01-Setup/10-Global-Functions"

In [None]:
# print(raw_folder_path)
# print(processed_folder_path)
# print(curated_folder_path)

### Read Data

In [None]:
# Read CSV File Using Spark DataFrame Reader API
# circuits_df = spark.read.option('header', True).csv("abfss://raw@dbcourselakehouse.dfs.core.windows.net/circuits.csv")

In [None]:
# Display Data Using Show Method
# circuits_df.show(n = 10)

# Display Data in Table Format
# display(circuits_df)

In [None]:
# Infer Schema (Implicitly) (Suitable for Dev/Small Data)
# circuits_df = spark.read \
#     .option('header', True) \
#     .option('inferSchema', True) \
#     .csv('abfss://raw@dbcourselakehouse.dfs.core.windows.net/circuits.csv')

# Display Data
# display(circuits_df)


# ----------------------------------------
# Write Schema (Explicitly) (Best Practice for Production)
# One StructField per CSV column: column name, Spark data type, nullable flag
circuits_schema = StructType([
    StructField(name='circuitId', dataType=IntegerType(), nullable=False),
    StructField(name='circuitRef', dataType=StringType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
    StructField(name='location', dataType=StringType(), nullable=True),
    StructField(name='country', dataType=StringType(), nullable=True),
    StructField(name='lat', dataType=DoubleType(), nullable=True),
    StructField(name='lng', dataType=DoubleType(), nullable=True),
    StructField(name='alt', dataType=IntegerType(), nullable=True),
    StructField(name='url', dataType=StringType(), nullable=True),
])

# Apply Schema
# The first line of the file is treated as a header; column types come from
# circuits_schema instead of being inferred.
# NOTE(review): raw_folder_path is not defined in this notebook — presumably it
# is provided by the Global-Variables setup notebook; confirm there.
circuits_df = (
    spark.read
    .option('header', True)
    .schema(circuits_schema)
    .csv(f'{raw_folder_path}/circuits.csv')
)

# Display Data
display(circuits_df)

circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,http://en.wikipedia.org/wiki/Circuit_de_Monaco
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,http://en.wikipedia.org/wiki/Circuit_Gilles_Villeneuve
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,http://en.wikipedia.org/wiki/Circuit_de_Nevers_Magny-Cours
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,http://en.wikipedia.org/wiki/Silverstone_Circuit
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,http://en.wikipedia.org/wiki/Hockenheimring


In [None]:
# Print Schema
# Prints each column of circuits_df with its type and nullability as a tree.
# NOTE(review): the captured output reports circuitId as nullable = true even
# though circuits_schema declares it with nullable False — confirm whether the
# CSV read is expected to enforce that flag.
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



In [None]:
# Describe Data
# Shows summary statistics (count, mean, stddev, min, ...) for every column of circuits_df
circuits_df.describe().show()

+-------+------------------+----------+-------+---------+---------+------------------+-----------------+-----------------+--------------------+
|summary|         circuitId|circuitRef|   name| location|  country|               lat|              lng|              alt|                 url|
+-------+------------------+----------+-------+---------+---------+------------------+-----------------+-----------------+--------------------+
|  count|                77|        77|     77|       77|       77|                77|               77|               77|                  77|
|   mean|              39.0|      null|   null|     null|     null| 33.72035103896102|3.551302597402597|247.4935064935065|                null|
| stddev|22.371857321197094|      null|   null|     null|     null|22.885969000074535| 64.8766790440326|363.2672505910991|                null|
|    min|                 1|       BAK|A1-Ring|Abu Dhabi|Argentina|          -37.8497|         -118.189|               -7|http://en.wiki

### Transform Data

In [None]:
# Select Columns Implicitly (Method 1)
# Columns are referenced by plain string names; url is not selected.
# NOTE: circuits_selected_df is reassigned by the later methods in this cell.
circuits_selected_df = circuits_df.select([
    'circuitId', 'circuitRef', 'name', 'location',
    'country', 'lat', 'lng', 'alt',
])

# Display Data
# display(circuits_selected_df)


# ----------------------------------------
# Select Columns Explicitly (Method 2)
# Columns are referenced as attributes of circuits_df.
# NOTE: circuits_selected_df is reassigned by Method 4 later in this cell.
circuits_selected_df = circuits_df.select(
    circuits_df.circuitId,
    circuits_df.circuitRef,
    circuits_df.name,
    circuits_df.location,
    circuits_df.country,
    circuits_df.lat,
    circuits_df.lng,
    circuits_df.alt
)

# Display Data
# display(circuits_selected_df)


# ----------------------------------------
# Select Columns Explicitly (Method 3)
# circuits_selected_df = circuits_df.select(
#     circuits_df['circuitId'],
#     circuits_df['circuitRef'],
#     circuits_df['name'],
#     circuits_df['location'],
#     circuits_df['country'],
#     circuits_df['lat'],
#     circuits_df['lng'],
#     circuits_df['alt']
# )

# Display Data
# display(circuits_selected_df)


# ----------------------------------------
# Select Columns Using col Function (Method 4)
# col() builds Column objects, so a column can be aliased while it is selected.
# This is the assignment of circuits_selected_df that the following cells use.
circuits_selected_df = circuits_df.select([
    col('circuitId'), col('circuitRef'),
    col('name'), col('location'), col('country'),
    col('lat'),
    col('lng').alias('long'),  # the only column renamed at this step
    col('alt'),
])

# Display Data
display(circuits_selected_df)

circuitId,circuitRef,name,location,country,lat,long,alt
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103


In [None]:
# Rename Columns Using withColumnRenamed
# camelCase / abbreviated source names become snake_case, spelled-out names;
# 'long' is the alias given to 'lng' in the select step above.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed('circuitId', 'circuit_id')
    .withColumnRenamed('circuitRef', 'circuit_ref')
    .withColumnRenamed('lat', 'latitude')
    .withColumnRenamed('long', 'longitude')
    .withColumnRenamed('alt', 'altitude')
)

# Display Data
display(circuits_renamed_df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103


In [None]:
# Add String Literal Column w/ Static Value (lit Function)
# lit() turns the Python string into a Column, so every row gets env = 'prod'
circuits_env_df = circuits_renamed_df.withColumn('env', lit('prod'))

# Add Ingested Date Column w/ Current Timestamp
# NOTE(review): add_ingestion_date is not defined in this notebook — presumably it
# comes from the Global-Functions setup notebook and is equivalent to
# .withColumn('ingestion_date', current_timestamp()); confirm there.
circuits_final_df = add_ingestion_date(circuits_env_df)

# Display Data
display(circuits_final_df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,env,ingestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,prod,2023-07-12T02:46:58.793+0000
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,prod,2023-07-12T02:46:58.793+0000
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,prod,2023-07-12T02:46:58.793+0000
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,prod,2023-07-12T02:46:58.793+0000
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,prod,2023-07-12T02:46:58.793+0000
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,prod,2023-07-12T02:46:58.793+0000
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,prod,2023-07-12T02:46:58.793+0000
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,prod,2023-07-12T02:46:58.793+0000
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,prod,2023-07-12T02:46:58.793+0000
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,prod,2023-07-12T02:46:58.793+0000


### Write Data

In [None]:
# Write DataFrame to FileSystem in Parquet Format
# Save mode 'overwrite' is passed straight to parquet(); the target is the
# 'circuits' folder under processed_folder_path.
circuits_final_df.write.parquet(
    f'{processed_folder_path}/circuits',
    mode='overwrite',
)

In [None]:
# Display File System Contents
# %fs

# ls "abfss://processed@dbcourselakehouse.dfs.core.windows.net/circuits"

In [None]:
# Read File
# Read back from the same processed_folder_path location the write step used,
# so this check always inspects the data that was just written rather than a
# hard-coded storage account path that can drift from the configured one.
df = spark.read.parquet(f'{processed_folder_path}/circuits')

# Display Data
display(df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,env,ingestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,prod,2023-07-12T02:46:58.982+0000
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,prod,2023-07-12T02:46:58.982+0000
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,prod,2023-07-12T02:46:58.982+0000
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,prod,2023-07-12T02:46:58.982+0000
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,prod,2023-07-12T02:46:58.982+0000
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,prod,2023-07-12T02:46:58.982+0000
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,prod,2023-07-12T02:46:58.982+0000
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,prod,2023-07-12T02:46:58.982+0000
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,prod,2023-07-12T02:46:58.982+0000
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,prod,2023-07-12T02:46:58.982+0000


In [None]:
# Notebook Exit Output
# Exits the notebook with the status string below
dbutils.notebook.exit("Circuits Successful")