# Ingest Constructors JSON File

1. Read Data
2. Transform Data
3. Write Data

In [None]:
# Import Modules
from pyspark.sql.functions import col, current_timestamp

In [None]:
%run "../01-Setup/09-Global-Variables"

In [None]:
%run "../01-Setup/10-Global-Functions"

### Read Data

In [None]:
# Declare the expected columns as a DDL string (a compact alternative to building a StructType)
constructors_schema = "constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING"

# Read the raw constructors JSON file with the declared schema applied
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .json(f'{raw_folder_path}/constructors.json')
)

# Render the loaded rows
display(constructors_df)

constructorId,constructorRef,name,nationality,url
1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering
4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formula_One
5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso
6,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari
7,toyota,Toyota,Japanese,http://en.wikipedia.org/wiki/Toyota_Racing
8,super_aguri,Super Aguri,Japanese,http://en.wikipedia.org/wiki/Super_Aguri_F1
9,red_bull,Red Bull,Austrian,http://en.wikipedia.org/wiki/Red_Bull_Racing
10,force_india,Force India,Indian,http://en.wikipedia.org/wiki/Racing_Point_Force_India


In [None]:
# Print the DataFrame's schema as a tree (column name, type, nullability)
# to confirm the DDL schema was applied on read
constructors_df.printSchema()

root
 |-- constructorId: integer (nullable = true)
 |-- constructorRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [None]:
# Show summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come back null for the string columns
constructors_df.describe().show()

+-------+------------------+--------------+--------+-----------+--------------------+
|summary|     constructorId|constructorRef|    name|nationality|                 url|
+-------+------------------+--------------+--------+-----------+--------------------+
|  count|               211|           211|     211|        211|                 211|
|   mean|107.03791469194313|          null|    null|       null|                null|
| stddev|61.653629124344434|          null|    null|       null|                null|
|    min|                 1|         adams|     AFM|   American|http://en.wikiped...|
|    max|               214|      zakspeed|Zakspeed|      Swiss|http://en.wikiped...|
+-------+------------------+--------------+--------+-----------+--------------------+



### Transform Data

In [None]:
# Three equivalent ways to drop a column, kept here as reference (commented out):
#
#   Method 1 - by column name:
#       display(constructors_df.drop('url'))
#
#   Method 2 - by a column reference taken from the DataFrame:
#       display(constructors_df.drop(constructors_df['url']))
#
#   Method 3 - by a column built with col():
#       display(constructors_df.drop(col('url')))

# ----------------------------------------
# Method 1 is the one applied. Note that this rebinds constructors_df, so the
# 'url' column is no longer available to any later cell.
constructors_df = constructors_df.drop('url')

# Display Data
display(constructors_df)

constructorId,constructorRef,name,nationality
1,mclaren,McLaren,British
2,bmw_sauber,BMW Sauber,German
3,williams,Williams,British
4,renault,Renault,French
5,toro_rosso,Toro Rosso,Italian
6,ferrari,Ferrari,Italian
7,toyota,Toyota,Japanese
8,super_aguri,Super Aguri,Japanese
9,red_bull,Red Bull,Austrian
10,force_india,Force India,Indian


In [None]:
# Rename the camelCase columns to snake_case. The chain is wrapped in
# parentheses so each call sits on its own line without backslashes.
constructors_renamed_df = (
    constructors_df
    .withColumnRenamed('constructorId', 'constructor_id')
    .withColumnRenamed('constructorRef', 'constructor_ref')
)

# Add the ingestion timestamp column via the shared add_ingestion_date helper
# (not defined in this notebook; presumably loaded by the Global-Functions %run above)
constructors_final_df = add_ingestion_date(constructors_renamed_df)

# Render the transformed rows
display(constructors_final_df)

constructorId,constructor_ref,name,nationality,ingestion_date
1,mclaren,McLaren,British,2023-07-12T02:55:07.606+0000
2,bmw_sauber,BMW Sauber,German,2023-07-12T02:55:07.606+0000
3,williams,Williams,British,2023-07-12T02:55:07.606+0000
4,renault,Renault,French,2023-07-12T02:55:07.606+0000
5,toro_rosso,Toro Rosso,Italian,2023-07-12T02:55:07.606+0000
6,ferrari,Ferrari,Italian,2023-07-12T02:55:07.606+0000
7,toyota,Toyota,Japanese,2023-07-12T02:55:07.606+0000
8,super_aguri,Super Aguri,Japanese,2023-07-12T02:55:07.606+0000
9,red_bull,Red Bull,Austrian,2023-07-12T02:55:07.606+0000
10,force_india,Force India,Indian,2023-07-12T02:55:07.606+0000


### Write Data

In [None]:
# Persist the final DataFrame as Parquet under the processed folder;
# 'overwrite' mode replaces whatever a previous run wrote there
(
    constructors_final_df.write
    .mode('overwrite')
    .parquet(f'{processed_folder_path}/constructors')
)

In [None]:
# Optional manual check (disabled): list the files written for the constructors dataset.
# To use it, uncomment the two lines below — presumably in a cell of their own,
# since %fs is a notebook magic (TODO confirm).
# %fs

# ls "abfss://processed@dbcourselakehouse.dfs.core.windows.net/constructors"

In [None]:
# Read back the Parquet output (test) to verify the write succeeded.
# The path is built from processed_folder_path — the same variable the write
# cell uses — rather than a hard-coded storage URL, so this check always reads
# the location that was just written, whichever environment the notebook runs in.
df = spark.read.parquet(f'{processed_folder_path}/constructors')

# Display Data
display(df)

constructorId,constructor_ref,name,nationality,ingestion_date
1,mclaren,McLaren,British,2023-07-12T02:55:07.791+0000
2,bmw_sauber,BMW Sauber,German,2023-07-12T02:55:07.791+0000
3,williams,Williams,British,2023-07-12T02:55:07.791+0000
4,renault,Renault,French,2023-07-12T02:55:07.791+0000
5,toro_rosso,Toro Rosso,Italian,2023-07-12T02:55:07.791+0000
6,ferrari,Ferrari,Italian,2023-07-12T02:55:07.791+0000
7,toyota,Toyota,Japanese,2023-07-12T02:55:07.791+0000
8,super_aguri,Super Aguri,Japanese,2023-07-12T02:55:07.791+0000
9,red_bull,Red Bull,Austrian,2023-07-12T02:55:07.791+0000
10,force_india,Force India,Indian,2023-07-12T02:55:07.791+0000


In [None]:
# Notebook Exit Output: end the notebook run, passing this status string as the exit value
dbutils.notebook.exit("Constructors Successful")