# Ingest Races CSV File
1. Read Data
2. Transform Data
3. Write Data

In [None]:
# Import Modules
from pyspark.sql.functions import col, lit, concat, current_timestamp, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

In [None]:
%run "../01-Setup/09-Global-Variables"

In [None]:
%run "../01-Setup/10-Global-Functions"

### Read Data

In [None]:
# Explicit schema for races.csv, built one column at a time.
# The StructType describes a row; each .add() call appends one column
# (name, data type, nullable flag) in file order.
races_schema = (
    StructType()
    .add('raceId', IntegerType(), nullable=False)
    .add('year', IntegerType(), nullable=True)
    .add('round', IntegerType(), nullable=True)
    .add('circuitId', IntegerType(), nullable=True)
    .add('name', StringType(), nullable=True)
    .add('date', DateType(), nullable=True)
    .add('time', StringType(), nullable=True)
    .add('url', StringType(), nullable=True)
)

# Load races.csv from the raw folder, treating the first line as a header
# and enforcing the explicit schema instead of inferring types.
# NOTE(review): the recorded read-back output further down shows a literal
# `\N` in the `time` column for at least one row -- confirm whether that
# placeholder should be kept as a string.
races_df = spark.read.csv(
    f'{raw_folder_path}/races.csv',
    schema=races_schema,
    header=True,
)

# Render the loaded rows in the notebook
display(races_df)

raceId,year,round,circuitId,name,date,time,url
1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_Grand_Prix
2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Grand_Prix
3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Grand_Prix
4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Grand_Prix
5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Grand_Prix
6,2009,6,6,Monaco Grand Prix,2009-05-24,12:00:00,http://en.wikipedia.org/wiki/2009_Monaco_Grand_Prix
7,2009,7,5,Turkish Grand Prix,2009-06-07,12:00:00,http://en.wikipedia.org/wiki/2009_Turkish_Grand_Prix
8,2009,8,9,British Grand Prix,2009-06-21,12:00:00,http://en.wikipedia.org/wiki/2009_British_Grand_Prix
9,2009,9,20,German Grand Prix,2009-07-12,12:00:00,http://en.wikipedia.org/wiki/2009_German_Grand_Prix
10,2009,10,11,Hungarian Grand Prix,2009-07-26,12:00:00,http://en.wikipedia.org/wiki/2009_Hungarian_Grand_Prix


In [None]:
# Print Schema
# Prints each column's name, type and nullability so the applied schema can be checked.
# NOTE(review): the recorded output reports `raceId` as nullable = true even though
# races_schema declares it with nullable False -- presumably the CSV read relaxes
# nullability; confirm before relying on a non-null guarantee.
races_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- circuitId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- url: string (nullable = true)



In [None]:
# Describe Data
# Prints a text table of summary statistics (count, mean, stddev, min, max) per column.
# The recorded output has no `date` column in the summary.
races_df.describe().show()

+-------+------------------+-----------------+------------------+------------------+--------------------+--------+--------------------+
|summary|            raceId|             year|             round|         circuitId|                name|    time|                 url|
+-------+------------------+-----------------+------------------+------------------+--------------------+--------+--------------------+
|  count|              1058|             1058|              1058|              1058|                1058|    1058|                1058|
|   mean| 531.2315689981097|1990.780718336484| 8.382797731568997|22.089792060491494|                null|    null|                null|
| stddev|308.16570918807656|19.73008802240494|5.0002806845260235|17.154605278616593|                null|    null|                null|
|    min|                 1|             1950|                 1|                 1|70th Anniversary ...|03:00:00|http://en.wikiped...|
|    max|              1073|             2021|  

### Transform Data

In [None]:
# Keep every column except `url`, referring to columns by their string names
races_selected_df = races_df.select(
    ['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time']
)

# Render the narrowed DataFrame in the notebook
display(races_selected_df)

raceId,year,round,circuitId,name,date,time
1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00
2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00
3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00
4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00
5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00
6,2009,6,6,Monaco Grand Prix,2009-05-24,12:00:00
7,2009,7,5,Turkish Grand Prix,2009-06-07,12:00:00
8,2009,8,9,British Grand Prix,2009-06-21,12:00:00
9,2009,9,20,German Grand Prix,2009-07-12,12:00:00
10,2009,10,11,Hungarian Grand Prix,2009-07-26,12:00:00


In [None]:
# Switch the camelCase source names to snake_case with withColumnRenamed;
# `year` becomes `race_year`. Columns not listed keep their names.
races_renamed_df = (
    races_selected_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('year', 'race_year')
    .withColumnRenamed('circuitId', 'circuit_id')
)

# Render the renamed DataFrame in the notebook
display(races_renamed_df)

race_id,race_year,round,circuit_id,name,date,time
1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00
2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00
3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00
4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00
5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00
6,2009,6,6,Monaco Grand Prix,2009-05-24,12:00:00
7,2009,7,5,Turkish Grand Prix,2009-06-07,12:00:00
8,2009,8,9,British Grand Prix,2009-06-21,12:00:00
9,2009,9,20,German Grand Prix,2009-07-12,12:00:00
10,2009,10,11,Hungarian Grand Prix,2009-07-26,12:00:00


In [None]:
# Derive `race_timestamp` by joining `date` and `time` with a single space
# and parsing the result with the 'yyyy-MM-dd HH:mm:ss' pattern.
# The recorded read-back output shows an empty race_timestamp for a row
# whose `time` is `\N`.
races_timestamp_df = races_renamed_df.withColumn(
    'race_timestamp',
    to_timestamp(
        concat(col('date'), lit(' '), col('time')),
        'yyyy-MM-dd HH:mm:ss',
    ),
)

# Stamp the rows using add_ingestion_date, which is not defined in this
# notebook (presumably loaded by the Global-Functions %run -- confirm);
# the recorded output shows the result carries an `ingestion_date` column.
races_final_df = add_ingestion_date(races_timestamp_df)

# Render the final DataFrame in the notebook
display(races_final_df)

race_id,race_year,round,circuit_id,name,date,time,race_timestamp,ingestion_date
1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,2009-03-29T06:00:00.000+0000,2023-07-12T02:52:18.512+0000
2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,2009-04-05T09:00:00.000+0000,2023-07-12T02:52:18.512+0000
3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,2009-04-19T07:00:00.000+0000,2023-07-12T02:52:18.512+0000
4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,2009-04-26T12:00:00.000+0000,2023-07-12T02:52:18.512+0000
5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,2009-05-10T12:00:00.000+0000,2023-07-12T02:52:18.512+0000
6,2009,6,6,Monaco Grand Prix,2009-05-24,12:00:00,2009-05-24T12:00:00.000+0000,2023-07-12T02:52:18.512+0000
7,2009,7,5,Turkish Grand Prix,2009-06-07,12:00:00,2009-06-07T12:00:00.000+0000,2023-07-12T02:52:18.512+0000
8,2009,8,9,British Grand Prix,2009-06-21,12:00:00,2009-06-21T12:00:00.000+0000,2023-07-12T02:52:18.512+0000
9,2009,9,20,German Grand Prix,2009-07-12,12:00:00,2009-07-12T12:00:00.000+0000,2023-07-12T02:52:18.512+0000
10,2009,10,11,Hungarian Grand Prix,2009-07-26,12:00:00,2009-07-26T12:00:00.000+0000,2023-07-12T02:52:18.512+0000


### Write Data

In [None]:
# Persist the final DataFrame as Parquet under the processed folder.
# Existing output at that path is overwritten, and the data is laid out
# in one partition directory per `race_year` value.
races_final_df.write.parquet(
    f'{processed_folder_path}/races',
    mode='overwrite',
    partitionBy='race_year',
)

In [None]:
# Display File System Contents
# %fs

# ls "abfss://processed@dbcourselakehouse.dfs.core.windows.net/races"

In [None]:
# Read the Parquet output back to verify the write.
# The path is built from processed_folder_path, the same expression the write
# cell uses, so this check always inspects the location that was just written
# rather than a hard-coded storage URL that can drift from the configuration.
df = spark.read.parquet(f'{processed_folder_path}/races')

# Render the read-back rows in the notebook.
# In the recorded output the partition column `race_year` appears as the last column.
display(df)

race_id,round,circuit_id,name,date,time,race_timestamp,ingestion_date,race_year
1053,2,21,Emilia Romagna Grand Prix,2021-04-18,13:00:00,2021-04-18T13:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1052,1,3,Bahrain Grand Prix,2021-03-28,15:00:00,2021-03-28T15:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1051,21,1,Australian Grand Prix,2021-11-21,06:00:00,2021-11-21T06:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1054,3,20,TBC,2021-05-02,\N,,2023-07-12T02:52:18.782+0000,2021
1055,4,4,Spanish Grand Prix,2021-05-09,13:00:00,2021-05-09T13:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1056,5,6,Monaco Grand Prix,2021-05-23,13:00:00,2021-05-23T13:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1057,6,73,Azerbaijan Grand Prix,2021-06-06,12:00:00,2021-06-06T12:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1058,7,7,Canadian Grand Prix,2021-06-13,18:00:00,2021-06-13T18:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1059,8,34,French Grand Prix,2021-06-27,13:00:00,2021-06-27T13:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021
1060,9,70,Austrian Grand Prix,2021-07-04,13:00:00,2021-07-04T13:00:00.000+0000,2023-07-12T02:52:18.782+0000,2021


In [None]:
# Notebook Exit Output
# Ends the notebook run and hands the string "Races Successful" back as its exit value.
dbutils.notebook.exit("Races Successful")