# Ingest Races CSV File
1. Read CSV File
2. Specify Schema
3. Select Columns
4. Rename Columns
5. Add New Columns
6. Write to Data Lake (File System) and Partition

In [0]:
# Import Modules
from pyspark.sql.functions import col, lit, concat, current_timestamp, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

In [0]:
# Display DBFS Mounted Contents

# %fs
# ls /mnt/dbcourselakehouse/raw

### Read CSV

In [0]:
# Read CSV File Using Spark DataFrame Reader API
# races_df = spark.read.option('header', True).csv("abfss://raw@dbcourselakehouse.dfs.core.windows.net/races.csv")

In [0]:
# Display Data Using Show Method
# races_df.show(n = 10)

# Display Data in Table Format
# display(races_df)

### Specify Schema (Infer and Explicit)

In [0]:
# Infer Schema Option (Suitable for Dev/Small Data)
# races_df = spark.read \
#     .option('header', True) \
#     .option('inferSchema', True) \
#     .csv("abfss://raw@dbcourselakehouse.dfs.core.windows.net/races.csv")

# Display Data
# display(races_df)

In [0]:
# Explicit schema (best practice for production workloads).
# The StructType describes a row; every (name, type, nullable) triple below
# is turned into the StructField describing one column of that row.
races_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        ("raceId", IntegerType(), False),
        ("year", IntegerType(), True),
        ("round", IntegerType(), True),
        ("circuitId", IntegerType(), True),
        ("name", StringType(), True),
        ("date", DateType(), True),
        ("time", StringType(), True),
        ("url", StringType(), True),
    ]
])

# Read the races CSV with its first line treated as a header and the
# explicit schema applied (no schema inference pass over the data).
races_df = (
    spark.read
    .option('header', True)
    .schema(races_schema)
    .csv("abfss://raw@dbcourselakehouse.dfs.core.windows.net/races.csv")
)

# Display Data
# display(races_df)

In [0]:
# Print Schema
# races_df.printSchema()

In [0]:
# Describe Data
# races_df.describe().show()

### Select Required Columns

In [0]:
# Project the frame down to the required columns, referenced by plain
# column-name strings ('url' is the only source column left out).
races_selected_df = races_df.select(
    'raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time'
)

# Display Data
# display(races_selected_df)

In [0]:
# Select Columns Explicitly (Method 1)
# races_selected_df = races_df.select(
#     races_df.raceId,
#     races_df.year,
#     races_df.round,
#     races_df.circuitId,
#     races_df.name,
#     races_df.date,
#     races_df.time
# )

# Display Data
# display(races_selected_df)

In [0]:
# Select Columns Explicitly (Method 2)
# races_selected_df = races_df.select(
#     races_df['raceId'],
#     races_df['year'],
#     races_df['round'],
#     races_df['circuitId'],
#     races_df['name'],
#     races_df['date'],
#     races_df['time']
# )

# Display Data
# display(races_selected_df)

In [0]:
# Select Columns Using col Function
# Allows Column Aliasing
# races_selected_df = races_df.select(
#     col('raceId'),
#     col('year'),
#     col('round'),
#     col('circuitId'),
#     col('name'),
#     col('date'),
#     col('time')
# )

# Display Data
# display(races_selected_df)

### Rename Columns

In [0]:
# Rename Columns Using withColumnRenamed
# Starts from races_selected_df (the frame produced by the column-selection
# cell) and converts the camelCase / generic source names to snake_case:
#   raceId -> race_id, year -> race_year, circuitId -> circuit_id
# The remaining columns (round, name, date, time) keep their names.
races_renamed_df = races_selected_df \
    .withColumnRenamed('raceId', 'race_id') \
    .withColumnRenamed('year', 'race_year') \
    .withColumnRenamed('circuitId', 'circuit_id')

# Display Data
# display(races_renamed_df)

### Add New Columns

In [0]:
# Derive two columns on top of the renamed frame:
#   race_timestamp - 'date' and 'time' joined by a single space, then parsed
#                    with the pattern 'yyyy-MM-dd HH:mm:ss'
#   ingestion_date - value of current_timestamp() when the frame is evaluated
races_final_df = (
    races_renamed_df
    .withColumn(
        'race_timestamp',
        to_timestamp(
            concat(col('date'), lit(' '), col('time')),
            'yyyy-MM-dd HH:mm:ss',
        ),
    )
    .withColumn('ingestion_date', current_timestamp())
)

# Display Data
# display(races_final_df)

### Write DF to File System & Partition

In [0]:
# Persist the final frame to the processed container in Parquet format.
# 'overwrite' mode replaces whatever already exists at the target path, and
# partitionBy('race_year') lays the output out by race_year value.
(
    races_final_df.write
    .mode('overwrite')
    .partitionBy('race_year')
    .parquet("abfss://processed@dbcourselakehouse.dfs.core.windows.net/races")
)

In [0]:
# Display File System Contents
# %fs

# ls "abfss://processed@dbcourselakehouse.dfs.core.windows.net/races"

### Read Parquet File from File System

In [0]:
# Read File
# df = spark.read.parquet("abfss://processed@dbcourselakehouse.dfs.core.windows.net/races")

# Display Data
# display(df)