In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DateType
from pyspark.sql.functions import col, lit, concat, current_timestamp
from pathlib import Path

# Obtain the Spark session for this notebook under the application name
# 'f1Practice' (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('f1Practice')
    .getOrCreate()
)

In [11]:
# Nested struct type for the `name` field: forename and surname, both nullable strings.
driver_name_type = StructType(fields=[
    StructField('forename', StringType(), True),
    StructField('surname', StringType(), True),
])

# Explicit schema applied when reading the drivers JSON; every field is declared nullable.
drivers_schema = StructType(fields=[
    StructField('driverId', IntegerType(), True),
    StructField('driverRef', StringType(), True),
    StructField('number', IntegerType(), True),
    StructField('code', StringType(), True),
    StructField('name', driver_name_type, True),
    StructField('dob', DateType(), True),
    StructField('nationality', StringType(), True),
    StructField('url', StringType(), True),
])

# Input file location, resolved against the current working directory.
drivers_bronze_path = Path.cwd().joinpath('bronze', 'drivers.json')

# Load the JSON with the declared schema rather than letting Spark infer one.
df = (
    spark.read
    .schema(drivers_schema)
    .json(str(drivers_bronze_path))
)

In [15]:
# Transform data: remove the url column, rename identifier columns to
# snake_case, stamp the ingestion time, and flatten the nested name struct.

drivers_without_url = df.drop(col('url'))

final_df = (
    drivers_without_url
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('driverRef', 'driver_ref')
    .withColumnRenamed('dob', 'birthday')
    .withColumn('ingestion_date', current_timestamp())
    # Replaces the `name` struct with a single "<forename> <surname>" string.
    .withColumn('name', concat(col('name.forename'), lit(' '), col('name.surname')))
)

In [17]:
# Output directory for the transformed drivers data, resolved against the
# current working directory.
drivers_silver_path = Path.cwd().joinpath('silver', 'drivers')

# Write the result as Parquet using the 'overwrite' save mode.
(
    final_df.write
    .mode('overwrite')
    .parquet(str(drivers_silver_path))
)

                                                                                