In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StringType, IntegerType, StructType, StructField, FloatType
from pathlib import Path

# Input and output locations, both resolved against the current working
# directory: the raw pit-stop JSON file and the directory the Parquet
# output is written to.
pitstops_bronze_path = Path.cwd().joinpath('bronze', 'pit_stops.json')
pitstops_silver_path = Path.cwd().joinpath('silver', 'pitstops')

# Obtain the Spark session for this notebook (an existing session is reused
# if one is already active).
spark = (
    SparkSession.builder
    .appName('f1Practice')
    .getOrCreate()
)

In [18]:
# Explicit schema for the pit-stop records, built from (column name, Spark
# type) pairs. Every column is declared nullable.
# NOTE(review): with an explicit schema, values that cannot be parsed as the
# declared type are presumably loaded as null under Spark's default
# permissive mode -- confirm 'duration' is always numeric in the source file.
pitstops_schema = StructType(fields=[
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('driverId', IntegerType()),
        ('duration', FloatType()),
        ('lap', IntegerType()),
        ('milliseconds', IntegerType()),
        ('raceId', IntegerType()),
        ('stop', IntegerType()),
        ('time', StringType()),
    )
])

# Load the JSON file with the schema above. The multiline option lets the
# reader accept records that span more than one line of the file.
df = (
    spark.read
    .schema(pitstops_schema)
    .option('multiline', True)
    .json(str(pitstops_bronze_path))
)


In [21]:
# Rename the camelCase id columns to snake_case and append a column holding
# the timestamp at which the data was processed.
final_df = (
    df
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('raceId', 'race_id')
    .withColumn('ingestion_date', current_timestamp())
)

In [22]:
# Persist the transformed pit-stop data as Parquet, replacing any output
# already present at the target path.
(
    final_df.write
    .mode('overwrite')
    .parquet(str(pitstops_silver_path))
)

                                                                                