In [7]:
import os

# Base directory for the data paths built in later cells: the working
# directory of the process running this notebook at the moment this cell runs.
cwd: str = os.getcwd()

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat, to_timestamp, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# Obtain the Spark session for this notebook under the application name
# 'f1Practice'. getOrCreate() returns an already-active session when one
# exists, so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('f1Practice')
    .getOrCreate()
)


In [12]:
# Explicit schema used when reading the races CSV, so Spark does not have to
# infer column types. Every column is declared nullable.
# 'date' and 'time' are kept as plain strings here; a later cell combines
# them into a single timestamp column.
schema = (
    StructType()
    .add('raceId', IntegerType(), True)
    .add('year', IntegerType(), True)
    .add('round', IntegerType(), True)
    .add('circuitId', IntegerType(), True)
    .add('name', StringType(), True)
    .add('date', StringType(), True)
    .add('time', StringType(), True)
    .add('url', StringType(), True)
)

In [30]:
# Input location: the raw races CSV in the 'bronze' folder under `cwd`.
# os.path.join is used instead of `cwd + '/bronze/...'` so that a working
# directory of '/' does not yield '//bronze/races.csv' (a leading '//' can be
# interpreted as something other than a local path), and so the separator is
# chosen by the platform rather than hard-coded.
races_path = os.path.join(cwd, 'bronze', 'races.csv')

# The first line of the file is treated as a header row; column names and
# types come from the explicit `schema` rather than from inference.
df = (
    spark.read
    .option('header', True)
    .schema(schema)
    .csv(races_path)
)

In [36]:
# Combine the separate 'date' and 'time' string columns into a single
# 'race_timestamp' column, then stamp each row with the time of this load
# in 'ingestion_date'.
# NOTE(review): rows whose date/time text does not match 'yyyy-MM-dd HH:mm:ss'
# (or whose 'time' is missing) will not get a usable timestamp -- confirm how
# missing times are encoded in the source file.
race_datetime_text = concat(col('date'), lit(' '), col('time'))

df_race_timestamp = (
    df
    .withColumn('race_timestamp', to_timestamp(race_datetime_text, 'yyyy-MM-dd HH:mm:ss'))
    .withColumn('ingestion_date', current_timestamp())
)

In [37]:
# Keep only the columns wanted downstream and switch the camelCase source
# names to snake_case. The original 'date', 'time' and 'url' columns are
# dropped by not being selected.
selected_df = (
    df_race_timestamp
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('year', 'race_year')
    .withColumnRenamed('circuitId', 'circuit_id')
    .select(
        'race_id',
        'race_year',
        'round',
        'circuit_id',
        'name',
        'race_timestamp',
        'ingestion_date',
    )
)

In [41]:
# Output location: the 'silver/races' folder under `cwd`.
# os.path.join is used instead of `cwd + '/silver/races'` so that a working
# directory of '/' does not yield a path starting with '//', and so the
# separator is chosen by the platform rather than hard-coded.
silver_path = os.path.join(cwd, 'silver', 'races')

# Write the result as Parquet. 'overwrite' replaces any existing output at the
# target, so this cell can be re-run.
# Optional: insert .partitionBy('<column_name>') before .parquet(...) to save
# the data grouped by the values of that column.
selected_df.write.mode('overwrite').parquet(silver_path)