In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp, to_timestamp, concat
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
from config_file import raw_path, processed_path
from common_func import add_timestamp

In [2]:
# Obtain the Spark session for this notebook (an already-running session is
# reused, otherwise a new one named 'ETL' is created) and display it.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)
spark

## Lap Times Data Ingestion

In [3]:
# Explicit schema for the header-less lap-times CSV. Fields use Spark's default
# positional names (_c0 .. _c5); every field is a nullable integer except _c4,
# which is a nullable string.
Schema = StructType([
    StructField(f'_c{position}', field_type, True)
    for position, field_type in enumerate((
        IntegerType(),
        IntegerType(),
        IntegerType(),
        IntegerType(),
        StringType(),
        IntegerType(),
    ))
])

In [4]:
# Load the raw lap-times CSV files with the explicit schema.
# The schema must be attached with DataFrameReader.schema(); passing the string
# 'Schema' through .option('schema', ...) never applied the StructType, so every
# column was loaded as a string instead of the declared integer/string types.
# The backslash in the path is escaped ("\\l") because "\l" is an invalid
# escape sequence; the resulting path string is unchanged.
lap_time_df = spark.read.schema(Schema).csv(f"{raw_path}\\lap_times")
lap_time_df.show()

+---+---+---+---+--------+------+
|_c0|_c1|_c2|_c3|     _c4|   _c5|
+---+---+---+---+--------+------+
|841| 20|  1|  1|1:38.109| 98109|
|841| 20|  2|  1|1:33.006| 93006|
|841| 20|  3|  1|1:32.713| 92713|
|841| 20|  4|  1|1:32.803| 92803|
|841| 20|  5|  1|1:32.342| 92342|
|841| 20|  6|  1|1:32.605| 92605|
|841| 20|  7|  1|1:32.502| 92502|
|841| 20|  8|  1|1:32.537| 92537|
|841| 20|  9|  1|1:33.240| 93240|
|841| 20| 10|  1|1:32.572| 92572|
|841| 20| 11|  1|1:32.669| 92669|
|841| 20| 12|  1|1:32.902| 92902|
|841| 20| 13|  1|1:33.698| 93698|
|841| 20| 14|  3|1:52.075|112075|
|841| 20| 15|  4|1:38.385| 98385|
|841| 20| 16|  2|1:31.548| 91548|
|841| 20| 17|  1|1:30.800| 90800|
|841| 20| 18|  1|1:31.810| 91810|
|841| 20| 19|  1|1:31.018| 91018|
|841| 20| 20|  1|1:31.055| 91055|
+---+---+---+---+--------+------+
only showing top 20 rows



In [5]:
# Swap Spark's default positional column names for descriptive ones, applying
# the renames one at a time in positional order.
for default_name, descriptive_name in (
    ('_c0', 'race_Id'),
    ('_c1', 'driver_Id'),
    ('_c2', 'lap'),
    ('_c3', 'position'),
    ('_c4', 'time'),
    ('_c5', 'milliseconds'),
):
    lap_time_df = lap_time_df.withColumnRenamed(default_name, descriptive_name)

In [6]:
# Pass the frame through the shared add_timestamp helper imported from
# common_func. The preview in the following cell shows an extra
# `ingestion_Date` column after this step; the helper's implementation is not
# visible in this notebook.
lap_time_df = add_timestamp(lap_time_df)

In [7]:
# Preview the first rows to check the renamed columns and the ingestion_Date column.
lap_time_df.show()

+-------+---------+---+--------+--------+------------+--------------------+
|race_Id|driver_Id|lap|position|    time|milliseconds|      ingestion_Date|
+-------+---------+---+--------+--------+------------+--------------------+
|    841|       20|  1|       1|1:38.109|       98109|2022-06-13 10:40:...|
|    841|       20|  2|       1|1:33.006|       93006|2022-06-13 10:40:...|
|    841|       20|  3|       1|1:32.713|       92713|2022-06-13 10:40:...|
|    841|       20|  4|       1|1:32.803|       92803|2022-06-13 10:40:...|
|    841|       20|  5|       1|1:32.342|       92342|2022-06-13 10:40:...|
|    841|       20|  6|       1|1:32.605|       92605|2022-06-13 10:40:...|
|    841|       20|  7|       1|1:32.502|       92502|2022-06-13 10:40:...|
|    841|       20|  8|       1|1:32.537|       92537|2022-06-13 10:40:...|
|    841|       20|  9|       1|1:33.240|       93240|2022-06-13 10:40:...|
|    841|       20| 10|       1|1:32.572|       92572|2022-06-13 10:40:...|
|    841|   

In [8]:
# Persist the processed lap times as Parquet under processed_path; 'overwrite'
# mode replaces any output left by a previous run.
# The backslash is escaped ("\\L") because "\L" is an invalid escape sequence
# that newer Python versions warn about; the resulting path string is unchanged.
lap_time_df.write.mode('overwrite').parquet(f"{processed_path}\\Lap_Time")