In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
from config_file import raw_path, processed_path
from common_func import add_timestamp

In [2]:
# Obtain the Spark session for this notebook: getOrCreate() reuses an existing
# session if there is one, otherwise it starts a new one named 'ETL'.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)
# Keep the session as the cell's last expression so the notebook renders it.
spark

## Qualifying Data Ingestion

In [3]:
# Load the raw qualifying JSON. multiLine=True allows a single JSON document to
# span several lines instead of requiring one record per line.
# The path is joined with '/' rather than a backslash: "\q" is not a valid
# Python escape sequence (SyntaxWarning on Python 3.12+), and a backslash
# separator only resolves correctly on Windows.
qualifying_df = spark.read.json(f"{raw_path}/qualifying", multiLine=True)
# Preview the loaded rows (show() prints the first 20 by default).
qualifying_df.show()

+-------------+--------+------+--------+--------+--------+--------+---------+------+
|constructorId|driverId|number|position|      q1|      q2|      q3|qualifyId|raceId|
+-------------+--------+------+--------+--------+--------+--------+---------+------+
|            1|       1|    22|       1|1:26.572|1:25.187|1:26.714|        1|    18|
|            2|       9|     4|       2|1:26.103|1:25.315|1:26.869|        2|    18|
|            1|       5|    23|       3|1:25.664|1:25.452|1:27.079|        3|    18|
|            6|      13|     2|       4|1:25.994|1:25.691|1:27.178|        4|    18|
|            2|       2|     3|       5|1:25.960|1:25.518|1:27.236|        5|    18|
|            7|      15|    11|       6|1:26.427|1:26.101|1:28.527|        6|    18|
|            3|       3|     7|       7|1:26.295|1:26.059|1:28.687|        7|    18|
|            9|      14|     9|       8|1:26.381|1:26.063|1:29.041|        8|    18|
|            7|      10|    12|       9|1:26.919|1:26.164|1:29.59

In [4]:
# Print the schema Spark inferred from the JSON (no explicit schema was passed
# to the reader) so the column types can be checked.
qualifying_df.printSchema()

root
 |-- constructorId: long (nullable = true)
 |-- driverId: long (nullable = true)
 |-- number: long (nullable = true)
 |-- position: long (nullable = true)
 |-- q1: string (nullable = true)
 |-- q2: string (nullable = true)
 |-- q3: string (nullable = true)
 |-- qualifyId: long (nullable = true)
 |-- raceId: long (nullable = true)



In [5]:
# Source column name -> name used in the processed output.
# Only the four ID columns are renamed; every other column keeps its name.
id_column_renames = {
    'constructorId': 'constructor_Id',
    'driverId': 'driver_Id',
    'qualifyId': 'qualify_Id',
    'raceId': 'race_Id',
}

# Apply the renames one at a time, in the order listed above.
for source_name, target_name in id_column_renames.items():
    qualifying_df = qualifying_df.withColumnRenamed(source_name, target_name)

In [6]:
# Pass the frame through the project helper `add_timestamp` (imported from common_func).
# NOTE(review): the helper's body is not visible here; the output of the next
# cell shows a new `ingestion_Date` column, presumably added by this call — confirm.
qualifying_df = add_timestamp(qualifying_df)

In [7]:
# Display the renamed frame after the add_timestamp call to check it before it is written out.
qualifying_df.show()

+--------------+---------+------+--------+--------+--------+--------+----------+-------+--------------------+
|constructor_Id|driver_Id|number|position|      q1|      q2|      q3|qualify_Id|race_Id|      ingestion_Date|
+--------------+---------+------+--------+--------+--------+--------+----------+-------+--------------------+
|             1|        1|    22|       1|1:26.572|1:25.187|1:26.714|         1|     18|2022-06-13 10:10:...|
|             2|        9|     4|       2|1:26.103|1:25.315|1:26.869|         2|     18|2022-06-13 10:10:...|
|             1|        5|    23|       3|1:25.664|1:25.452|1:27.079|         3|     18|2022-06-13 10:10:...|
|             6|       13|     2|       4|1:25.994|1:25.691|1:27.178|         4|     18|2022-06-13 10:10:...|
|             2|        2|     3|       5|1:25.960|1:25.518|1:27.236|         5|     18|2022-06-13 10:10:...|
|             7|       15|    11|       6|1:26.427|1:26.101|1:28.527|         6|     18|2022-06-13 10:10:...|
|         

In [8]:
# Persist the processed frame as Parquet; mode 'overwrite' replaces any output
# left at this location by a previous run.
# The path is joined with '/' rather than a backslash: "\Q" is not a valid
# Python escape sequence (SyntaxWarning on Python 3.12+), and a backslash
# separator only resolves correctly on Windows.
qualifying_df.write.mode('overwrite').parquet(f"{processed_path}/Qualifying")