In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp, to_timestamp, concat
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType, TimestampType
from config_file import raw_path, processed_path
from common_func import add_timestamp

In [2]:
# Start (or reuse) the Spark session for this notebook, registered under the app name 'ETL'.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)
# Echo the session object so the notebook renders its summary.
spark

## Race Data Ingestion

In [3]:
# First pass: load races.csv with a header row and no explicit schema, then preview the rows.
race_df = (
    spark.read
    .option("header", True)
    .csv(f"{raw_path}/races.csv")
)
race_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|2009-06-07|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|2009-06-21|12:00:00|http://en.

In [4]:
# Print the schema of the frame that was read without an explicit schema;
# the captured output shows every column typed as string.

race_df.printSchema()

root
 |-- raceId: string (nullable = true)
 |-- year: string (nullable = true)
 |-- round: string (nullable = true)
 |-- circuitId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- url: string (nullable = true)



In [5]:
# Explicit schema for races.csv: numeric ids/year/round as integers, `date` as a date,
# everything else as strings. All fields are nullable.

Schema = (
    StructType()
    .add('raceId', IntegerType(), True)
    .add('year', IntegerType(), True)
    .add('round', IntegerType(), True)
    .add('circuitId', IntegerType(), True)
    .add('name', StringType(), True)
    .add('date', DateType(), True)
    .add('time', StringType(), True)
    .add('url', StringType(), True)
)

In [6]:
# Reload races.csv, this time applying the explicit `Schema` so columns get proper types.
# The location is built from the configured `raw_path` (the same source used for the first,
# schema-less read) instead of a machine-specific absolute path, so the notebook is portable.
race_df = spark.read.csv(f"{raw_path}/races.csv", header=True, schema=Schema)
race_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|2009-06-07|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|2009-06-21|12:00:00|http://en.

In [7]:
# Re-check the schema after reloading the file with the explicit `Schema`.
race_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- circuitId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- url: string (nullable = true)



In [8]:
# Combine the `date` and `time` columns (joined by a single space) into one
# `race_Timestamp` column, parsed with the standard four-letter-year pattern.
race_df = race_df.withColumn(
    'race_Timestamp',
    to_timestamp(
        concat(col('date'), lit(' '), col('time')),
        'yyyy-MM-dd HH:mm:ss',
    ),
)

In [9]:
# Pass the frame through the project helper `add_timestamp` (imported from common_func).
# NOTE(review): presumably this appends the `ingestion_Date` column that the following
# select relies on — confirm against common_func.
race_df = add_timestamp(race_df)

In [10]:
# Keep only the columns needed downstream, renaming the id/year columns on the way;
# `date`, `time` and `url` are dropped here.
race_df = race_df.select(
    col('raceId').alias('race_Id'),
    col('year').alias('race_Year'),
    'round',
    col('circuitId').alias('circuit_Id'),
    'name',
    'race_Timestamp',
    'ingestion_Date',
)
race_df.show()

+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|race_Id|race_Year|round|circuit_Id|                name|     race_Timestamp|      ingestion_Date|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29 06:00:00|2022-06-13 10:54:...|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05 09:00:00|2022-06-13 10:54:...|
|      3|     2009|    3|        17|  Chinese Grand Prix|2009-04-19 07:00:00|2022-06-13 10:54:...|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2009-04-26 12:00:00|2022-06-13 10:54:...|
|      5|     2009|    5|         4|  Spanish Grand Prix|2009-05-10 12:00:00|2022-06-13 10:54:...|
|      6|     2009|    6|         6|   Monaco Grand Prix|2009-05-24 12:00:00|2022-06-13 10:54:...|
|      7|     2009|    7|         5|  Turkish Grand Prix|2009-06-07 12:00:00|2022-06-13 10:54:...|
|      8| 

In [11]:
# Write the processed races as parquet, partitioned by `race_Year`, replacing any earlier output.
# A forward slash is used as the separator: "\R" is an invalid escape sequence in a Python
# string literal, and a backslash separator only works on Windows.
race_df.write.mode('overwrite').partitionBy('race_Year').parquet(f"{processed_path}/Race")