In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
from config_file import raw_path, processed_path
from common_func import add_timestamp

In [2]:
# Create the Spark session for this ETL notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

## Circuits Data Ingestion

In [3]:
# Read the raw circuits CSV using the header row for column names. No schema is
# supplied here, so the columns come back as strings (see the printSchema output below).
# The backslash is escaped explicitly: "\c" is not a valid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on recent Python versions. The
# resulting path string is unchanged.
circuits_df = spark.read.csv(f"{raw_path}\\circuits.csv", header=True)
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

In [4]:
# Let's check the schema Spark assigned when no explicit schema was passed to the reader.
circuits_df.printSchema()

root
 |-- circuitId: string (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- alt: string (nullable = true)
 |-- url: string (nullable = true)



In [5]:
# Explicit schema for circuits.csv so numeric columns are typed instead of read as strings.
# Every field is declared nullable (third StructField argument is True).
Schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('circuitId', IntegerType()),
        ('circuitRef', StringType()),
        ('name', StringType()),
        ('location', StringType()),
        ('country', StringType()),
        ('lat', DoubleType()),
        ('lng', DoubleType()),
        ('alt', IntegerType()),
        ('url', StringType()),
    )
])

In [6]:
# Re-read the circuits CSV, this time applying the explicit Schema so that
# circuitId/alt load as integers and lat/lng as doubles.
# The path is built from the configured raw_path instead of a hardcoded absolute
# user directory, so the notebook runs on other machines.
# NOTE(review): assumes raw_path points at the same "Raw_Data\raw" folder as the
# previous hardcoded path — confirm against config_file.
circuits_df = spark.read.csv(f"{raw_path}\\circuits.csv", header=True, schema=Schema)
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

In [7]:
# Verify that the explicit schema was applied to the re-read DataFrame.
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



In [8]:
# Rename the id/ref columns and expand the abbreviated coordinate column names.
circuits_df = (
    circuits_df
    .withColumnRenamed('circuitId', 'circuit_Id')
    .withColumnRenamed('circuitRef', 'circuit_Ref')
    .withColumnRenamed('lat', 'latitude')
    .withColumnRenamed('lng', 'longitude')
    .withColumnRenamed('alt', 'altitude')
)

In [9]:
# Keep only the columns needed downstream; 'url' is intentionally left out of the selection.
circuits_df = circuits_df.select(
    'circuit_Id',
    'circuit_Ref',
    'name',
    'location',
    'country',
    'latitude',
    'longitude',
    'altitude',
)
circuits_df.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+
|circuit_Id|   circuit_Ref|                name|    location|  country|latitude|longitude|altitude|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|
|         6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347|  7.42056|       7|
|         7|    villeneuve|Circuit Gilles Vi...|    Montreal|   Canada|    45.5| -73.5228|      13|


In [10]:
# Apply the project helper from common_func.
# NOTE(review): judging by the output of the next cell it appends an
# 'ingestion_Date' timestamp column — confirm against common_func.add_timestamp.
circuits_df = add_timestamp(circuits_df)

In [11]:
# Inspect the DataFrame after the add_timestamp step.
circuits_df.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|circuit_Id|   circuit_Ref|                name|    location|  country|latitude|longitude|altitude|      ingestion_Date|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|2022-06-13 10:08:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|2022-06-13 10:08:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|2022-06-13 10:08:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|2022-06-13 10:08:...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|2022-06-13 10:08:...|
|         6|        monaco|   Ci

In [12]:
# Persist the processed circuits data as Parquet, replacing any previous output
# at the target location.
# The backslash is escaped explicitly: "\C" is not a valid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on recent Python versions. The
# resulting path string is unchanged.
circuits_df.write.mode('overwrite').parquet(f"{processed_path}\\Circuits")