In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('circuits').getOrCreate()

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [3]:
circuits_schema = StructType([StructField("circuitId", IntegerType(), False),
                             StructField("circuitRef", StringType(), False),
                             StructField("name", StringType(), False),
                             StructField("location", StringType(), False),
                             StructField("country", StringType(), False),
                             StructField("lat", DoubleType(), False),
                             StructField("lng", DoubleType(), False),
                             StructField("alt", IntegerType(), False),
                             StructField("url", StringType(), False)])

In [4]:
circuits_df = spark.read.csv('../../raw/circuits.csv', schema=circuits_schema, header=True)

In [5]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

In [6]:
circuits_df = circuits_df.withColumnRenamed('circuitId', 'circuit_id') \
.withColumnRenamed('circuitRef', 'circuit_ref') \
.withColumnRenamed('lat', 'latitude') \
.withColumnRenamed('long', 'longitude') \
.withColumnRenamed('alt', 'altitude')

In [7]:
circuits_df = circuits_df.drop('url')

In [8]:
from pyspark.sql.functions import current_timestamp

In [9]:
circuits_df = circuits_df.withColumn('ingestion_date', current_timestamp())

In [10]:
circuits_df.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|      lng|altitude|      ingestion_date|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|2022-03-14 12:29:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|2022-03-14 12:29:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|2022-03-14 12:29:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|2022-03-14 12:29:...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|2022-03-14 12:29:...|
|         6|        monaco|   Ci

In [12]:
circuits_df.write.mode('overwrite').parquet('../../processed/circuits')

In [13]:
circuits_df2 = spark.read.parquet('../../processed/circuits/', header=True)

In [14]:
circuits_df2.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|      lng|altitude|      ingestion_date|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|2022-03-14 12:29:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|2022-03-14 12:29:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|2022-03-14 12:29:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|2022-03-14 12:29:...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|2022-03-14 12:29:...|
|         6|        monaco|   Ci