# Install PySpark

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 30 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 34.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845513 sha256=322a5049afa90f09597241d5ec3f42579049443865a93027f9ba37a38b1bdf67
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
import pyspark

sc = pyspark.SparkContext()
sql_sc = pyspark.SQLContext(sc)



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
spark = (pyspark.sql.SparkSession
         .builder
         .appName('FormulaOne')
         .getOrCreate()
)

# Import datasets 

In [5]:
!curl -O http://ergast.com/downloads/f1db_csv.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5836k  100 5836k    0     0  3081k      0  0:00:01  0:00:01 --:--:-- 3080k


In [6]:
!unzip /content/f1db_csv.zip -d /content/f1_datasets

Archive:  /content/f1db_csv.zip
  inflating: /content/f1_datasets/circuits.csv  
  inflating: /content/f1_datasets/constructor_results.csv  
  inflating: /content/f1_datasets/constructors.csv  
  inflating: /content/f1_datasets/constructor_standings.csv  
  inflating: /content/f1_datasets/drivers.csv  
  inflating: /content/f1_datasets/driver_standings.csv  
  inflating: /content/f1_datasets/lap_times.csv  
  inflating: /content/f1_datasets/pit_stops.csv  
  inflating: /content/f1_datasets/qualifying.csv  
  inflating: /content/f1_datasets/races.csv  
  inflating: /content/f1_datasets/results.csv  
  inflating: /content/f1_datasets/seasons.csv  
  inflating: /content/f1_datasets/sprint_results.csv  
  inflating: /content/f1_datasets/status.csv  


In [7]:
!ls -lh /content/f1_datasets

total 20M
-rw-rw-r-- 1 root root 9.8K Oct 31 08:51 circuits.csv
-rw-rw-r-- 1 root root 206K Oct 31 08:51 constructor_results.csv
-rw-rw-r-- 1 root root  17K Oct 31 08:51 constructors.csv
-rw-rw-r-- 1 root root 298K Oct 31 08:51 constructor_standings.csv
-rw-rw-r-- 1 root root  92K Oct 31 08:51 drivers.csv
-rw-rw-r-- 1 root root 837K Oct 31 08:51 driver_standings.csv
-rw-rw-r-- 1 root root  16M Oct 31 08:51 lap_times.csv
-rw-rw-r-- 1 root root 363K Oct 31 08:51 pit_stops.csv
-rw-rw-r-- 1 root root 408K Oct 31 08:51 qualifying.csv
-rw-rw-r-- 1 root root 151K Oct 31 08:51 races.csv
-rw-rw-r-- 1 root root 1.6M Oct 31 08:51 results.csv
-rw-rw-r-- 1 root root 4.4K Oct 31 08:51 seasons.csv
-rw-r--r-- 1 root root 6.8K Oct 31 08:51 sprint_results.csv
-rw-rw-r-- 1 root root 2.1K Oct 31 08:51 status.csv


# Import PySpark

In [37]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType, DateType
from pyspark.sql.functions import current_timestamp, lit, col, to_timestamp, concat

# Load .csv file, using InferSchema 

In [9]:
circuits_schema = StructType(fields=[StructField("circuitId", IntegerType(), False),
                                     StructField("circuitRef", StringType(), True),
                                     StructField("name", StringType(), True),
                                     StructField("location", StringType(), True),
                                     StructField("country", StringType(), True),
                                     StructField("lat", DoubleType(), True),
                                     StructField("lng", DoubleType(), True),
                                     StructField("alt", IntegerType(), True),
                                     StructField("url", StringType(), True)
])

circuits_df = (spark.read
               .schema(circuits_schema)
               .csv('/content/f1_datasets/circuits.csv', sep=',', header = True)
)

circuits_selected_df = circuits_df.select(col("circuitId"),col("circuitRef"),col("name"),col("location"),col("country"),\
                                          col("lat"),col("lng"),col("alt"))

circuits_renamed_df = circuits_selected_df.withColumnRenamed("circuitId","circuit_id")\
.withColumnRenamed("circuitRef","circuit_ref")\
.withColumnRenamed("lat","latitude")\
.withColumnRenamed("lng","longitude")\
.withColumnRenamed("alt","altitude")

circuits_final_df = circuits_renamed_df.withColumn("ingestion_date",current_timestamp())\
.withColumn("env",lit("Production"))

In [12]:
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



# Write data as parquet 

In [24]:
circuits_df.write.mode("overwrite").parquet("/content/f1_datasets/circuits")

In [22]:
!ls -lh /content/f1_datasets/circuits

total 12K
-rw-r--r-- 1 root root 8.9K Nov  8 09:29 part-00000-c393105c-d685-43f6-8e49-1036df38a041-c000.snappy.parquet
-rw-r--r-- 1 root root    0 Nov  8 09:29 _SUCCESS


In [25]:
df = spark.read.parquet("/content/f1_datasets/circuits")
df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

# Ingest race.csv

## All step

In [39]:
races_schema = StructType(fields=[StructField("raceId", IntegerType(), False),
                                     StructField("year", IntegerType(), True),
                                     StructField("round", IntegerType(), True),
                                     StructField("circuitId", StringType(), True),
                                     StructField("name", StringType(), True),
                                     StructField("date", DateType(), True),
                                     StructField("time", StringType(), True),
                                     StructField("url", StringType(), True)
])

races_df = (spark.read
               .schema(races_schema)
               .csv('/content/f1_datasets/races.csv', sep=',', header = True)
)

races_with_df = (races_df
                  .withColumn("race_timestamp", to_timestamp(concat(col("date"),lit(' '),col("time")),'yyyy-MM-dd HH:mm:ss'))
                  .withColumn("ingestion_date",current_timestamp())
)

races_selected_df = (races_with_df.select(col("raceId").alias("race_id"),col("year").alias("race_year"),
                                          col("round"),col("circuitId").alias("circuit_id"),col("name"),col("ingestion_date"),col("race_timestamp"))
)

races_selected_df.write.mode("overwrite").parquet("/content/f1_datasets/races")
races_parquet = spark.read.parquet("/content/f1_datasets/races")
races_parquet.show()

+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|race_id|race_year|round|circuit_id|                name|      ingestion_date|     race_timestamp|
+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|      1|     2009|    1|         1|Australian Grand ...|2022-11-08 10:13:...|2009-03-29 06:00:00|
|      2|     2009|    2|         2|Malaysian Grand Prix|2022-11-08 10:13:...|2009-04-05 09:00:00|
|      3|     2009|    3|        17|  Chinese Grand Prix|2022-11-08 10:13:...|2009-04-19 07:00:00|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2022-11-08 10:13:...|2009-04-26 12:00:00|
|      5|     2009|    5|         4|  Spanish Grand Prix|2022-11-08 10:13:...|2009-05-10 12:00:00|
|      6|     2009|    6|         6|   Monaco Grand Prix|2022-11-08 10:13:...|2009-05-24 12:00:00|
|      7|     2009|    7|         5|  Turkish Grand Prix|2022-11-08 10:13:...|2009-06-07 12:00:00|
|      8| 

## Add partition by 

In [41]:
races_selected_df.write.mode("overwrite").partitionBy('race_year').parquet("/content/f1_datasets/processed/races")
races_parquet = spark.read.parquet("/content/f1_datasets/processed/races")
races_parquet.show()

+-------+-----+----------+--------------------+--------------------+-------------------+---------+
|race_id|round|circuit_id|                name|      ingestion_date|     race_timestamp|race_year|
+-------+-----+----------+--------------------+--------------------+-------------------+---------+
|   1074|    1|         3|  Bahrain Grand Prix|2022-11-08 10:54:...|2022-03-20 15:00:00|     2022|
|   1075|    2|        77|Saudi Arabian Gra...|2022-11-08 10:54:...|2022-03-27 17:00:00|     2022|
|   1076|    3|         1|Australian Grand ...|2022-11-08 10:54:...|2022-04-10 05:00:00|     2022|
|   1077|    4|        21|Emilia Romagna Gr...|2022-11-08 10:54:...|2022-04-24 13:00:00|     2022|
|   1078|    5|        79|    Miami Grand Prix|2022-11-08 10:54:...|2022-05-08 19:30:00|     2022|
|   1079|    6|         4|  Spanish Grand Prix|2022-11-08 10:54:...|2022-05-22 13:00:00|     2022|
|   1080|    7|         6|   Monaco Grand Prix|2022-11-08 10:54:...|2022-05-29 13:00:00|     2022|
|   1081| 