In [1]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (reusing an active one if present),
# registered under the application name 'constructors'.
spark = (
    SparkSession.builder
    .appName('constructors')
    .getOrCreate()
)

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [7]:
# Explicit schema handed to the JSON reader: one integer id column followed by
# four string columns, every field declared with nullable=False.
constructors_schema = StructType(
    [StructField("constructorId", IntegerType(), nullable=False)]
    + [
        StructField(column_name, StringType(), nullable=False)
        for column_name in ("constructorRef", "name", "nationality", "url")
    ]
)

In [8]:
# Load the raw constructors JSON file using the explicit schema defined above
# rather than letting Spark infer column types.
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .json('../../raw/constructors.json')
)

In [10]:
# Preview the first five rows of the raw frame.
constructors_df.show(n=5)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|                 url|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|  Williams|    British|http://en.wikiped...|
|            4|       renault|   Renault|     French|http://en.wikiped...|
|            5|    toro_rosso|Toro Rosso|    Italian|http://en.wikiped...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 5 rows



In [11]:
from pyspark.sql.functions import current_timestamp

In [13]:
# Switch the camelCase id/ref columns to snake_case and stamp every row with
# an ingestion_date column taken from current_timestamp().
constructors_df = (
    constructors_df
    .withColumnRenamed('constructorId', 'constructor_id')
    .withColumnRenamed('constructorRef', 'constructor_ref')
    .withColumn('ingestion_date', current_timestamp())
)

In [14]:
# Preview the first five rows after the rename / ingestion_date step.
constructors_df.show(n=5)

+--------------+---------------+----------+-----------+--------------------+--------------------+
|constructor_id|constructor_ref|      name|nationality|                 url|      ingestion_date|
+--------------+---------------+----------+-----------+--------------------+--------------------+
|             1|        mclaren|   McLaren|    British|http://en.wikiped...|2022-03-14 14:44:...|
|             2|     bmw_sauber|BMW Sauber|     German|http://en.wikiped...|2022-03-14 14:44:...|
|             3|       williams|  Williams|    British|http://en.wikiped...|2022-03-14 14:44:...|
|             4|        renault|   Renault|     French|http://en.wikiped...|2022-03-14 14:44:...|
|             5|     toro_rosso|Toro Rosso|    Italian|http://en.wikiped...|2022-03-14 14:44:...|
+--------------+---------------+----------+-----------+--------------------+--------------------+
only showing top 5 rows



In [16]:
# Persist the processed frame as parquet, overwriting any previous output at
# the target path.
# NOTE(review): the directory is spelled 'constructos' (not 'constructors');
# it is kept as-is because readers of this path are not all visible here —
# confirm before renaming.
constructors_df.write.parquet(
    '../../processed/constructos',
    mode='overwrite',
)

In [17]:
# Read the processed parquet output back in as a sanity check.
# NOTE(review): both the variable and the directory are spelled 'constructos';
# presumably 'constructors' was intended — confirm before renaming.
constructos = (
    spark.read
    .format('parquet')
    .load('../../processed/constructos')
)

In [18]:
# Preview the first five rows of the frame read back from parquet.
constructos.show(n=5)

+--------------+---------------+----------+-----------+--------------------+--------------------+
|constructor_id|constructor_ref|      name|nationality|                 url|      ingestion_date|
+--------------+---------------+----------+-----------+--------------------+--------------------+
|             1|        mclaren|   McLaren|    British|http://en.wikiped...|2022-03-14 14:45:...|
|             2|     bmw_sauber|BMW Sauber|     German|http://en.wikiped...|2022-03-14 14:45:...|
|             3|       williams|  Williams|    British|http://en.wikiped...|2022-03-14 14:45:...|
|             4|        renault|   Renault|     French|http://en.wikiped...|2022-03-14 14:45:...|
|             5|     toro_rosso|Toro Rosso|    Italian|http://en.wikiped...|2022-03-14 14:45:...|
+--------------+---------------+----------+-----------+--------------------+--------------------+
only showing top 5 rows

