In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Get (or lazily create) the SparkSession for this notebook, registered
# under the application name 'Optimize I'.
spark = (
    SparkSession.builder
    .appName('Optimize I')
    .getOrCreate()
)

In [None]:
# Load the comma-separated data file into a DataFrame. The file is read
# without a header row and column types are inferred from the data.
# NOTE(review): the variable name mentions Walmart stock, but the path points
# at sales_data/data.csv -- confirm which dataset is intended.
# NOTE(review): the following cells operate on `raw_rdd`, which is not defined
# in any visible cell -- confirm how it is meant to be derived from this load.
walmart_stock = (
    spark.read.format('csv')
    .options(inferSchema=True, header=False, sep=',')
    .load("/FileStore/tables/sales_data/data.csv")
)

#### 1.1 Perform map operation ####
We need to propagate make and year to the accident records (incident type A), using
vin_number as the aggregation key. The map output key should therefore be vin_number, and the
value should be the make and year together with the incident type. In Spark, the
“groupByKey” function operates on a PairRDD, so the map operation must emit each record as a
(key, value) tuple.

In [None]:
# Convert every raw record into a (key, value) pair.
# TODO: extract_vin_key_value() still has to be implemented; per the step
# description above it should key each record by vin_number.
vin_kv = raw_rdd.map(lambda record: extract_vin_key_value(record))

#### 1.2 Perform group aggregation to populate make and year to all the records ####
Like the reducer in the MapReduce framework, Spark provides a “groupByKey” function that
plays the role of the shuffle-and-sort phase: it brings all records sharing the same key into
the same group. Within each vin_number group, we first iterate through the records to find the
one that has the make and year available, and capture it as group-level master info. Then, as
we filter and output the accident records, each of them is enriched with the master info
captured in the first pass.

In [None]:
# Group the pairs by key, pass each group's values to populate_make(), and
# flatten the records it returns into a single RDD.
# TODO: populate_make() still has to be implemented.
enhance_make = (
    vin_kv
    .groupByKey()
    .flatMap(lambda keyed_group: populate_make(keyed_group[1]))
)

#### 2.1 Perform map operation ####
The goal of this step is to count the number of records for each make and year combination,
using the result derived in the previous step. The output key should be the combination of
vehicle make and year, and the value should be a count of 1 for each record.

In [None]:
# Map every enhanced record through extract_make_key_value().
# TODO: extract_make_key_value() still has to be implemented; per the step
# description above it should emit a make-and-year key with a value of 1.
make_kv = enhance_make.map(lambda record: extract_make_key_value(record))

#### 2.2 Aggregate the key and count the number of records in total per key ####
Use Spark's “reduceByKey” function to sum the values (1 per record) for each key.
As a result, we get each make and year combination key along with its total record count.

#### Step 3. Save the result to HDFS as text ####
The output file should look similar to this.
- Nissan-2003,1
- BMW-2008,10
- MERCEDES-2013,2