In [85]:
# This notebook investigates the viability of transforming the data into parquet files using Spark.
# Advantages of Parquet over CSV: it has an embedded schema, takes up less storage, and allows
#  individual columns to be extracted much faster.
from config import proj
from pathlib import Path
import zipfile
import os
import py7zr # need for decompression of 7z

# Data directories of the project, each resolved below the configured top-level path.
PATH_RAW_DATA_DIR = Path(proj.proj_paths["top"]) / 'data' / 'raw'
PATH_INTERIM_DATA_DIR = Path(proj.proj_paths["top"]) / 'data' / 'interim'
PATH_PROC_DATA_DIR = Path(proj.proj_paths["top"]) / 'data' / 'processed'

In [7]:
# Look up the name of the compressed file by printing the first entry of the raw data directory.
# NOTE: os.listdir returns entries in arbitrary order.
raw_dir_entries = os.listdir(PATH_RAW_DATA_DIR)
print(raw_dir_entries[0])

favorita-grocery-sales-forecasting.zip


In [8]:
# Full path of the downloaded archive inside the raw data directory
PATH_RAW_DATA = PATH_RAW_DATA_DIR / 'favorita-grocery-sales-forecasting.zip'

In [13]:
# Print the ZipInfo record of every member stored in the outer zip archive
with zipfile.ZipFile(PATH_RAW_DATA, mode='r') as raw_archive:
    for member_info in raw_archive.infolist():
        print(member_info)

<ZipInfo filename='holidays_events.csv.7z' compress_type=deflate file_size=1898 compress_size=1903>
<ZipInfo filename='items.csv.7z' compress_type=deflate file_size=14315 compress_size=14320>
<ZipInfo filename='oil.csv.7z' compress_type=deflate file_size=3762 compress_size=3767>
<ZipInfo filename='sample_submission.csv.7z' compress_type=deflate file_size=666528 compress_size=649511>
<ZipInfo filename='stores.csv.7z' compress_type=deflate file_size=648 compress_size=653>
<ZipInfo filename='test.csv.7z' compress_type=deflate file_size=4885065 compress_size=4886553>
<ZipInfo filename='train.csv.7z' compress_type=deflate file_size=474092593 compress_size=474237203>
<ZipInfo filename='transactions.csv.7z' compress_type=deflate file_size=219499 compress_size=219569>


All the contents are also compressed. Will start with the largest one and generate a parquet file from it to ensure everything works smoothly.

In [14]:
# Extract every member of the outer zip archive into the interim data directory
with zipfile.ZipFile(PATH_RAW_DATA, mode='r') as raw_archive:
    raw_archive.extractall(path=PATH_INTERIM_DATA_DIR)

In [24]:
# Print each entry of the interim directory together with its size as reported by os.stat
file_list = os.listdir(PATH_INTERIM_DATA_DIR)
for entry_name in file_list:
    entry_size = os.stat(PATH_INTERIM_DATA_DIR / entry_name).st_size
    print(f'{entry_name} - {entry_size}')

items.csv.7z - 14315
transactions.csv.7z - 219499
holidays_events.csv.7z - 1898
train.csv.7z - 474092593
stores.csv.7z - 648
oil.csv.7z - 3762
test.csv.7z - 4885065
sample_submission.csv.7z - 666528


train.csv.7z is clearly the largest file.
stores.csv.7z is the smallest.

Will start small with the stores, then work with train.

In [33]:
# Decompress the 7z archive of the stores table into the interim data directory
PATH_INTERIM_DATA_STORES = PATH_INTERIM_DATA_DIR / 'stores.csv.7z'

with py7zr.SevenZipFile(PATH_INTERIM_DATA_STORES, mode='r') as stores_archive:
    stores_archive.extractall(path=PATH_INTERIM_DATA_DIR)

In [40]:
# Confirm that the extraction produced stores.csv in the interim directory
interim_entries = os.listdir(PATH_INTERIM_DATA_DIR)
print('stores.csv' in interim_entries)

True


Can see stores.csv is in the folder.

Now we will convert it into a parquet file using Spark.

In [None]:
from pyspark.sql import SparkSession

# Obtain a Spark session with default settings (an already running session is reused)
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [51]:
# Load the extracted stores CSV (its first row holds the column names) and preview the top 5 rows
stores_csv_path = str(PATH_INTERIM_DATA_DIR / 'stores.csv')
df = spark.read.option('header', True).csv(stores_csv_path)
df.show(n=5)

+---------+-------------+--------------------+----+-------+
|store_nbr|         city|               state|type|cluster|
+---------+-------------+--------------------+----+-------+
|        1|        Quito|           Pichincha|   D|     13|
|        2|        Quito|           Pichincha|   D|     13|
|        3|        Quito|           Pichincha|   D|      8|
|        4|        Quito|           Pichincha|   D|      9|
|        5|Santo Domingo|Santo Domingo de ...|   D|      4|
+---------+-------------+--------------------+----+-------+
only showing top 5 rows



In [69]:
# Inspect the current schema before updating it: the CSV was read without an
# explicit schema, and the output below shows every column typed as string.
df.printSchema()

root
 |-- store_nbr: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)
 |-- cluster: string (nullable = true)



In [72]:
from pyspark.sql.types import StructType, IntegerType, StringType

# Explicit schema for stores.csv; every column is declared nullable.
# `cluster` holds integer values, but because it is a grouping label it is
# deliberately kept as a string so that it stays categorical.
stores_schema = (
    StructType()
    .add('store_nbr', IntegerType(), nullable=True)
    .add('city', StringType(), nullable=True)
    .add('state', StringType(), nullable=True)
    .add('type', StringType(), nullable=True)
    .add('cluster', StringType(), nullable=True)
)

In [76]:
# Instead of casting the existing columns in place, load the CSV once more,
# this time applying the explicit schema, then verify the types and preview the rows.
stores_csv_path = str(PATH_INTERIM_DATA_DIR / 'stores.csv')
df = spark.read.schema(stores_schema).option('header', True).csv(stores_csv_path)
df.printSchema()
df.show(n=5)

root
 |-- store_nbr: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)
 |-- cluster: string (nullable = true)

+---------+-------------+--------------------+----+-------+
|store_nbr|         city|               state|type|cluster|
+---------+-------------+--------------------+----+-------+
|        1|        Quito|           Pichincha|   D|     13|
|        2|        Quito|           Pichincha|   D|     13|
|        3|        Quito|           Pichincha|   D|      8|
|        4|        Quito|           Pichincha|   D|      9|
|        5|Santo Domingo|Santo Domingo de ...|   D|      4|
+---------+-------------+--------------------+----+-------+
only showing top 5 rows



In [86]:
# Write the stores DataFrame to the processed data directory as parquet.
# Spark's default save mode raises an error when the target path already exists,
# so the plain write failed whenever this cell was run a second time
# (e.g. Restart Kernel -> Run All). Overwriting makes the cell safe to re-run.
# NOTE: this expects `df` to be the CSV-backed frame at this point; Spark refuses
# to overwrite a path that the DataFrame being written is itself read from.
PATH_STORES_PQ = str(PATH_PROC_DATA_DIR.joinpath('stores.parquet'))
df.write.mode('overwrite').parquet(PATH_STORES_PQ)

In [79]:
# Load the stores table back from the parquet output, then check that the
# schema survived the round trip and preview the first 6 rows.
df = spark.read.format('parquet').load(PATH_STORES_PQ)
df.printSchema()
df.show(n=6)

root
 |-- store_nbr: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)
 |-- cluster: string (nullable = true)

+---------+-------------+--------------------+----+-------+
|store_nbr|         city|               state|type|cluster|
+---------+-------------+--------------------+----+-------+
|        1|        Quito|           Pichincha|   D|     13|
|        2|        Quito|           Pichincha|   D|     13|
|        3|        Quito|           Pichincha|   D|      8|
|        4|        Quito|           Pichincha|   D|      9|
|        5|Santo Domingo|Santo Domingo de ...|   D|      4|
|        6|        Quito|           Pichincha|   D|     13|
+---------+-------------+--------------------+----+-------+
only showing top 6 rows



Schema looks good. We could also partition this file, but we won't need to since it is so small.

Given that pandas is so commonly used with Python, the pandas API on Spark will be used moving forward.

In [82]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps

# Expose the Spark DataFrame through the pandas API on Spark and preview its first 5 rows
psdf = df.pandas_api()
psdf.head(n=5)

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [84]:
# Summary statistics of the stores table. Only store_nbr appears in the output
# below; it is the only column the schema types as integer (the rest are strings).
psdf.describe()

Unnamed: 0,store_nbr
count,54.0
mean,27.5
std,15.732133
min,1.0
25%,14.0
50%,27.0
75%,41.0
max,54.0


Above looks good.

Below we will do the same for the largest file (train.csv) and make sure to partition it.
# TODO generate parquet file for train, attempt some operations on it.