# ETL for main data
The following script builds each of the datasets provided by Kaggle.

While this script may seem to have a lot of repetition, many of the functions used have been extracted and moved to `src/utils/etl_utils`. The assumption is that each dataset will require individual treatment, and this will become clearer as the data is better understood. Repeating the steps cell by cell for each file gives a data engineer a lot more freedom to make changes to individual datasets rather than treating them all the same.

Note that the schemas are deliberately coupled with the ETL cells below rather than living externally. This is because it is very unlikely that a dataset will have multiple different schemas, so coupling configuration and functionality keeps things simpler in this instance.

### Clear the interim data directory
This should be run when an entire refresh is required.

In [None]:
from src.utils.etl_utils import *

# Switch guarding the destructive step below: while True, the interim data
# directory is cleared before anything else runs (full refresh); set it to
# False to keep whatever is already there.
refresh = True # set to True if refresh of all data will be done.

if refresh:
    # Helper comes from src.utils.etl_utils (wildcard import at the top of this cell).
    clear_interm_data_dir()

### Decompress downloaded file
(main file from Kaggle)

In [2]:
from config import proj
from src.utils.etl_utils import *

# Resolve the two project directories this cell works with.
raw_dir = proj.Config.paths.get("data_raw")
interim_dir = proj.Config.paths.get("data_interim")

# Unpack the main Kaggle download into the interim directory.
decomp_file(
    file_path=raw_dir.joinpath('favorita-grocery-sales-forecasting.zip'),
    extract_path=interim_dir,
)

# List what is now in the interim directory that matches the .7z pattern;
# each of these archives is handled by its own cell further down.
file_list = get_file_list(
    dir_path=interim_dir,
    regex_etx_pattern=".*\\.7z$",
)

print(file_list)

['items.csv.7z', 'transactions.csv.7z', 'holidays_events.csv.7z', 'train.csv.7z', 'stores.csv.7z', 'oil.csv.7z', 'test.csv.7z', 'sample_submission.csv.7z']


## Prepare extracted data

In [None]:
from pyspark.sql import SparkSession
# Get the Spark session for this kernel: getOrCreate() returns the active
# session if one exists and otherwise starts a new one.
# NOTE(review): presumably the csv_to_parquet helper used below picks up this
# same session — its implementation is not visible here, confirm.
spark = SparkSession.builder.getOrCreate()

### Holidays Events
Holidays and Events, with metadata

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "holidays_events.csv.7z"
interim_dir = proj.Config.paths.get("data_interim")
file_path = interim_dir.joinpath(file_name)

# Step 1: decompress the archive into the same interim directory.
decomp_file(
    file_path=file_path,
    extract_path=interim_dir,
    print_msg=True,
)

# Step 2: work out the csv input path (archive path without ".7z") and the
# parquet output path (same stem, under the processed-data directory).
csv_path_str = sub("\\.7z$", "", str(file_path))
file_path_csv = Path(csv_path_str)
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Step 3: explicit schema for this dataset; every column is nullable.
schema = (
    StructType()
    .add('date', DateType(), nullable=True)
    .add('type', StringType(), nullable=True)
    .add('locale', StringType(), nullable=True)
    .add('locale_name', StringType(), nullable=True)
    .add('description', StringType(), nullable=True)
    .add('transferred', BooleanType(), nullable=True)
)

# Step 4: convert the csv to parquet using that schema.
csv_to_parquet(
    read_path=file_path_csv,
    write_path=parquet_file_path,
    schema=schema,
)

### Items
Item metadata, including family, class, and perishable.
NOTE: Items marked as perishable have a score weight of 1.25; otherwise, the weight is 1.0.

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "items.csv.7z"
interim_dir = proj.Config.paths.get("data_interim")
file_path = interim_dir.joinpath(file_name)

# Step 1: decompress the archive into the same interim directory.
decomp_file(
    file_path=file_path,
    extract_path=interim_dir,
    print_msg=True,
)

# Step 2: work out the csv input path (archive path without ".7z") and the
# parquet output path (same stem, under the processed-data directory).
csv_path_str = sub("\\.7z$", "", str(file_path))
file_path_csv = Path(csv_path_str)
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Step 3: explicit schema for this dataset; every column is nullable.
schema = (
    StructType()
    .add('item_nbr', IntegerType(), nullable=True)
    .add('family', StringType(), nullable=True)
    .add('class', StringType(), nullable=True)
    .add('perishable', IntegerType(), nullable=True)
)

# Step 4: convert the csv to parquet using that schema.
csv_to_parquet(
    read_path=file_path_csv,
    write_path=parquet_file_path,
    schema=schema,
)

### Oil
Daily oil price. Includes values during both the train and test data timeframe. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)
However, it is very unlikely that this data will be useful for the model unless we know future oil prices (which we would need to predict).

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "oil.csv.7z"
interim_dir = proj.Config.paths.get("data_interim")
file_path = interim_dir.joinpath(file_name)

# Step 1: decompress the archive into the same interim directory.
decomp_file(
    file_path=file_path,
    extract_path=interim_dir,
    print_msg=True,
)

# Step 2: work out the csv input path (archive path without ".7z") and the
# parquet output path (same stem, under the processed-data directory).
csv_path_str = sub("\\.7z$", "", str(file_path))
file_path_csv = Path(csv_path_str)
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Step 3: explicit schema for this dataset; both columns are nullable.
schema = (
    StructType()
    .add('date', DateType(), nullable=True)
    .add('dcoilwtico', FloatType(), nullable=True)
)

# Step 4: convert the csv to parquet using that schema.
csv_to_parquet(
    read_path=file_path_csv,
    write_path=parquet_file_path,
    schema=schema,
)

### Sample submission
A sample submission file in the correct format.

Note: this is not input data, but instead an illustration of the structure of a submission to Kaggle.
Notice that `id` is included here, meaning we must keep `id` in the test data.

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "sample_submission.csv.7z"
interim_dir = proj.Config.paths.get("data_interim")
file_path = interim_dir.joinpath(file_name)

# Step 1: decompress the archive into the same interim directory.
decomp_file(
    file_path=file_path,
    extract_path=interim_dir,
    print_msg=True,
)

# Step 2: work out the csv input path (archive path without ".7z") and the
# parquet output path (same stem, under the processed-data directory).
csv_path_str = sub("\\.7z$", "", str(file_path))
file_path_csv = Path(csv_path_str)
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Step 3: explicit schema for this dataset; both columns are nullable.
schema = (
    StructType()
    .add('id', IntegerType(), nullable=True)
    .add('unit_sales', FloatType(), nullable=True)
)

# Step 4: convert the csv to parquet using that schema.
csv_to_parquet(
    read_path=file_path_csv,
    write_path=parquet_file_path,
    schema=schema,
)

### Stores
Store metadata, including city, state, type, and cluster.
cluster is a grouping of similar stores.

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "stores.csv.7z"
interim_dir = proj.Config.paths.get("data_interim")
file_path = interim_dir.joinpath(file_name)

# Step 1: decompress the archive into the same interim directory.
decomp_file(
    file_path=file_path,
    extract_path=interim_dir,
    print_msg=True,
)

# Step 2: work out the csv input path (archive path without ".7z") and the
# parquet output path (same stem, under the processed-data directory).
csv_path_str = sub("\\.7z$", "", str(file_path))
file_path_csv = Path(csv_path_str)
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Step 3: explicit schema for this dataset; every column is nullable.
# `cluster` is an integer in the data, but since it is a grouping it is
# deliberately kept as a string so that it stays categorical.
schema = (
    StructType()
    .add('store_nbr', IntegerType(), nullable=True)
    .add('city', StringType(), nullable=True)
    .add('state', StringType(), nullable=True)
    .add('type', StringType(), nullable=True)
    .add('cluster', StringType(), nullable=True)
)

# Step 4: convert the csv to parquet using that schema.
csv_to_parquet(
    read_path=file_path_csv,
    write_path=parquet_file_path,
    schema=schema,
)

### Test
Test data, with the date, store_nbr, item_nbr combinations that are to be predicted, along with the onpromotion information.
NOTE: The test data has a small number of items that are not contained in the training data. Part of the exercise will be to predict a new item's sales based on similar products.
The public / private leaderboard split is based on time. All items in the public split are also included in the private split.

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "test.csv.7z"
interim_dir = proj.Config.paths.get("data_interim")
file_path = interim_dir.joinpath(file_name)

# Step 1: decompress the archive into the same interim directory.
decomp_file(
    file_path=file_path,
    extract_path=interim_dir,
    print_msg=True,
)

# Step 2: work out the csv input path (archive path without ".7z") and the
# parquet output path (same stem, under the processed-data directory).
csv_path_str = sub("\\.7z$", "", str(file_path))
file_path_csv = Path(csv_path_str)
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Step 3: explicit schema for this dataset; every column is nullable.
# `id` is part of the schema here and is written to the parquet output.
schema = (
    StructType()
    .add('id', IntegerType(), nullable=True)
    .add('date', DateType(), nullable=True)
    .add('store_nbr', IntegerType(), nullable=True)
    .add('item_nbr', IntegerType(), nullable=True)
    .add('onpromotion', BooleanType(), nullable=True)
)

# Step 4: convert the csv to parquet using that schema.
csv_to_parquet(
    read_path=file_path_csv,
    write_path=parquet_file_path,
    schema=schema,
)

### Train
Training data, which includes the target unit_sales by date, store_nbr, and item_nbr and a unique id to label rows.
The target unit_sales can be integer (e.g., a bag of chips) or float (e.g., 1.5 kg of cheese).
Negative values of unit_sales represent returns of that particular item.
The onpromotion column tells whether that item_nbr was on promotion for a specified date and store_nbr.
Approximately 16% of the onpromotion values in this file are NaN.
NOTE: The training data does not include rows for items that had zero unit_sales for a store/date combination. There is no information as to whether or not the item was in stock for the store on the date, and teams will need to decide the best way to handle that situation. Also, there are a small number of items seen in the training data that aren't seen in the test data.

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
# SparkSession is used directly in this cell, so import it here instead of
# relying on an earlier cell's import still being live in the kernel.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "train.csv.7z"

# Decompress the archive into the same interim directory.
file_path = proj.Config.paths.get("data_interim").joinpath(file_name)
decomp_file(file_path=file_path,
            extract_path=proj.Config.paths.get("data_interim"),
            print_msg=True)

# csv input path is the archive path without ".7z"; the parquet output keeps
# the same stem and goes to the processed-data directory.
file_path_csv = Path(sub("\\.7z$", "", str(file_path)))
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Explicit schema for the csv; every column is nullable. `id` has to be
# declared so the csv columns line up, even though it is dropped below.
schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('date', DateType(), True)
    .add('store_nbr', IntegerType(), True)
    .add('item_nbr', IntegerType(), True)
    .add('unit_sales', FloatType(), True)
    .add('onpromotion', BooleanType(), True)
)

# Same names as the csv_to_parquet keyword arguments used by the other cells.
read_path = file_path_csv
write_path = parquet_file_path

# The read/write is done inline here (instead of calling csv_to_parquet) so
# that a column can be dropped between reading the csv and writing the parquet.
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv(str(read_path), header=True, schema=schema)
df = df.drop("id") # ID will never be used for this data set, best to save space
df.write.parquet(str(write_path), mode='overwrite')

### Transactions
The count of sales transactions for each date, store_nbr combination. Only included for the training data timeframe.

In [None]:
from config import proj
from src.utils.etl_utils import *
from re import sub
from pathlib import Path
from pyspark.sql.types import StructType, IntegerType, StringType, DateType, FloatType, BooleanType

# Archive handled by this cell, located in the interim data directory.
file_name = "transactions.csv.7z"
interim_dir = proj.Config.paths.get("data_interim")
file_path = interim_dir.joinpath(file_name)

# Step 1: decompress the archive into the same interim directory.
decomp_file(
    file_path=file_path,
    extract_path=interim_dir,
    print_msg=True,
)

# Step 2: work out the csv input path (archive path without ".7z") and the
# parquet output path (same stem, under the processed-data directory).
csv_path_str = sub("\\.7z$", "", str(file_path))
file_path_csv = Path(csv_path_str)
parquet_file_name = Path(sub("\\.csv\\.7z$", ".parquet", str(file_name)))
parquet_file_path = proj.Config.paths.get("data_proc").joinpath(parquet_file_name)

# Step 3: explicit schema for this dataset; every column is nullable.
schema = (
    StructType()
    .add('date', DateType(), nullable=True)
    .add('store_nbr', IntegerType(), nullable=True)
    .add('transactions', IntegerType(), nullable=True)
)

# Step 4: convert the csv to parquet using that schema.
csv_to_parquet(
    read_path=file_path_csv,
    write_path=parquet_file_path,
    schema=schema,
)

### Clean up interim directory

In [None]:
# Final step: clear the interim data directory again. The conversion cells
# above write their parquet output to the separate "data_proc" location, so
# the interim files are presumably no longer needed — confirm before running.
# NOTE: this cell has no imports of its own; it relies on an earlier
# `from src.utils.etl_utils import *` having been executed in this kernel.
clear_interm_data_dir()