# Build Feature/Future Store


In [None]:
from config import proj
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

# Build mode: "train" builds the feature store, "test" builds the future store.
# Any other value is rejected by the loading cell.
build_on = "test"

## Add provided data

### Pull in train or test

In [2]:
# Pick the processed split that matches the build mode:
#   "train" -> train.parquet (feature store), "test" -> test.parquet (future store).
split_files = {"train": "train.parquet", "test": "test.parquet"}

if build_on not in split_files:
    # NotImplementedError is the exception class. The NotImplemented constant is
    # not callable, so `raise NotImplemented(...)` fails with a TypeError instead
    # of reporting the intended message.
    raise NotImplementedError("Can only build feature or future store")

base = spark.read.parquet(
    str(proj.Config.paths.get("data_proc").joinpath(split_files[build_on]))
)

                                                                                

### Add item

In [3]:
# Item attributes, read from the processed-data directory.
items_path = proj.Config.paths.get("data_proc").joinpath("items.parquet")
items = spark.read.parquet(str(items_path))

# Left join on item_nbr, then drop the duplicate key column that comes from `items`.
same_item = base.item_nbr == items.item_nbr
base = base.join(items, on=same_item, how="left").drop(items.item_nbr)
base.show(5)

                                                                                

+---------+----------+---------+--------+-----------+------------+-----+----------+
|       id|      date|store_nbr|item_nbr|onpromotion|      family|class|perishable|
+---------+----------+---------+--------+-----------+------------+-----+----------+
|125497040|2017-08-16|        1|   96995|      false|   GROCERY I| 1093|         0|
|125497041|2017-08-16|        1|   99197|      false|   GROCERY I| 1067|         0|
|125497042|2017-08-16|        1|  103501|      false|    CLEANING| 3008|         0|
|125497043|2017-08-16|        1|  103520|      false|   GROCERY I| 1028|         0|
|125497044|2017-08-16|        1|  103665|      false|BREAD/BAKERY| 2712|         1|
+---------+----------+---------+--------+-----------+------------+-----+----------+
only showing top 5 rows



### Add store

In [4]:
# Store attributes, read from the processed-data directory.
stores_path = proj.Config.paths.get("data_proc").joinpath("stores.parquet")
stores = spark.read.parquet(str(stores_path))

# Left join on store_nbr, then drop the duplicate key column that comes from `stores`.
same_store = base.store_nbr == stores.store_nbr
base = base.join(stores, on=same_store, how="left").drop(stores.store_nbr)
base.show(5)

+---------+----------+---------+--------+-----------+------------+-----+----------+-----+---------+----+-------+
|       id|      date|store_nbr|item_nbr|onpromotion|      family|class|perishable| city|    state|type|cluster|
+---------+----------+---------+--------+-----------+------------+-----+----------+-----+---------+----+-------+
|125497040|2017-08-16|        1|   96995|      false|   GROCERY I| 1093|         0|Quito|Pichincha|   D|     13|
|125497041|2017-08-16|        1|   99197|      false|   GROCERY I| 1067|         0|Quito|Pichincha|   D|     13|
|125497042|2017-08-16|        1|  103501|      false|    CLEANING| 3008|         0|Quito|Pichincha|   D|     13|
|125497043|2017-08-16|        1|  103520|      false|   GROCERY I| 1028|         0|Quito|Pichincha|   D|     13|
|125497044|2017-08-16|        1|  103665|      false|BREAD/BAKERY| 2712|         1|Quito|Pichincha|   D|     13|
+---------+----------+---------+--------+-----------+------------+-----+----------+-----+-------

## Feature engineering

### New and cleared item flag
These flags tell us how to treat particular items. Items that have been cleared won't need predictions, so we can possibly filter them out. New items need a different treatment, since the model won't have seen them before.

In [5]:
# Both splits are read here regardless of `build_on`: the flags are derived by
# comparing which items appear in train versus test.
data_proc_dir = proj.Config.paths.get("data_proc")
train = spark.read.parquet(str(data_proc_dir.joinpath("train.parquet")))
test = spark.read.parquet(str(data_proc_dir.joinpath("test.parquet")))

# Distinct items per split, each tagged with a constant marker column.
train_items = train.select("item_nbr", sf.lit(1).alias("train_fl")).distinct()
test_items = test.select("item_nbr", sf.lit(1).alias("test_fl")).distinct()

# Full outer join: an item absent from one split gets a null marker for that split.
item_coverage = train_items.join(
    test_items,
    on=train_items.item_nbr == test_items.item_nbr,
    how="full",
)

# Items not in train: the train marker is null, so keep the test-side key.
new_items = (
    item_coverage
    .filter(sf.col("train_fl").isNull())
    .drop(train_items.item_nbr)
    .select(test_items.item_nbr, sf.lit(1).alias("new_item"))
)

# Items not in test: the test marker is null, so keep the train-side key.
cleared_items = (
    item_coverage
    .filter(sf.col("test_fl").isNull())
    .drop(test_items.item_nbr)
    .select(train_items.item_nbr, sf.lit(1).alias("cleared_item"))
)

In [6]:
# Attach both flags by item_nbr. Rows with no match in a flag table come back
# null from the left join and are filled with 0.
flag_columns = ["new_item", "cleared_item"]
base = (
    base
    .join(new_items, on=["item_nbr"], how="left")
    .join(cleared_items, on=["item_nbr"], how="left")
    .fillna(0, subset=flag_columns)
)

### Add events
**TODO:** Engineer appropriate flags for events.

# Validation
- Count rows
- Check for nulls, etc

In [7]:
# Reference row counts per build mode:
#   train: 125497040
#   test:    3370464

In [8]:
# Null checks for the flag columns (currently disabled):
# base.filter("new_item is null").count()
# base.filter("cleared_item is null").count()

In [9]:
# Preview the first five rows of the assembled store, including the new_item / cleared_item flags.
base.show(5)

                                                                                

+--------+---------+----------+---------+-----------+------------+-----+----------+-----+---------+----+-------+--------+------------+
|item_nbr|       id|      date|store_nbr|onpromotion|      family|class|perishable| city|    state|type|cluster|new_item|cleared_item|
+--------+---------+----------+---------+-----------+------------+-----+----------+-----+---------+----+-------+--------+------------+
|   96995|125497040|2017-08-16|        1|      false|   GROCERY I| 1093|         0|Quito|Pichincha|   D|     13|       0|           0|
|   99197|125497041|2017-08-16|        1|      false|   GROCERY I| 1067|         0|Quito|Pichincha|   D|     13|       0|           0|
|  103501|125497042|2017-08-16|        1|      false|    CLEANING| 3008|         0|Quito|Pichincha|   D|     13|       0|           0|
|  103520|125497043|2017-08-16|        1|      false|   GROCERY I| 1028|         0|Quito|Pichincha|   D|     13|       0|           0|
|  103665|125497044|2017-08-16|        1|      false|BR