# Feed Forward Preprocessing Aisles

* `mergedProKey.csv` produced by Prep-Data.py

* based on Ghazal's preprocessing script `onlineLDA-productlevel.py`

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# DataRareProdMergeOrderTime.csv replaces products appearing fewer than 20 times with aisle
userIDprodName_df = pd.read_csv("data/DataRareProdMergOrderTime.csv", encoding = "ISO-8859-1")

In [3]:
# integer encodes product
newProdName = pd.factorize(userIDprodName_df['product_name'])
prodID = newProdName[0]
prodIDindex = newProdName[1]

In [4]:
userIDprodNameprodID_df = pd.concat([userIDprodName_df.reset_index(drop=True), pd.DataFrame(prodID)], axis=1)
userIDprodNameprodID_df.columns = ['n',
 'order_id',
 'product_id',
 'user_id',
 'product_name',
 'aisle',
 'product_id_tsfm',
 'days_since_prior_order',
 'order_number',
 'nn']


In [5]:
user_idx = userIDprodNameprodID_df['user_id'].unique()
ndocs = userIDprodNameprodID_df['user_id'].max()
nwords = userIDprodNameprodID_df['product_id_tsfm'].max()
print("number of documents (users) {} and words (products) {}".format(ndocs, nwords))

number of documents (users) 206209 and words (products) 36054


In [6]:
group_by_columns = ['user_id', 'aisle', 'order_number']
countOrder_series = userIDprodNameprodID_df.groupby(group_by_columns).size()
new_df = countOrder_series.to_frame(name = 'size').reset_index()

In [7]:
new_df.head()

Unnamed: 0,user_id,aisle,order_number,size
0,1,candy chocolate,10,1
1,1,cereal,2,1
2,1,cereal,7,1
3,1,cereal,10,1
4,1,cream,8,1


In [8]:
len(new_df["aisle"].unique())

134

# One Hot Encode Aisles

In [9]:
one_hot_df = pd.get_dummies(new_df, columns=["aisle"])

In [10]:
one_hot_df.head()

Unnamed: 0,user_id,order_number,size,aisle_air fresheners candles,aisle_asian foods,aisle_baby accessories,aisle_baby bath body care,aisle_baby food formula,aisle_bakery desserts,aisle_baking ingredients,...,aisle_spreads,aisle_tea,aisle_tofu meat alternatives,aisle_tortillas flat bread,aisle_trail mix snack mix,aisle_trash bags liners,aisle_vitamins supplements,aisle_water seltzer sparkling water,aisle_white wines,aisle_yogurt
0,1,10,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,7,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,10,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,8,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Get Last Order

In [11]:
target_df = one_hot_df.groupby("user_id").last()

In [12]:
# remove target from original df, based on index ["user_id", "order_id"]

target_df = target_df.set_index("order_number", append=True)
one_hot_df = one_hot_df.set_index(["user_id", "order_number"])
train_df = one_hot_df.drop(target_df.index, axis=0)

In [13]:
target_df = target_df.reset_index()
train_df = train_df.reset_index()

## Check Target and Train Shapes

In [14]:
target_df.shape

(206209, 137)

In [15]:
train_df.shape

(21569140, 137)

# Group Orders by Count

In [16]:
train_df = train_df.groupby(["user_id"]).sum()

In [17]:
target_df = target_df.groupby(["user_id"]).sum()

In [18]:
train_df.shape

(206209, 136)

In [19]:
target_df.shape

(206209, 136)

# Save DataFrames

In [20]:
train_df.to_csv("data/train.csv")
target_df.to_csv("data/target.csv")