In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
df = pd.read_csv("data/transactions-small.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
1,26,33120,5,0,153404,prior,2,0,16,7.0,Organic Egg Whites,86,16,eggs,dairy eggs
2,120,33120,13,0,23750,prior,11,6,8,10.0,Organic Egg Whites,86,16,eggs,dairy eggs
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16,eggs,dairy eggs


In [4]:
df.shape

(100000, 15)

# Sort and Fill Columns

In [5]:
columns = ["days_since_prior_order", "product_id", "order_number"]

In [6]:
df["days_since_prior_order"] = df["days_since_prior_order"].fillna(0)
df["days_since_prior_order"] = df["days_since_prior_order"].astype('int64')

In [7]:
df = df.set_index("user_id")
df = df[columns].sort_values(by="order_number")

# Pad Orders with Days Since Last Order

In [8]:
def add_order(user_orders, row):
    """Adds product and pads orders list with 0s for days since last order
    
    Args:
        user_orders (list): containing orders for user up to now
        row (dataframe row): containing days since last order
    
    Returns:
        user_orders (list): updated with order and padded zeros.
    """
    
    days = row["days_since_prior_order"]
    
    
    for i in range(row["days_since_prior_order"]):
        user_orders.append(0)
    
    product_id = row["product_id"]
    user_orders.append(product_id)
    
    return user_orders

In [9]:
df.isnull().sum()

days_since_prior_order    0
product_id                0
order_number              0
dtype: int64

In [10]:
# put into array of hashtables
#         {"user1": [12, 0, 12, 0, 0, ...], 
#           "user2": ...
#        }

users_orders = {}

# Assumes dataframe is ordered by user_id and order_number
for user_id, row in tqdm(df.iterrows()):
    
    if user_id not in users_orders:
        users_orders[user_id] = []
    
    user_orders = users_orders[user_id]
    updated_user_orders = add_order(user_orders, row)
    
    users_orders[user_id] = updated_user_orders

100000it [00:07, 13352.51it/s]


# Pad to Largest Time Step


In [11]:
longest_id = max(users_orders.keys(), key=(lambda k: len(users_orders[k])))
# Subtract 1 for last_order, which will be target
longest_step_size = len(users_orders[longest_id]) - 1

In [12]:
# TODO: add target array containing last order!

In [13]:
order_matrix = []
# first column is user_id
last_orders = []
user_ids = []

for user_id in tqdm(users_orders):
    orders = np.array(users_orders[user_id])
    orders_up_to_last = orders[:-1]
    last_order = orders[-1]
    
    zeros = np.zeros(longest_step_size - len(orders_up_to_last))
    orders_up_to_last_padded = np.concatenate((orders, zeros), axis=0)
    order_matrix.append(orders_up_to_last_padded)
    
    last_orders.append(last_order)
    user_ids.append(user_id)

100%|██████████| 28096/28096 [00:00<00:00, 62088.53it/s]


# Save as CSV

In [14]:
padded_orders_df = pd.DataFrame(order_matrix, index=user_ids)

In [15]:
padded_orders_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,476,477,478,479,480,481,482,483,484,485
4034,36550.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53189,46654.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94546,11520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13147,46654.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86933,46654.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# index = user_id
padded_orders_df.to_csv("data/order-matrix.csv")

In [22]:
last_orders_df = pd.DataFrame(last_orders, index=user_ids)

In [23]:
last_orders_df.to_csv("data/last-order-matrix.csv")