In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
df = pd.read_csv("data/transactions-small.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
1,26,33120,5,0,153404,prior,2,0,16,7.0,Organic Egg Whites,86,16,eggs,dairy eggs
2,120,33120,13,0,23750,prior,11,6,8,10.0,Organic Egg Whites,86,16,eggs,dairy eggs
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16,eggs,dairy eggs


In [4]:
df.shape

(100000, 15)

# One Hot Encode Products

In [5]:
len(df["product_name"].unique())

7

In [6]:
one_hot_df = pd.get_dummies(df, columns=["product_name"])

In [7]:
one_hot_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,...,department_id,aisle,department,product_name_All Whites 100% Egg Whites,product_name_Large Alfresco Eggs,product_name_Large Grade AA Eggs,product_name_Liquid Egg Whites,product_name_Organic Egg Whites,product_name_Organic Extra Large Grade AA Brown Eggs,product_name_Organic Grade A Large Brown Eggs
0,2,33120,1,1,202279,prior,3,5,9,8.0,...,16,eggs,dairy eggs,0,0,0,0,1,0,0
1,26,33120,5,0,153404,prior,2,0,16,7.0,...,16,eggs,dairy eggs,0,0,0,0,1,0,0
2,120,33120,13,0,23750,prior,11,6,8,10.0,...,16,eggs,dairy eggs,0,0,0,0,1,0,0
3,327,33120,5,1,58707,prior,21,6,9,8.0,...,16,eggs,dairy eggs,0,0,0,0,1,0,0
4,390,33120,28,1,166654,prior,48,0,12,9.0,...,16,eggs,dairy eggs,0,0,0,0,1,0,0


In [8]:
one_hot_columns = [c for c in one_hot_df.columns if c not in df.columns]

# Fill and Filter Columns

In [9]:
one_hot_df["days_since_prior_order"] = one_hot_df["days_since_prior_order"].fillna(0)
one_hot_df["days_since_prior_order"] = one_hot_df["days_since_prior_order"].astype('int64')

In [10]:
# fitler relevant columns
columns = ["user_id", "days_since_prior_order", "order_number"] + one_hot_columns
one_hot_df = one_hot_df[columns]

In [12]:
one_hot_df.head()

Unnamed: 0,user_id,days_since_prior_order,order_number,product_name_All Whites 100% Egg Whites,product_name_Large Alfresco Eggs,product_name_Large Grade AA Eggs,product_name_Liquid Egg Whites,product_name_Organic Egg Whites,product_name_Organic Extra Large Grade AA Brown Eggs,product_name_Organic Grade A Large Brown Eggs
0,202279,8,3,0,0,0,0,1,0,0
1,153404,7,2,0,0,0,0,1,0,0
3,58707,8,21,0,0,0,0,1,0,0
4,166654,9,48,0,0,0,0,1,0,0
5,180135,3,15,0,0,0,0,1,0,0


In [13]:
one_hot_df.shape

(88159, 10)

# Pad Orders with Days Since Last Order

In [24]:
def build_orders_matrix(one_hot_df, one_hot_columns):
    """
    Forms one-hot-encoded order time series padded with zeros for days between orders
        each row is each user's order history
    
    Args: 
        one_hot_df (pd.dataframe): contains user_id, days_since_prior_order, one-hot-encoded orders
        one_hot_columns (list(str)): list containing one-hot-column names
    
    Returns:
        orders_matrix (list): One row per user, [[1, 0, 1, .., 1], [0, 0,...,0], ...], 
                                                        [1, 0, 1, .., 1], [0, 0,...,0], ...], 
                                                       ]
        user_ids (list): [user_id1, user_id2]
    """
    orders_matrix = [[]]
    user_ids = []
    
    # sort one_hot_df by user_id and order_number
    one_hot_df_sorted = one_hot_df.sort_values(by=["user_id", "order_number"])
    
    previous_user_id = one_hot_df_sorted.iloc[0]["user_id"]
    user_ids.append(previous_user_id)
    
    for i, row in tqdm(one_hot_df_sorted.iterrows()):
        user_id = row["user_id"]
        days_since_last = row["days_since_prior_order"]
        order = list(row[one_hot_columns].values)
        
        # new user row or same user?
        if user_id != previous_user_id:
            user_ids.append(user_id)
            previous_orders = []
            orders_matrix.append([])
            previous_user_id = user_id
        else:
            previous_orders = orders_matrix[-1]

        orders = pad_orders(previous_orders, days_since_last, one_hot_columns)
        orders.append(order)
        orders_matrix[-1] = orders
    
    return user_ids, orders_matrix


def pad_orders(previous_orders, days_since_last, one_hot_columns):
    """Appends one-hot-encoded zero vectors for days since previous order"""
    if days_since_last == 0 or not previous_orders:
        return previous_orders
    
    padding = [[0]*len(one_hot_columns)]*days_since_last
    return previous_orders + padding

In [25]:
user_ids, orders_matrix = build_orders_matrix(one_hot_df, one_hot_columns)

88159it [00:57, 1537.82it/s]


In [26]:
user_ids[2]

77

In [27]:
len(user_ids)

16255

In [28]:
len(orders_matrix)

16255

In [29]:
orders_matrix[3]

[[0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0]]

In [30]:
# see number of timesteps
for i, user_orders in enumerate(orders_matrix):
    print(len(user_orders))
    if i > 10: 
        break
    

16
24
29
32
32
9
102
13
201
7
11
271


# Get Target and Pad to Largest Time Step


In [31]:
max_time = max(len(user_orders) for user_orders in orders_matrix)
max_time

486

In [32]:
def get_target_square_matrix(orders_matrix, max_time, one_hot_columns):
    """
    Creates target array and pads orders_matrix so it's square
    
    Args:
        orders_matrix (list): list containing one hot encoded orders with zero padding between orders.
                                should only contain users who order more than once.
        max_time (int): longest order time steps
        one_hot_columns (list): strings containing one-hot-column names
        
    Returns:
        target (list): users last order, one-hot-encoded
        orders_matrix (list): squared matrix padded with zeros at the end
    """
    target = []
    square_orders_matrix = []
    
    for user_id, user_orders in zip(user_ids, orders_matrix):
        
        orders_up_to_last = user_orders[:-1]
        last_order = user_orders[-1]
        
        target.append(last_order)
        padded_orders = pad_end(orders_up_to_last, max_time, one_hot_columns)
        square_orders_matrix.append(padded_orders)
        
    return target, square_orders_matrix


def pad_end(user_orders, max_time, one_hot_columns):
    """
    Adds padding to the end of a user_orders to reach max number of steps
        this creates a square matrix
    """
    remaining_steps = max_time - len(user_orders)
    padding = [[0]*len(one_hot_columns)] * remaining_steps
    return user_orders + padding

In [34]:
target, square_orders_matrix = get_target_square_matrix(orders_matrix, max_time, one_hot_columns)

In [35]:
print(len(user_ids))
print(user_ids[3])

16255
79


In [36]:
print(len(target))
print(target[3])

16255
[0, 0, 0, 0, 0, 1, 0]


In [37]:
print(len(square_orders_matrix))
print(len(square_orders_matrix[3]))
print(square_orders_matrix[3])

16255
486
[[0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], 

# Save

In [44]:
np.save("data/orders_matrix.npy", orders_matrix)

In [43]:
np.save("data/target.npy", target)