In [25]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw transaction rows. The preview of this frame shows an
# "Unnamed: 0" column, i.e. the CSV has an unnamed leading column
# (presumably an index saved by an earlier export -- TODO confirm).
df = pd.read_csv("transactions-small.csv")

In [3]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
1,26,33120,5,0,153404,prior,2,0,16,7.0,Organic Egg Whites,86,16,eggs,dairy eggs
2,120,33120,13,0,23750,prior,11,6,8,10.0,Organic Egg Whites,86,16,eggs,dairy eggs
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16,eggs,dairy eggs


# Sort and Fill Columns

In [4]:
# Columns retained for building the per-user sequences.
columns = ["days_since_prior_order", "product_id", "order_number"]

In [5]:
# Key the rows by user, keep only the selected columns, and order the rows by
# order_number. Note: this rebinds `df`, so re-running the cell on the result
# fails because "user_id" is then the index rather than a column.
df = df.set_index("user_id")[columns].sort_values(by="order_number")

In [6]:
# Missing gaps become 0 days, then the column is stored as int64.
df["days_since_prior_order"] = df["days_since_prior_order"].fillna(0).astype("int64")

# Pad Days Since Last Order

In [16]:
def pad_order(user_orders, user_id, row):
    """Append one purchase to a user's sequence, preceded by gap zeros.

    One 0 is appended for each day in ``row["days_since_prior_order"]``,
    then ``row["product_id"]`` is appended.

    Args:
        user_orders (dict): maps user_id to that user's running list of
            product ids and 0 placeholders. Modified in place.
        user_id (int): key of the user whose list is extended. An empty
            list is created first if the user is not present yet.
        row (mapping or dataframe row): must provide the keys
            "days_since_prior_order" and "product_id".

    Returns:
        dict: the same ``user_orders`` object, after the update.
    """
    # int() lets numpy integers and whole-number floats (e.g. 7.0) behave alike.
    days = int(row["days_since_prior_order"])

    orders = user_orders.setdefault(user_id, [])
    orders.extend([0] * days)  # a non-positive gap adds nothing
    orders.append(row["product_id"])

    return user_orders

In [13]:
# Sanity check: count the missing values in each column.
df.isnull().sum()

days_since_prior_order    0
product_id                0
order_number              0
dtype: int64

In [14]:
# Build a dict of lists, one sequence per user:
#         {"user1": [12, 0, 12, 0, 0, ...],
#           "user2": ...
#        }
# Rows are consumed in the frame's current row order.

user_orders = {}

for user_id, row in df.iterrows():
    # Start an empty sequence the first time a user shows up.
    user_orders.setdefault(user_id, [])
    user_orders = pad_order(user_orders, user_id, row)


In [15]:
# Display the entire user -> sequence mapping (the output grows with every user).
user_orders

{29552: [33120,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  33120,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  33120,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  33120],
 182905: [33120],
 132385: [33120],
 101077: [33120,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  33120,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  33120,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  33120],
 79494: [33120],
 171279: [33120, 0, 0, 0, 0, 33120],
 11732: [33120,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  33120],
 109423: [33120, 0, 0, 0, 0, 0, 33120],
 176849: [33120],
 94121: [33120, 0, 0, 0, 0, 0, 0

# Pad to Largest Time Step


In [24]:
# Pick the user whose sequence is longest; that length is the common width
# used when padding the sequences.
longest_id = max(user_orders, key=lambda k: len(user_orders[k]))
longest_step_size = len(user_orders[longest_id])

In [29]:
# Scratch: small integer array for trying out zero-padding.
a = np.array([1, 2, 3])

In [30]:
# Scratch: np.zeros produces floats by default (see the output).
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [33]:
# Scratch: appending float zeros to the integer array yields a float array.
np.concatenate([a, np.zeros(10)], axis=0)

array([1., 2., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
order_matrix = []

# Right-pad every user's sequence with zeros up to longest_step_size.
# np.zeros defaults to float, so each padded row comes out as a float array.
for user_id, sequence in user_orders.items():
    pad_width = longest_step_size - len(sequence)
    orders = np.concatenate((np.array(sequence), np.zeros(pad_width)), axis=0)
    order_matrix.append(orders)

    

# Save as CSV

In [37]:
# Convert the list of padded rows into a single array (rebinds the name).
order_matrix = np.array(order_matrix)

In [38]:
# Wrap the matrix in a DataFrame; no labels are passed, so pandas assigns
# default row and column labels.
padded_orders_df = pd.DataFrame(order_matrix)

In [40]:
# Write the padded matrix to disk. With default options to_csv also writes
# the row index as an unnamed first column.
padded_orders_df.to_csv("ordermatrix.csv")