In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
# Load the transactions table; the first CSV column is used as the frame's index.
df = pd.read_csv("data/transactions.csv", index_col=0)

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

# Sort and Fill Columns

In [None]:
# Columns kept when the frame is re-indexed and sorted below.
columns = ["days_since_prior_order", "product_id", "order_number"]

In [None]:
# Missing gaps become 0 days, then the column is stored as int64.
df["days_since_prior_order"] = (
    df["days_since_prior_order"]
    .fillna(0)
    .astype('int64')
)

In [None]:
# Index by user so that iterating over rows yields user_id as the row label.
df = df.set_index("user_id")
# Keep only the needed columns and order rows by order_number. A stable sort
# (mergesort) keeps rows that share an order_number in their original relative
# order; the default quicksort does not guarantee that, which would change
# which product ends up last in each user's sequence.
df = df[columns].sort_values(by="order_number", kind="mergesort")

# Pad Orders with Days Since Last Order

In [None]:
def add_order(user_orders, row):
    """Append one product to a user's sequence, preceded by zero padding.

    One ``0`` is appended for every day in ``row["days_since_prior_order"]``,
    and then ``row["product_id"]`` is appended.

    Args:
        user_orders (list): entries recorded for the user so far. The list is
            modified in place.
        row (mapping or dataframe row): must provide the keys
            "days_since_prior_order" and "product_id".

    Returns:
        user_orders (list): the same list object, after padding and appending.

    Raises:
        ValueError: if the day count is NaN and cannot be converted to int.
    """
    # int() accepts integral floats as well as ints. A row taken from a frame
    # with mixed int/float columns is upcast to float, which range() or list
    # repetition would reject. Non-integral values are truncated toward zero.
    days = int(row["days_since_prior_order"])

    # A zero or negative day count adds no padding.
    user_orders.extend([0] * days)
    user_orders.append(row["product_id"])

    return user_orders

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Build a hashtable (dict) that maps each user_id to that user's padded list:
#         {"user1": [12, 0, 12, 0, 0, ...],
#           "user2": ...
#        }

users_orders = {}

# Rows are grouped by their index label (user_id), so rows of different users
# may be interleaved; only the relative order of each user's own rows matters.
# NOTE(review): add_order pads zeros for every row, so if several rows belong
# to the same order the padding is repeated per row - confirm this is intended.
# `total` lets tqdm show a percentage and ETA; a bare generator has no length.
for user_id, row in tqdm(df.iterrows(), total=len(df)):
    # add_order appends to the user's list in place and returns that list.
    users_orders[user_id] = add_order(users_orders.setdefault(user_id, []), row)

# Pad to Largest Time Step


In [None]:
# Key of the user whose padded sequence has the most entries.
longest_id = max(users_orders, key=lambda uid: len(users_orders[uid]))
# One less than that length: the final entry of each sequence is held out as the target.
longest_sequence = users_orders[longest_id]
longest_step_size = len(longest_sequence) - 1

In [None]:
# NOTE: the target array (the last entry of each user's sequence) is collected as `last_orders` in the next cell.

In [None]:
order_matrix = []  # one right-padded feature row per user
last_orders = []   # held-out final entry of each user's sequence (the target)
user_ids = []      # row labels, in the same order as the two lists above

for user_id in tqdm(users_orders):
    orders = np.array(users_orders[user_id])
    orders_up_to_last = orders[:-1]
    last_order = orders[-1]

    # Right-pad the features (everything except the final entry) with zeros up
    # to longest_step_size. The padding reuses the dtype of `orders` so the
    # ids are not silently converted to floats.
    zeros = np.zeros(longest_step_size - len(orders_up_to_last), dtype=orders.dtype)
    # Concatenate the truncated sequence, not `orders`: including the final
    # entry would leak the target into the feature row and make every row one
    # element longer than longest_step_size.
    orders_up_to_last_padded = np.concatenate((orders_up_to_last, zeros), axis=0)
    order_matrix.append(orders_up_to_last_padded)

    last_orders.append(last_order)
    user_ids.append(user_id)

# Save as CSV

In [None]:
# One row per user, labelled by user_id.
padded_orders_df = pd.DataFrame(order_matrix, index=user_ids)

In [None]:
# Preview the first rows of the padded matrix.
padded_orders_df.head()

In [None]:
# The frame's index holds the user_id values and is written out with the data.
padded_orders_df.to_csv("data/order-matrix-full.csv")

In [None]:
# Single-column frame of the held-out last entries, labelled by user_id.
last_orders_df = pd.DataFrame(last_orders, index=user_ids)

In [None]:
# Write the target values next to the feature matrix file.
last_orders_df.to_csv("data/last-order-matrix-full.csv")