In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the transactions table from the local data directory.
df = pd.read_csv("data/transactions.csv")

In [4]:
# Drop the leftover index column from the CSV. `errors="ignore"` keeps this
# cell safe to re-run after the column has already been removed.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
1,26,33120,5,0,153404,prior,2,0,16,7.0,Organic Egg Whites,86,16,eggs,dairy eggs
2,120,33120,13,0,23750,prior,11,6,8,10.0,Organic Egg Whites,86,16,eggs,dairy eggs
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16,eggs,dairy eggs


# Target DataFrame 
* Built from the items in each user's last order (not implemented yet; see the TODO cell below)

In [13]:
# TODO

# Sort and Fill Columns
* `reordered` is our target column

In [6]:
# Columns kept for the sequence-building steps below.
columns = [
    "days_since_prior_order",
    "product_id",
    "order_number",
    "order_id",
    "reordered",
]

In [7]:
# Index by user, keep only the selected columns, and order rows by order_number.
df = (
    df
    .set_index("user_id")[columns]
    .sort_values(by="order_number")
)

In [8]:
# Missing day counts become 0, then the column is cast to integers.
df["days_since_prior_order"] = (
    df["days_since_prior_order"]
    .fillna(0)
    .astype('int64')
)

In [9]:
# Preview after indexing by user_id and sorting by order_number.
df.head()

Unnamed: 0_level_0,days_since_prior_order,product_id,order_number,order_id,reordered
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
137411,0,4085,1,2974797,0
48176,0,44560,1,1664689,0
92198,0,44560,1,1661625,0
198405,0,44560,1,1655285,0
159896,0,44560,1,1647065,0


# Pad Days Since Last Order

In [10]:
def pad_order(user_orders, user_id, row):
    """Append one purchase to a user's sequence, preceded by zero padding.

    One ``0`` is appended for every day in ``row["days_since_prior_order"]``,
    followed by ``row["product_id"]``. The user's list is mutated in place.

    Args:
        user_orders (dict): maps user_id to that user's list of entries.
            A ``user_id`` that is not present yet gets a fresh empty list.
        user_id (hashable): key of the user whose list is extended.
        row (mapping or dataframe row): must provide
            ``"days_since_prior_order"`` and ``"product_id"``.

    Returns:
        dict: the same ``user_orders`` object, updated.
    """
    # Coerce to a plain int so float-typed day counts (e.g. 8.0) are accepted.
    days = int(row["days_since_prior_order"])

    # setdefault avoids a KeyError when the caller has not pre-created the list.
    orders = user_orders.setdefault(user_id, [])
    orders.extend([0] * days)
    orders.append(row["product_id"])

    return user_orders

In [11]:
# Count missing values per column.
df.isna().sum()

days_since_prior_order    0
product_id                0
order_number              0
order_id                  0
reordered                 0
dtype: int64

In [12]:
# Build a dict mapping each user to a list of entries:
#         {"user1": [12, 0, 12, 0, 0, ...],
#          "user2": ...
#         }
# NOTE(review): padding is applied per product row, so an order with k
# products inserts k * days_since_prior_order zeros — confirm this is intended.

user_orders = {}

# Iterate over plain column values instead of df.iterrows(): iterrows builds a
# Series for every row, which is likely why the earlier run had to be interrupted.
for user_id, days, product_id in zip(
    df.index, df["days_since_prior_order"], df["product_id"]
):
    if user_id not in user_orders:
        user_orders[user_id] = []

    # pad_order only reads these two keys, so a small dict stands in for the row.
    row = {"days_since_prior_order": days, "product_id": product_id}
    user_orders = pad_order(user_orders, user_id, row)


KeyboardInterrupt: 

In [None]:
# Show a bounded preview (first 5 users, first 20 entries each) rather than
# dumping the whole dict into the notebook output.
preview_ids = list(user_orders)[:5]
{uid: user_orders[uid][:20] for uid in preview_ids}

# Pad to Largest Time Step


In [None]:
# user_id whose sequence has the most entries.
longest_id = max(user_orders.keys(), key=(lambda k: len(user_orders[k])))
# Length of that sequence. The lookup previously used the undefined name
# `longest`, which raised NameError.
longest_step_size = len(user_orders[longest_id])

In [None]:
# Small self-contained check of the zero-padding approach. `a` was previously
# undefined, so this cell failed on a fresh kernel.
a = np.array([12, 0, 12])
a_padded = np.concatenate([a, np.zeros(10)], axis=0)
a_padded

In [None]:
# Right-pad every user's sequence with zeros up to longest_step_size.
# Rows follow the iteration order of user_orders.
order_matrix = [
    np.concatenate(
        (np.array(sequence), np.zeros(longest_step_size - len(sequence))),
        axis=0,
    )
    for sequence in user_orders.values()
]

# Save as CSV

In [None]:
# Stack the per-user rows into a single array (rebinding order_matrix), then
# wrap it in a DataFrame with the default positional index and column labels.
order_matrix = np.array(order_matrix)
padded_orders_df = pd.DataFrame(order_matrix)

In [None]:
# add reordered target

In [None]:
# `df` has one row per purchased product (user_id repeats in its index), while
# `padded_orders_df` has one row per user in `user_orders` iteration order, so
# assigning df["reordered"] directly cannot align. Reduce to one value per
# user first, then attach it positionally.
# NOTE(review): `.last()` takes the `reordered` flag of each user's final row
# in the order_number-sorted frame — confirm this is the intended target.
user_target = df.groupby(level=0)["reordered"].last()
padded_orders_df["target"] = user_target.reindex(list(user_orders)).to_numpy()

In [None]:
# Write the padded matrix (plus target column) to disk. With pandas' default
# settings the index is written as an unnamed first column.
padded_orders_df.to_csv("order-matrix.csv")