* `mergedProKey.csv` produced by Prep-Data.py

* based on Ghazal's preprocessing script `onlineLDA-productlevel.py`

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
# DataRareProdMergeOrderTime.csv replaces products appearing fewer than 20 times with aisle
userIDprodName_df = pd.read_csv("data/DataRareProdMergOrderTime.csv", encoding = "ISO-8859-1")

In [3]:
# integer encodes product
newProdName = pd.factorize(userIDprodName_df['product_name'])
prodID = newProdName[0]
prodIDindex = newProdName[1]

In [4]:
userIDprodNameprodID_df = pd.concat([userIDprodName_df.reset_index(drop=True), pd.DataFrame(prodID)], axis=1)
userIDprodNameprodID_df.columns = ['n',
 'order_id',
 'product_id',
 'user_id',
 'product_name',
 'aisle',
 'product_id_tsfm',
 'days_since_prior_order',
 'order_number',
 'nn']


In [5]:
user_idx = userIDprodNameprodID_df['user_id'].unique()
ndocs = userIDprodNameprodID_df['user_id'].max()
nwords = userIDprodNameprodID_df['product_id_tsfm'].max()
print("number of documents (users) {} and words (products) {}".format(ndocs, nwords))

number of documents (users) 206209 and words (products) 36054


In [6]:
group_by_columns = ['user_id','order_id', 'product_id_tsfm','order_number','days_since_prior_order']
countOrder_series = userIDprodNameprodID_df.groupby(group_by_columns).size()
new_df = countOrder_series.to_frame(name = 'size').reset_index()

In [7]:
new_df.head()

Unnamed: 0,user_id,order_id,product_id_tsfm,order_number,days_since_prior_order,size
0,1,431534,1589,5,28.0,1
1,1,431534,4027,5,28.0,1
2,1,431534,4087,5,28.0,1
3,1,431534,4090,5,28.0,1
4,1,431534,4102,5,28.0,1


In [8]:
new_df.shape

(30352327, 6)

In [9]:
# remove users who only make one order
new_df = new_df[new_df.groupby("user_id").user_id.transform(len) > 1]

In [10]:
new_df.head()

Unnamed: 0,user_id,order_id,product_id_tsfm,order_number,days_since_prior_order,size
0,1,431534,1589,5,28.0,1
1,1,431534,4027,5,28.0,1
2,1,431534,4087,5,28.0,1
3,1,431534,4090,5,28.0,1
4,1,431534,4102,5,28.0,1


In [11]:
# number of unique products
new_df["product_id_tsfm"].unique().shape

(36055,)

# Get Target

In [12]:
target_df = new_df.groupby("user_id").last()

In [13]:
# remove target from original df, based on index ["user_id", "order_id"]
target_df = target_df.set_index("order_id", append=True)

In [14]:
new_df = new_df.set_index(["user_id", "order_id"])
new_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,product_id_tsfm,order_number,days_since_prior_order,size
user_id,order_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,431534,1589,5,28.0,1
1,431534,4027,5,28.0,1
1,431534,4087,5,28.0,1
1,431534,4090,5,28.0,1
1,431534,4102,5,28.0,1


In [15]:
# remove target from new_df
train_df = new_df.drop(target_df.index, axis=0)

In [16]:
# remove indices
target_df = target_df.reset_index()
train_df = train_df.reset_index()

In [17]:
target_df.shape

(206209, 6)

206209

In [22]:
assert len(train_df["user_id"].unique()) == len(target_df["user_id"].unique())
assert len(target_df["user_id"].unique()) == len(target_df["order_id"].unique())
print(len(target_df["user_id"].unique()))

206209


In [30]:
train_df.shape

(28302159, 6)

# Save DataFrames

In [49]:
train_df.to_csv("data/train.csv")
target_df.to_csv("data/target.csv")

## Compress Matrix based on size (number of times a product appears in an order)

In [24]:
order_prod_matrix = csr_matrix((train_df['size'], (train_df['order_id'], train_df['product_id_tsfm'])), dtype=np.int32)

In [25]:
# rows: user, order 
# columns: product counts
order_prod_matrix.shape

(3420339, 36055)

In [26]:
target_matrix = csr_matrix((target_df["size"], (target_df["user_id"], target_df["product_id_tsfm"])), dtype=np.int32)

In [27]:
target_matrix.shape

(206210, 36055)

In [42]:
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])

In [43]:
a = csr_matrix((data, (row, col)))

In [44]:
a.toarray()

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]], dtype=int64)

In [45]:
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([4, 2, 4, 10, 5, 6])

In [46]:
b = csr_matrix((data, (row, col)))

In [47]:
b.toarray()

array([[ 4,  0,  2],
       [ 0,  0,  4],
       [10,  5,  6]], dtype=int64)

In [48]:
a.multiply(b).toarray()

array([[ 4,  0,  4],
       [ 0,  0, 12],
       [40, 25, 36]], dtype=int64)

# Save Order Matrix and Target

In [28]:
np.save("data/order_prod_matrix.npy", order_prod_matrix)

In [29]:
np.save("data/target_matrix.npy", target_matrix)