In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import gc
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz

In [3]:
test = False

In [4]:
order_products = pd.concat([pd.read_csv('../data/order_products__prior.csv'),
                            pd.read_csv('../data/order_products__train.csv')])
products = pd.read_csv('../data/products.csv')
departments = pd.read_csv('../data/departments.csv')

In [5]:
df = pd.merge(order_products, products, on='product_id')

#cleanup
del order_products
del products
del departments
gc.collect()

df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16


In [6]:
# top 10 product categories
top10_cat = df.groupby('department_id')['department_id'].count().sort_values(ascending=False)[:10].index.tolist()
top10_cat

[4, 16, 19, 7, 1, 13, 3, 15, 20, 9]

In [7]:
# filtering top 10 categories
print(f"n rows before filtering: {len(df)}")
df = df[df['department_id'].isin(top10_cat)]
print(f"n rows after filtering: {len(df)}")

n rows before filtering: 33819106
n rows after filtering: 29964627


In [8]:
# taking relevant columns and add values
df = df[['order_id', 'product_id']]
df['values'] = 1

In [9]:
# product ids from np.unique is already sorted
product_ids = np.unique(df['product_id'])
np.sort(product_ids) == product_ids

array([ True,  True,  True, ...,  True,  True,  True])

In [8]:
if test: df = df.iloc[:1000]

if 'csr' in globals():
    del csr

chunk_size = 50000    # number of distinct order_id processed per chunk
n_iter = (df['order_id'].max() // chunk_size) + 1
col_names = np.unique(df['product_id'])
index = np.append([])

for i in range(n_iter):
    
    print(f"iteration {i+1}/{n_iter}...", end='')
    
    df_chunk = df[(df['order_id'] > i*chunk_size) & (df['order_id'] <= (i+1)*chunk_size)]
    
    # pivot
    print('pivoting...', end='')
    df_chunk = df_chunk.pivot_table(values='values', index='order_id', columns='product_id').fillna(0)
    
    # add columns(product_id) of zeros if product_id not present
    for col in col_names:
        if col not in df_chunk.columns:
            df_chunk[col] = 0
    
    # sort columns
    df_chunk = df_chunk[col_names.tolist()]
    
    # append index
    index.append(df_chunk.index.to_numpy())
    
    # instantiate csr or stack chunk on top of existing csr
    print('converting to sparse...')
    try: csr
    except: csr = csr_matrix(df_chunk.values, dtype=np.int8)
    else: csr = vstack((csr, csr_matrix(df_chunk.values, dtype=np.int8)))

iteration 1/343...pivoting...

  df_chunk[col] = 0


converting to sparse...
iteration 2/343...pivoting...converting to sparse...
iteration 3/343...pivoting...converting to sparse...
iteration 4/343...pivoting...converting to sparse...
iteration 5/343...pivoting...converting to sparse...
iteration 6/343...pivoting...converting to sparse...
iteration 7/343...pivoting...converting to sparse...
iteration 8/343...pivoting...converting to sparse...
iteration 9/343...pivoting...converting to sparse...
iteration 10/343...pivoting...converting to sparse...
iteration 11/343...pivoting...converting to sparse...
iteration 12/343...pivoting...converting to sparse...
iteration 13/343...pivoting...converting to sparse...
iteration 14/343...pivoting...converting to sparse...
iteration 15/343...pivoting...converting to sparse...
iteration 16/343...pivoting...converting to sparse...
iteration 17/343...pivoting...converting to sparse...
iteration 18/343...pivoting...converting to sparse...
iteration 19/343...pivoting...converting to sparse...
iteration 20

iteration 152/343...pivoting...converting to sparse...
iteration 153/343...pivoting...converting to sparse...
iteration 154/343...pivoting...converting to sparse...
iteration 155/343...pivoting...converting to sparse...
iteration 156/343...pivoting...converting to sparse...
iteration 157/343...pivoting...converting to sparse...
iteration 158/343...pivoting...converting to sparse...
iteration 159/343...pivoting...converting to sparse...
iteration 160/343...pivoting...converting to sparse...
iteration 161/343...pivoting...converting to sparse...
iteration 162/343...pivoting...converting to sparse...
iteration 163/343...pivoting...converting to sparse...
iteration 164/343...pivoting...converting to sparse...
iteration 165/343...pivoting...converting to sparse...
iteration 166/343...pivoting...converting to sparse...
iteration 167/343...pivoting...converting to sparse...
iteration 168/343...pivoting...converting to sparse...
iteration 169/343...pivoting...converting to sparse...
iteration 

iteration 301/343...pivoting...converting to sparse...
iteration 302/343...pivoting...converting to sparse...
iteration 303/343...pivoting...converting to sparse...
iteration 304/343...pivoting...converting to sparse...
iteration 305/343...pivoting...converting to sparse...
iteration 306/343...pivoting...converting to sparse...
iteration 307/343...pivoting...converting to sparse...
iteration 308/343...pivoting...converting to sparse...
iteration 309/343...pivoting...converting to sparse...
iteration 310/343...pivoting...converting to sparse...
iteration 311/343...pivoting...converting to sparse...
iteration 312/343...pivoting...converting to sparse...
iteration 313/343...pivoting...converting to sparse...
iteration 314/343...pivoting...converting to sparse...
iteration 315/343...pivoting...converting to sparse...
iteration 316/343...pivoting...converting to sparse...
iteration 317/343...pivoting...converting to sparse...
iteration 318/343...pivoting...converting to sparse...
iteration 

In [10]:
# save csr matrix
save_npz('../data_shared/top10_categories.npz', csr)

In [12]:
# check if number of orders is equivalen to sparse's rows and number of unique product_ids is equivalent to sparse's columns
print(f"# rows is consistent: {len(np.unique(df['order_id'])) == csr.shape[0]}")
print(f"# columns is consistent: {len(np.unique(df['product_id'])) == csr.shape[1]}")

# rows is consistent: True
# columns is consistent: True


In [28]:
# save sparse dataframe to parquet
df_sparse = pd.DataFrame.sparse.from_spmatrix(csr, 
                                              index=index.astype(str).tolist(), 
                                              columns=col_names.astype(str).tolist())
df_sparse.to_pickle('../data_shared/top10_categories.pickle')