In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import gc
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz

In [2]:
# Smoke-test switch: when True, the chunking cell further down truncates df to its first 1000 rows
test = False

In [3]:
# Stack the "prior" and "train" order-line tables into one frame, then load
# the product and department lookup tables.
order_products = pd.concat(
    [pd.read_csv(csv_path)
     for csv_path in ('../data/order_products__prior.csv',
                      '../data/order_products__train.csv')]
)
products = pd.read_csv('../data/products.csv')
departments = pd.read_csv('../data/departments.csv')

In [4]:
# Attach the product attributes to every order line (pandas' default inner join on product_id)
df = order_products.merge(products, on='product_id')

# the source tables are not used past this point; drop them and reclaim memory
del order_products, products, departments
gc.collect()

df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16


In [6]:
# ids of the `num_products` products that appear on the most order lines, most frequent first
num_products = 500
order_line_counts = df.groupby('product_id').size()
top_products = order_line_counts.sort_values(ascending=False).iloc[:num_products].index.tolist()
top_products

[24852,
 13176,
 21137,
 21903,
 47209,
 47766,
 47626,
 16797,
 26209,
 27845,
 27966,
 22935,
 24964,
 45007,
 39275,
 49683,
 28204,
 5876,
 40706,
 8277,
 4920,
 30391,
 45066,
 42265,
 44632,
 49235,
 19057,
 4605,
 21616,
 37646,
 17794,
 27104,
 30489,
 31717,
 27086,
 46979,
 8518,
 44359,
 28985,
 41950,
 26604,
 5077,
 34126,
 22035,
 39877,
 43352,
 35951,
 10749,
 19660,
 9076,
 24184,
 21938,
 43961,
 34969,
 48679,
 46667,
 12341,
 25890,
 31506,
 5450,
 39928,
 24838,
 22825,
 5785,
 35221,
 28842,
 33731,
 8424,
 27521,
 33198,
 8174,
 44142,
 20114,
 27344,
 11520,
 29487,
 18465,
 28199,
 15290,
 46906,
 9839,
 27156,
 3957,
 43122,
 23909,
 34358,
 4799,
 9387,
 16759,
 196,
 42736,
 4210,
 38689,
 41787,
 47144,
 41220,
 7781,
 33000,
 20995,
 21709,
 40604,
 19678,
 30233,
 34243,
 37687,
 24489,
 42828,
 432,
 6184,
 5479,
 16185,
 42768,
 17948,
 33754,
 8193,
 19348,
 26369,
 42585,
 14992,
 14947,
 22963,
 28849,
 8021,
 1463,
 25659,
 21405,
 5025,
 41844,
 43

In [7]:
# keep only the order lines whose product is one of the top products
print(f"n rows before filtering: {len(df)}")
df = df.loc[lambda frame: frame['product_id'].isin(top_products)]
print(f"n rows after filtering: {len(df)}")

n rows before filtering: 33819106
n rows after filtering: 14485274


In [8]:
# Keep only the two pivot keys and add a constant indicator column.
# .assign() returns a new frame, so the column is never written into a frame
# derived from the filtered slice (the pattern that triggers pandas'
# SettingWithCopyWarning with `df['values'] = 1`).
df = df[['order_id', 'product_id']].assign(values=1)

In [10]:
# distinct product ids in ascending order
product_ids = np.sort(df['product_id'].unique())

In [12]:
# Build an (orders x products) 0/1 matrix in CSR form.
# A dense pivot of the whole frame would not fit in memory, so the frame is
# pivoted one order_id range at a time and each dense chunk is converted to
# sparse immediately. The chunks are stacked once at the end.
if test: df = df.iloc[:1000]

chunk_size = 200000    # width of the order_id range processed per chunk
# ceil(max order_id / chunk_size); zero passes when there is nothing to process
n_iter = 0 if df.empty else -(-int(df['order_id'].max()) // chunk_size)
col_names = np.unique(df['product_id'])

csr_chunks = []      # one int8 CSR block per non-empty order_id range
index_chunks = []    # the order_ids labelling the rows of each block

for i in range(n_iter):

    print(f"iteration {i+1}/{n_iter}...", end='')

    # order_ids in the half-open range (i*chunk_size, (i+1)*chunk_size]
    df_chunk = df[(df['order_id'] > i*chunk_size) & (df['order_id'] <= (i+1)*chunk_size)]
    if df_chunk.empty:
        # nothing to pivot for this id range; it contributes no rows
        print('no orders in range, skipping')
        continue

    # pivot
    print('pivoting...', end='')
    df_chunk = (
        df_chunk
        .pivot_table(values='values', index='order_id', columns='product_id')
        # one column per product, in col_names order; products absent from this chunk become all-zero columns
        .reindex(columns=col_names, fill_value=0)
        .fillna(0)
    )

    # remember which order_id each row of this block belongs to
    index_chunks.append(df_chunk.index.to_numpy())

    print('converting to sparse...')
    csr_chunks.append(csr_matrix(df_chunk.values, dtype=np.int8))

if csr_chunks:
    # concatenating the integer id arrays keeps them integral (np.append onto an
    # empty float array would turn every order_id into a float)
    index = np.concatenate(index_chunks)
    # a single stack at the end avoids re-copying the accumulated matrix on every iteration
    csr = vstack(csr_chunks, format='csr', dtype=np.int8)
else:
    index = np.array([], dtype=np.int64)
    csr = csr_matrix((0, len(col_names)), dtype=np.int8)

# the per-chunk blocks duplicate what is now held in `csr` / `index`
del csr_chunks, index_chunks

iteration 1/18...pivoting...converting to sparse...
iteration 2/18...pivoting...converting to sparse...
iteration 3/18...pivoting...converting to sparse...
iteration 4/18...pivoting...converting to sparse...
iteration 5/18...pivoting...converting to sparse...
iteration 6/18...pivoting...converting to sparse...
iteration 7/18...pivoting...converting to sparse...
iteration 8/18...pivoting...converting to sparse...
iteration 9/18...pivoting...converting to sparse...
iteration 10/18...pivoting...converting to sparse...
iteration 11/18...pivoting...converting to sparse...
iteration 12/18...pivoting...converting to sparse...
iteration 13/18...pivoting...converting to sparse...
iteration 14/18...pivoting...converting to sparse...
iteration 15/18...pivoting...converting to sparse...
iteration 16/18...pivoting...converting to sparse...
iteration 17/18...pivoting...converting to sparse...
iteration 18/18...pivoting...converting to sparse...


In [13]:
# persist the order x product matrix in scipy's .npz format
save_npz(file=f"../data/top{num_products}_products.npz", matrix=csr)

In [14]:
# sanity check: the matrix should have one row per distinct order and one column per distinct product
n_rows, n_cols = csr.shape
print(f"# rows is consistent: {df['order_id'].nunique() == n_rows}")
print(f"# columns is consistent: {df['product_id'].nunique() == n_cols}")

# rows is consistent: True
# columns is consistent: True


In [15]:
# Wrap the matrix in a sparse DataFrame labelled by order id (rows) and
# product id (columns), then save it as a pickle (not parquet).
# `index` can be float64 (np.append onto an empty float array upcasts the ids),
# which would stringify as '2.0' instead of '2'; cast to int first. The cast is
# a no-op when the array is already integral.
df_sparse = pd.DataFrame.sparse.from_spmatrix(
    csr,
    index=index.astype(np.int64).astype(str).tolist(),
    columns=col_names.astype(str).tolist(),
)
df_sparse.to_pickle(f'../data/top{num_products}_products.pickle')

In [19]:
# (rows, columns) of the sparse frame; it takes its shape from the csr matrix it wraps
df_sparse.shape

(2922905, 500)