In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read in Data

In [3]:
# Load the transaction line items. dtype=str reads every column as text, so
# pandas does no numeric inference (presumably to keep leading zeros in ids).
raw = pd.read_csv("../raw_data/ncr/items_transactions.csv", dtype=str)

In [7]:
# Count rows with a missing dept_num; the groupby count below relies on this being 0.
raw['dept_num'].isna().sum()

0

## Compute Co-occurrence Matrix

In [8]:
# Number of rows per (transaction, item) pair; count() tallies the non-null
# dept_num values in each group.
temp = raw.groupby(['global_transaction_id','item_id'])['dept_num'].count()

In [9]:
# Pivot item_id into columns: one row per transaction, one column per item.
# Pairs that never occur come out as NaN and are filled with 0.
# NOTE(review): this cell rebinds `temp`, so re-running it without first
# re-running the groupby cell will not reproduce this frame.
temp = temp.unstack().fillna(0)

In [10]:
# Plain ndarray of the pivoted counts (rows: transactions, columns: items).
df_np = temp.to_numpy()

In [11]:
# Total item count per transaction, kept as an (n_transactions, 1) column.
num_items = df_np.sum(axis=1, keepdims=True)

In [12]:
# different weights on pairwise count according to transaction size
def weight_assign(a, cutoffs=(5, 10, 20), tier_weights=(1.0, 0.9, 0.8, 0.7)):
    """Return the weight for one transaction based on its size.

    Parameters
    ----------
    a : sequence
        1-D slice whose first element is the transaction size (this is what
        ``np.apply_along_axis`` passes for each row of an ``(n, 1)`` array).
    cutoffs : sequence of numbers, optional
        Ascending, inclusive upper bounds of the size tiers.
    tier_weights : sequence of float, optional
        One weight per tier plus a final weight for sizes above the last
        cutoff, so ``len(tier_weights) == len(cutoffs) + 1``.

    Returns
    -------
    float
        The weight of the first tier whose cutoff is >= the size, or the last
        weight when the size exceeds every cutoff (or is NaN).

    Raises
    ------
    ValueError
        If ``tier_weights`` does not have exactly one more entry than
        ``cutoffs``.
    """
    if len(tier_weights) != len(cutoffs) + 1:
        raise ValueError("tier_weights must have exactly len(cutoffs) + 1 entries")

    size = a[0]
    # Cutoffs are ascending, so the first bound that holds identifies the tier;
    # no explicit lower-bound comparison is needed.
    for upper_bound, weight in zip(cutoffs, tier_weights):
        if size <= upper_bound:
            return weight
    return tier_weights[-1]

# One weight per transaction row, reshaped to an (n_transactions, 1) column.
weights = np.apply_along_axis(weight_assign, axis=1, arr=num_items).reshape(-1,1)

In [13]:
# Scale every transaction row by its weight (the column vector broadcasts
# across the item columns).
df_np_weighted = df_np * weights

In [14]:
# Weighted item-by-item co-occurrence: entry (i, j) sums, over transactions,
# count_i * weight * count_j.
co_occ = df_np.T @ df_np_weighted

In [15]:
# Sanity check: the matrix should be square, (n_items, n_items).
co_occ.shape

(10121, 10121)

In [16]:
# Preview the matrix (NumPy abbreviates large arrays when displaying them).
co_occ

array([[8.58970e+03, 2.25720e+03, 1.08100e+02, ..., 0.00000e+00,
        2.50000e+00, 0.00000e+00],
       [2.25720e+03, 1.85791e+04, 2.12600e+02, ..., 0.00000e+00,
        9.10000e+00, 2.80000e+00],
       [1.08100e+02, 2.12600e+02, 8.98000e+02, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 7.49000e+01,
        0.00000e+00, 0.00000e+00],
       [2.50000e+00, 9.10000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.11000e+01, 0.00000e+00],
       [0.00000e+00, 2.80000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.94800e+02]])

## Compute the Substitute Score and Some Experiments

In [17]:
# Item ids in column order of `temp`; position k in this list is row/column k
# of co_occ, since co_occ was built from temp.to_numpy().
item_id_list = temp.columns.tolist()

### substitute score function

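$\text{score}(i, j) = \frac{X_i \cdot X_j}{\lVert X_i \rVert \, \lVert X_j \rVert}\; e^{-\frac{C_{ij}}{\min(C_{ii},\, C_{jj})}}$

where $C$ is the weighted co-occurrence matrix and $X_i$ is row $i$ of $C$ with the entries for items $i$ and $j$ removed.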

In [18]:
# compute substitute score
def sub_score(item1, item2):
    """Return the substitute score of two items (higher = better substitute).

    The score is the cosine similarity of the two items' rows of ``co_occ``,
    with the entries for the two items themselves removed, divided by
    ``exp(C_ij / min(C_ii, C_jj))`` where ``C`` is ``co_occ``. The division
    lowers the score of pairs with a large mutual co-occurrence weight.

    Parameters
    ----------
    item1, item2 : item ids
        Both must be present in the module-level ``item_id_list``.

    Returns
    -------
    float
        The penalised cosine similarity, or ``0.0`` when either reduced row
        is all zeros. In that case the cosine is undefined (0/0) and the
        computation would otherwise yield NaN with a RuntimeWarning.

    Raises
    ------
    ValueError
        If either id is not in ``item_id_list`` (raised by ``list.index``).
    """
    # NOTE: list.index is a linear scan, so each call costs O(n_items) on top
    # of the vector arithmetic below.
    idx1 = item_id_list.index(item1)
    idx2 = item_id_list.index(item2)

    penalty = np.exp(co_occ[idx1, idx2] / min(co_occ[idx1, idx1], co_occ[idx2, idx2]))

    # Drop the self and mutual entries so only co-occurrence with third-party
    # items is compared.
    arr_item1 = np.delete(co_occ[idx1, :], [idx1, idx2])
    arr_item2 = np.delete(co_occ[idx2, :], [idx1, idx2])

    norm_product = np.linalg.norm(arr_item1) * np.linalg.norm(arr_item2)
    if norm_product == 0:
        # At least one item never co-occurs with any third item.
        return 0.0
    return np.dot(arr_item1, arr_item2) / norm_product / penalty

In [19]:
# Item metadata, every column read as text; get_top_substitute left-joins it
# onto the scores by 'item_id'.
item_des =  pd.read_csv("../raw_data/ncr/items_descriptions.csv", dtype=str)

In [20]:
# Hand-picked item ids, presumably for spot-checking substitutes.
# NOTE(review): not referenced by any cell visible here; confirm it is still needed.
sample_item = ['00004900000044',
'00004900000045',
'00004900005016',
'00001200000129',
'00001200000559',
'00001200017174',
'00004900000764',
'00001300000212']

In [23]:
def get_top_substitute(target, item_id_list, top=20):
    """Rank candidate items by their substitute score against ``target``.

    Parameters
    ----------
    target : item id
        The item to find substitutes for.
    item_id_list : list
        Candidate item ids to score.
    top : int, optional
        Number of substitutes wanted. Up to ``top + 1`` rows are returned,
        because the target itself is scored 1 and so normally occupies the
        first row.

    Returns
    -------
    pandas.DataFrame
        The candidates' ``item_id`` and ``sub_score`` joined with the columns
        of the module-level ``item_des``, sorted by descending score. Rows
        with fewer than 4 non-missing values are dropped, which removes
        items that have no match in ``item_des``.
    """
    scores = [1 if item == target else sub_score(target, item) for item in item_id_list]
    ranked = (
        pd.DataFrame({'item_id': item_id_list, 'sub_score': scores})
        .merge(item_des, on='item_id', how='left')
        .sort_values(by=['sub_score'], ascending=False)
        # Only `thresh` is passed: pandas rejects `how` and `thresh` together
        # from 2.0 on, and earlier versions ignored `how` when `thresh` was set.
        .dropna(axis=0, thresh=4)
    )
    return ranked.iloc[:top + 1, :]

In [35]:
# Fixed example item so the result is reproducible on every run. To explore a
# different item, replace the id with any entry of item_id_list.
target = '00001300000466'
temp_df = get_top_substitute(target, item_id_list, 10)
temp_df

  return np.dot(arr_item1, arr_item2)/(np.linalg.norm(arr_item1)*np.linalg.norm(arr_item2)) / penalty


Unnamed: 0,item_id,sub_score,description,ecomm_description,category,item_type,upc
926,1300000466,1.0,HEINZ KETCHUP,"HEINZ KETCHUP, 38 OZ",1260102,0,13000004664
1673,2500002716,0.843969,MINUTE MAID BERRY PUNCH,"Minute Maid Juice, Berry Punch, 128 Oz",7142101,0,25000027161
1038,1480051324,0.839684,MOTTS CLAMATO JUICE-PLASTIC,"Clamato Tomato Cocktail From Concentrate, 32 oz",1500104,0,14800513240
1672,2500002653,0.838669,MINUTE MAID FRUIT PUNCH,"Minute Maid All Natural Fruit Punch, 128 Oz",7142101,0,25000026539
2126,3120020300,0.827934,OCEAN SPRAY CRAN JCE COCKTAIL,"Ocean Spray Juice Cocktail, Cranberry, 101.4 oz",1500102,0,31200203007
1680,2500004498,0.826004,SIMPLY LEMONADE,"SIMPLY LEMONADE, 52 OZ",7142101,0,25000044984
4630,5020000880,0.82553,PALLET SUNNY D FLORIDA CITRUS PUNCH,"Sunny Delight Beverage, Florida Style, 128 Ounce",7142101,0,50200008801
1036,1480051275,0.822481,MOTTS CLAMATO PICANTE-PLASTIC,"Mott's Inc Clamato Picante, 32 oz",1500104,0,14800512755
9942,89954100101,0.820076,CAMARONAZO PICANTE,Camaronazo Picante / Spicy Tomato Shrimp Cocktail,1500104,0,899541001016
1037,1480051276,0.819795,MOTTS CLAMATO TOMATO PICANTE,"Clamato, Tomato Cocktail, Picante, 64 Ounce",1500104,0,14800512762
