In [1]:
import pandas as pd
# pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, normalize
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, precision_score
from scipy.sparse import csr_matrix, csc_matrix, hstack
import xgboost as xgb
from xgboost import XGBClassifier
import pickle
import time
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
fi = pd.read_csv('../results/xgb_feature_importances_v1_top500_norm.csv', index_col=0).fillna(0)
# shift columns to align and remove temporal features (dow, hod)
fi = fi[['Banana'] + [col for col in fi.columns if 'order' not in col and col != 'Banana']]
print(f"fi shape: {fi.shape}")
display(fi.head())

fi shape: (500, 500)


Unnamed: 0,Banana,Bag of Organic Bananas,Organic Strawberries,Organic Baby Spinach,Organic Hass Avocado,Organic Avocado,Large Lemon,Strawberries,Limes,Organic Whole Milk,...,Dha Omega 3 Reduced Fat 2% Milk,Chopped Walnuts,Strawberry Rhubarb Yoghurt,Mixed Fruit Fruit Snacks,Boomchickapop Sea Salt Popcorn,Raspberry Preserves,Organic Original Almond Milk,Cage Free Large White Eggs,Organic Sliced Peaches,Blueberry Yoghurt
Banana,0.0,1.0,0.077852,0.031515,0.002404,0.167419,0.047204,0.097433,0.003745,0.048947,...,0.001153,0.000589,0.002246,0.00213,0.000917,0.0,0.000606,0.002655,0.000307,0.000892
Bag of Organic Bananas,1.0,0.0,0.10279,0.076618,0.186334,0.001238,0.000817,0.001298,0.001098,0.034429,...,0.002416,0.0,0.001629,0.001682,0.0,0.0,0.006032,0.0007,0.001338,0.003252
Organic Strawberries,0.298307,0.497814,0.0,0.385618,0.510642,0.195867,0.00608,0.448489,0.032256,0.385389,...,0.005958,0.0,0.004152,0.283666,0.0,0.005752,0.008063,0.010316,0.0,0.006492
Organic Baby Spinach,0.081398,0.2769,0.373226,0.0,0.614931,0.924437,0.2362,0.004541,0.05648,0.070472,...,0.005203,0.004255,0.005946,0.279079,0.003537,0.004241,0.007774,0.012499,0.005563,0.0
Organic Hass Avocado,0.007883,1.0,0.494464,0.333251,0.0,0.373023,0.014296,0.019024,0.248735,0.004879,...,0.006338,0.005443,0.007256,0.127408,0.0,0.006071,0.004405,0.0,0.003878,0.00733


In [3]:
# load dataset
with open('../data/top500_products.pickle', "rb") as input_file:
    data = pickle.load(input_file)
print(f"order data shape: {data.shape}")
print(f"order data columns:\n{data.columns}")
arr = csr_matrix(data.sparse.to_coo())
print(f"array shape: {arr.shape}")

order data shape: (2922905, 500)
order data columns:
Index(['24852', '13176', '21137', '21903', '47209', '47766', '47626', '16797',
       '26209', '27845',
       ...
       '35914', '12845', '17758', '26348', '25513', '4656', '6948', '11712',
       '34584', '40198'],
      dtype='object', length=500)
array shape: (2922905, 500)


In [4]:
n_iter = 0
tic = time.process_time()

I, J = fi.shape
mutuality_arr = np.zeros((I, J))

for i in range(I):
    for j in range(J):
        
        if i == j: continue

        n_iter += 1
        if n_iter % 1000 == 0:
            print(f"iteration {n_iter}...")
            toc = time.process_time()
            print(f"elapsed time: {toc-tic}")

        arr_ = arr[:, [i, j]].toarray()
        arr_ = arr_[arr_[:, 0] > 0]    # where product i is bought

        # mutuality: percentage of order with both product i and j
        mutuality = arr_[:, 1].mean()    # percentage of order where product j is bought
        mutuality_arr[i, j] = mutuality
        
# save mutuality
mutuality_df = pd.DataFrame(mutuality_arr, index=fi.index, columns=fi.columns)
mutuality_df.to_csv(f'../results/mutuality_top500.csv')

iteration 1000...
elapsed time: 80.875
iteration 2000...
elapsed time: 157.21875
iteration 3000...
elapsed time: 233.421875
iteration 4000...
elapsed time: 304.890625
iteration 5000...
elapsed time: 377.359375
iteration 6000...
elapsed time: 447.203125
iteration 7000...
elapsed time: 516.578125
iteration 8000...
elapsed time: 585.3125
iteration 9000...
elapsed time: 654.328125
iteration 10000...
elapsed time: 721.609375
iteration 11000...
elapsed time: 788.90625
iteration 12000...
elapsed time: 855.671875
iteration 13000...
elapsed time: 922.609375
iteration 14000...
elapsed time: 989.4375
iteration 15000...
elapsed time: 1056.703125
iteration 16000...
elapsed time: 1123.640625
iteration 17000...
elapsed time: 1190.46875
iteration 18000...
elapsed time: 1257.6875
iteration 19000...
elapsed time: 1324.4375
iteration 20000...
elapsed time: 1390.828125
iteration 21000...
elapsed time: 1457.65625
iteration 22000...
elapsed time: 1524.5625
iteration 23000...
elapsed time: 1591.265625
iterat

iteration 185000...
elapsed time: 12403.546875
iteration 186000...
elapsed time: 12469.03125
iteration 187000...
elapsed time: 12534.28125
iteration 188000...
elapsed time: 12599.6875
iteration 189000...
elapsed time: 12665.859375
iteration 190000...
elapsed time: 12731.40625
iteration 191000...
elapsed time: 12797.03125
iteration 192000...
elapsed time: 12862.25
iteration 193000...
elapsed time: 12927.546875
iteration 194000...
elapsed time: 12992.453125
iteration 195000...
elapsed time: 13057.75
iteration 196000...
elapsed time: 13122.734375
iteration 197000...
elapsed time: 13187.78125
iteration 198000...
elapsed time: 13253.125
iteration 199000...
elapsed time: 13318.140625
iteration 200000...
elapsed time: 13383.234375
iteration 201000...
elapsed time: 13447.953125
iteration 202000...
elapsed time: 13513.109375
iteration 203000...
elapsed time: 13578.15625
iteration 204000...
elapsed time: 13642.890625
iteration 205000...
elapsed time: 13707.875
iteration 206000...
elapsed time: 1