In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.figure_factory as ff
import networkx as nx
import plotly.express as px
import matplotlib.patches as mpatches
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import Levenshtein

In [2]:
# Load the three datasets into Pandas DataFrames
df_selected_products = pd.read_csv('selected_products.csv')
df_selected_products_frozen = pd.read_csv('selected_products_frozen.csv')
df_selected_products_ref = pd.read_csv('selected_products_ref.csv')

# Add a new column to each DataFrame to specify the source
df_selected_products['Source'] = 'Other'
df_selected_products_frozen['Source'] = 'Frozen'
df_selected_products_ref['Source'] = 'Refrigerated'

# Concatenate (merge) the three DataFrames into one
final_1000 = pd.concat([df_selected_products, df_selected_products_frozen, df_selected_products_ref], ignore_index=True)

# Save the merged DataFrame to a new CSV file if needed
final_1000.to_csv('final_1000.csv', index=False)


In [3]:
final_1000.head()

Unnamed: 0,product_name,product_id,aisle_id,department_id,Source
0,Bag of Organic Bananas,13176,24,4,Other
1,Organic Fuji Apple,28204,24,4,Other
2,Bananas,39276,24,4,Other
3,Apples,13575,24,4,Other
4,Red Mango,32429,24,4,Other


In [4]:
# Load the three datasets into Pandas DataFrames
df_test_products = pd.read_csv('test_other_df.csv')
df_test_products_frozen = pd.read_csv('test_frozen_df.csv')
df_test_products_ref = pd.read_csv('test_ref_df.csv')

# Add a new column to each DataFrame to specify the source
df_test_products['Source'] = 'Other'
df_test_products_frozen['Source'] = 'Frozen'
df_test_products_ref['Source'] = 'Refrigerated'

# Concatenate (merge) the three DataFrames into one
final_test_df = pd.concat([df_test_products, df_test_products_frozen, df_test_products_ref], ignore_index=True)

# Save the merged DataFrame to a new CSV file if needed
final_test_df.to_csv('final_test_df', index=False)

In [5]:
selected_data=final_1000
test_df=final_test_df

#### Metric Calculation without accounting for substitutes for METHOD OF OTHERS

In [6]:
# calculate total unique orders in the test data
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

Total number of orders: 51679


In [7]:
# calculate the average number of items in each order
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

Average number of items in each order: 5.6002631629868995


In [8]:
# join the test data with my pick on 'product_id'
test_selected_data = test_df.merge(selected_data, on='product_id', how='left')

# group by 'order_id' and count the non-null values of 'product_name_y' column
metrics_tmp = test_selected_data[['order_id', 'product_name_y']].groupby('order_id').count().reset_index().rename(columns={'product_name_y': "num_prod_matching"})
metrics_tmp.head()

Unnamed: 0,order_id,num_prod_matching
0,3,2
1,5,12
2,9,9
3,10,5
4,11,3


In [9]:
# calculate metric 1
# count orders with at least a match
metric_1 = len(metrics_tmp[metrics_tmp['num_prod_matching']>0])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

Out of 51679 orders, 43905 of them utilize the in-aisle items. That's about 84.96%.


In [10]:
# calculate metric 2
metric_2 = metrics_tmp['num_prod_matching'].sum() / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

On average there are 5.60 items in each order, and 3.66 of them utilize in-aisle items. That's about 65.40%


#### Metric Calculation accounting for substitutes for METHOD OF OTHERS

In [20]:
selected_products=selected_data['product_name'].unique()
# Get unique product names from the entire dataset
unique_product_names = test_df['product_name'].unique()

# Vectorize the unique product names and selected products
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# Calculate the cosine similarity matrix between unique product names and selected products
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

# Create a dictionary to store num_prod_match for each order_id
order_id_dict = {}
similarity_threshold = 0.5
order_ids_checked = 0

# Iterate through unique order IDs in test_df
unique_order_ids = test_df['order_id'].unique()
for order_id in unique_order_ids:
    order_products = test_df[test_df['order_id'] == order_id]['product_name']
    num_prod_match = 0
    
    for product_name in order_products:
        product_index = np.where(unique_product_names == product_name)[0][0]  # Find the index of the current product
        similarities = similarity_matrix[product_index]  # Get similarities for the current product
        
        # Check if any similarity is above the threshold
        if any(similarities >= similarity_threshold):
            num_prod_match += 1
    
    order_id_dict[order_id] = num_prod_match
    order_ids_checked += 1
    print(f"Progress: {((order_ids_checked) / len(unique_order_ids)) * 100}%")

Progress: 0.0019350219624992746%
Progress: 0.003870043924998549%
Progress: 0.005805065887497823%
Progress: 0.007740087849997098%
Progress: 0.009675109812496372%
Progress: 0.011610131774995647%
Progress: 0.01354515373749492%
Progress: 0.015480175699994197%
Progress: 0.01741519766249347%
Progress: 0.019350219624992743%
Progress: 0.021285241587492017%
Progress: 0.023220263549991293%
Progress: 0.025155285512490567%
Progress: 0.02709030747498984%
Progress: 0.029025329437489113%
Progress: 0.030960351399988394%
Progress: 0.032895373362487663%
Progress: 0.03483039532498694%
Progress: 0.03676541728748621%
Progress: 0.03870043924998549%
Progress: 0.040635461212484764%
Progress: 0.04257048317498403%
Progress: 0.04450550513748331%
Progress: 0.04644052709998259%
Progress: 0.04837554906248186%
Progress: 0.050310571024981134%
Progress: 0.05224559298748041%
Progress: 0.05418061494997968%
Progress: 0.05611563691247896%
Progress: 0.05805065887497823%
Progress: 0.059985680837477504%
Progress: 0.061920702

In [21]:
result_df = pd.DataFrame(order_id_dict.items(), columns=['order_id', 'num_prod_match'])
result_df.head()

Unnamed: 0,order_id,num_prod_match
0,5,15
1,27,25
2,60,16
3,63,7
4,67,3


In [23]:
#calculate metric 1
# count orders with at least a match
metric_sub_1 = len(result_df[result_df['num_prod_match']>0])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

Out of 51679 orders, 50308 of them utilize the in-aisle items. That's about 97.35%.


In [24]:
# calcualte metric 2
metric_sub_2 = result_df['num_prod_match'].sum() / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

On average there are 5.60 items in each order, and 5.04 of them utilize in-aisle items. That's about 89.92%


### TOP 1000