In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.figure_factory as ff

In [2]:
data = pd.read_csv('frozen_df.csv')
data.head()

Unnamed: 0,order_id,product_id,product_name,aisle_id,aisle,department_id,department
0,7,46802,Pineapple Chunks,116,frozen produce,1,frozen
1,11,30162,Teriyaki & Pineapple Chicken Meatballs,38,frozen meals,1,frozen
2,12,38050,All Natural Boneless Skinless Chicken Breasts,34,frozen meat seafood,1,frozen
3,12,29471,Combination Pizza Rolls,129,frozen appetizers sides,1,frozen
4,14,162,Organic Mini Homestyle Waffles,52,frozen breakfast,1,frozen


In [3]:
data.describe()

Unnamed: 0,order_id,product_id,aisle_id,department_id
count,68202.0,68202.0,68202.0,68202.0
mean,49880.091713,24427.201578,69.549515,1.0
std,28861.478206,14535.050268,36.879206,0.0
min,7.0,4.0,34.0,1.0
25%,24820.75,11440.0,37.0,1.0
50%,49666.0,24195.0,52.0,1.0
75%,74737.75,37158.0,116.0,1.0
max,99996.0,49685.0,129.0,1.0


In [4]:
# Get unique order IDs
unique_order_ids = data['order_id'].unique()

# Calculate the index to split the data (70% training, 30% testing)
split_index = int(0.7 * len(unique_order_ids))

# Randomly shuffle the unique order IDs
np.random.seed(42)  # Set a seed for reproducibility
np.random.shuffle(unique_order_ids)

# Split the unique order IDs into training and testing sets
train_order_ids = unique_order_ids[:split_index]
test_order_ids = unique_order_ids[split_index:]

# Create training and testing DataFrames based on the selected order IDs
train_frozen_df = data[data['order_id'].isin(train_order_ids)]
test_frozen_df = data[data['order_id'].isin(test_order_ids)]

In [5]:
train_frozen_df.to_csv('train_frozen_df.csv', index=False)
test_frozen_df.to_csv('test_frozen_df.csv', index=False)

### METHOD 1: Top 100

In [None]:
df = train_frozen_df
df.head()

In [None]:
# Group the data by product name, product ID, aisle, and count unique order IDs
product_counts = df.groupby(['product_name', 'product_id', 'aisle'])['order_id'].nunique().reset_index()

# Sort the products by the count of unique order IDs in descending order
sorted_products = product_counts.sort_values(by='order_id', ascending=False)

# Select the top 100 products with the highest counts
top_100_frozen = sorted_products.head(100)

# Rename the 'order_id' column to 'total_sales' for clarity
top_100_frozen = top_100_frozen.rename(columns={'order_id': 'total_sales'})

In [None]:
top_100_frozen

In [None]:
# Create a bar chart to display the top 100 products and their total sales
plt.figure(figsize=(12, 20))
plt.barh(top_100_frozen['product_name'], top_100_frozen['total_sales'], color='skyblue')
plt.xlabel('Total Sales')
plt.ylabel('Product Name')
plt.title('Top 100 Frozen Products by Total Sales')
plt.gca().invert_yaxis()  # Invert the y-axis to display the highest sales at the top
plt.tight_layout()

# Show the bar chart
plt.show()

#### Metric Calculation for frozen

In [None]:
# calculate total unique orders in the test data
tot_order = test_frozen_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# calculate the average number of items in each order
avg_item = len(test_frozen_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# join the test data with top_100_frozen on 'product_id'
test_frozen_pick = test_frozen_df.merge(top_100_frozen, on='product_id', how='left')

# group by 'order_id' and count the non-null values of 'product_name_y' column
metrics_tmp = test_frozen_pick[['order_id', 'product_name_y']].groupby('order_id').count().reset_index().rename(columns={'product_name_y': "num_prod_matching"})
metrics_tmp.head()

In [None]:
# calculate metric 1
# count orders with at least a match
metric_1 = len(metrics_tmp[metrics_tmp['num_prod_matching']>0])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# calculate metric 2
metric_2 = metrics_tmp['num_prod_matching'].sum() / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

In [None]:
sorted_products.shape

In [None]:
sorted_products.head()

In [None]:
sorted_products['product_name'].nunique   


### METHOD 2: Top 100 with similarity checking

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import Levenshtein

In [None]:
# Step 1: Calculate sales (number of unique order_ids) for each product
product_sales = df.groupby('product_name')['order_id'].nunique().reset_index()
product_sales = product_sales.rename(columns={'order_id': 'sales'})

# Step 2: Sort products by sales in descending order
sorted_products = product_sales.sort_values(by='sales', ascending=False)

# Step 3: Initialize an empty list of selected products
selected_products = []

# Step 4: Iterate through products and select based on similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sorted_products['product_name'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
selection_threshold = 0.6

for idx, row in sorted_products.iterrows():
    product_name = row['product_name']

    # Check similarity with already selected products
    is_similar = all(
        cosine_similarities[idx, sorted_products[sorted_products['product_name'] == sp].index[0]] < selection_threshold
        for sp in selected_products
    )

    if is_similar:
        selected_products.append(product_name)

    # Stop when 100 products are selected
    if len(selected_products) == 100:
        break


In [None]:
selected_data = df[df['product_name'].isin(selected_products)]

# Calculate total sales for each selected product
total_sales = selected_data.groupby('product_name')['order_id'].nunique().reset_index()
total_sales = total_sales.rename(columns={'order_id': 'total_sales'})

# Merge the total sales data with the selected data
selected_data = selected_data.merge(total_sales, on='product_name')

# Select the desired columns
selected_data = selected_data[['product_name', 'product_id', 'aisle', 'total_sales']]

# Remove duplicate rows
selected_data = selected_data.drop_duplicates()

In [None]:
selected_data.head(10)

In [None]:
# Sort the selected_data by total_sales in descending order
selected_data = selected_data.sort_values(by='total_sales', ascending=False)

# Create a bar chart to display the top 100 products and their total sales
plt.figure(figsize=(12, 20))
plt.barh(selected_data['product_name'], selected_data['total_sales'], color='skyblue')
plt.xlabel('Total Sales')
plt.ylabel('Product Name')
plt.title('Top 100 Frozen Products by Total Sales')
plt.gca().invert_yaxis()  # Invert the y-axis to display the highest sales at the top
plt.tight_layout()

# Show the bar chart
plt.show()

In [None]:
test_df = pd.read_csv('test_frozen_df.csv')

In [None]:
test_df.head()

#### Metric Calculation without accounting for substitutes

In [None]:
# calculate total unique orders in the test data
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# calculate the average number of items in each order
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# join the test data with my pick on 'product_id'
test_selected_products = test_df.merge(selected_data, on='product_id', how='left')

# group by 'order_id' and count the non-null values of 'product_name_y' column
metrics_tmp = test_selected_products[['order_id', 'product_name_y']].groupby('order_id').count().reset_index().rename(columns={'product_name_y': "num_prod_matching"})
metrics_tmp.head()

In [None]:
# calculate metric 1
# count orders with at least a match
metric_1 = len(metrics_tmp[metrics_tmp['num_prod_matching']>0])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# calculate metric 2
metric_2 = metrics_tmp['num_prod_matching'].sum() / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

#### Metric Calculation accounting for substitutes

In [None]:
# Function to calculate similarity between two product names
def calculate_similarity(product_name1, product_name2):
    return 1 - (Levenshtein.distance(product_name1.lower(), product_name2.lower()) / max(len(product_name1), len(product_name2)))

# Initialize a dictionary to store num_prod_match for each order_id
order_id_dict = {}
similarity_threshold = 0.4

# Iterate through unique order IDs in test_df
unique_order_ids = test_df['order_id'].unique()
for order_id in unique_order_ids:
    order_products = test_df[test_df['order_id'] == order_id]['product_name']
    num_prod_match = 0

    for product_name in order_products:
        for selected_product in selected_products:
            similarity = calculate_similarity(product_name, selected_product)
            if similarity >= similarity_threshold:
                num_prod_match += 1
                break  # Exit inner loop once a match is found

    order_id_dict[order_id] = num_prod_match

# Create a DataFrame from the order_id_dict
result_df = pd.DataFrame(order_id_dict.items(), columns=['order_id', 'num_prod_match'])

In [None]:
result_df.head()

In [None]:
# calculate metric 1
# count orders with at least a match
metric_sub_1 = len(result_df[result_df['num_prod_match']>0])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

In [None]:
# calcualte metric 2
metric_sub_2 = result_df['num_prod_match'].sum() / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

In [None]:
import plotly.express as px
# Get unique departments
unique_departments = df['department'].unique()

# Iterate through each department
for department in unique_departments:
    # Filter the DataFrame for the current department
    department_df = df[df['department'] == department]

    # Get unique aisles and products within the department
    unique_aisles = department_df['aisle'].unique()
    unique_products = department_df['product_name'].unique()

    # Create a sunburst chart for the current department
    fig = px.sunburst(department_df, path=['aisle', 'product_name'])

    # Customize the layout
    fig.update_layout(
        title=f"Sunburst Chart for Department: {department}",
        margin=dict(l=0, r=0, b=0, t=30),
        paper_bgcolor="white",
        height=600,  # Adjust the height
        width=800,   # Adjust the width
    )

    # Show the chart
    fig.show()

### METHOD 3: THE ULTIMATE

In [6]:
df = train_frozen_df

In [7]:
# Step 1: Calculate total sales and total unique products for each unique aisle
aisle_info = df.groupby(['aisle']).agg(
    total_sales=('order_id', 'nunique'),
    total_unique_products=('product_id', 'nunique')
).reset_index()

# Sort aisles by total sales in ascending order
aisle_info = aisle_info.sort_values(by='total_sales', ascending=True)

# Calculate the total number of products to select
total_products_to_select = 100

# Create a table to store the data of aisle, number of unique products, and number of products to find
aisle_data = []

# Step 4: Loop through each aisle and assign the number of products to select
for index, row in aisle_info.iterrows():
    aisle = row['aisle']
    total_sales = row['total_sales']
    total_products = row['total_unique_products']
    num_products_to_select = int((total_sales / aisle_info['total_sales'].sum()) * total_products_to_select)

    # Ensure we don't exceed the available unique products in this aisle
    num_products_to_select = min(num_products_to_select, total_products)

    # Calculate unselected_products
    unselected_products = total_products - num_products_to_select

    # Create a row for the aisle_data table
    aisle_row = {'aisle': aisle, 'total_unique_products': total_products, 'total_sales': total_sales,
                 'number of products to select': num_products_to_select, 'unselected_products': unselected_products}
    aisle_data.append(aisle_row)

# Convert the aisle_data table to a DataFrame
aisle_info = pd.DataFrame(aisle_data)

products_left_to_find = total_products_to_select - aisle_info['number of products to select'].sum() 

# Create aisle_info_2 with aisles having unselected_products not equal to 0
aisle_info_2 = aisle_info[aisle_info['unselected_products'] != 0]
# Drop the 'total_unique_products' column
aisle_info_2.drop(columns=['total_unique_products','number of products to select'], inplace=True)
# Rename the 'unselected_products' column as 'total_unique_products'
aisle_info_2.rename(columns={'unselected_products': 'total_unique_products'}, inplace=True)

In [8]:
products_left_to_find

6

In [9]:
df['aisle'].nunique()

11

In [10]:
aisle_info.drop(columns=['total_unique_products','total_sales'], inplace=True)
final_aisle_info=aisle_info
final_aisle_info.head()

Unnamed: 0,aisle,number of products to select,unselected_products
0,frozen juice,0,23
1,frozen dessert,1,61
2,frozen breads doughs,2,58
3,frozen meat seafood,3,139
4,frozen vegan vegetarian,4,151


In [11]:
final_aisle_info = final_aisle_info.sort_values(by='unselected_products', ascending=True)
for index, row in final_aisle_info.iterrows():
    # Check if 'unselected_products' is non-zero
    if row['unselected_products'] > 0:
        # Increment 'number of products to select' by 1
        final_aisle_info.at[index, 'number of products to select'] += 1
        # Decrement 'unselected_products' by 1
        final_aisle_info.at[index, 'unselected_products'] -= 1
        products_left_to_find-=1
        if products_left_to_find ==0:
            break

In [12]:
products_left_to_find

0

In [13]:
# Find the aisle(s) with number of products to select equal to 0
aisles_with_zero_products_to_select = final_aisle_info[final_aisle_info['number of products to select'] == 0]

# Print the result
print("Aisle(s) with number of products to select equal to 0:")
print(aisles_with_zero_products_to_select)

Aisle(s) with number of products to select equal to 0:
Empty DataFrame
Columns: [aisle, number of products to select, unselected_products]
Index: []


In [14]:
final_aisle_info.head()

Unnamed: 0,aisle,number of products to select,unselected_products
0,frozen juice,1,22
2,frozen breads doughs,3,57
1,frozen dessert,2,60
3,frozen meat seafood,4,138
4,frozen vegan vegetarian,5,150


In [15]:
# Initialize a DataFrame to store selected products
selected_products_df = pd.DataFrame(columns=df.columns)

# Group by 'aisle' and 'product_name' to calculate total sales
product_sales = df.groupby(['aisle', 'product_name'])['order_id'].nunique().reset_index()
product_sales.rename(columns={'order_id': 'total_sales'}, inplace=True)

# Sort products in each aisle by total sales in descending order
sorted_products = product_sales.sort_values(by=['aisle', 'total_sales'], ascending=[True, False])

# Iterate through each aisle and select the top-selling products
for aisle, num_to_select in zip(final_aisle_info['aisle'], final_aisle_info['number of products to select']):
    aisle_products = sorted_products[sorted_products['aisle'] == aisle].head(num_to_select)
    selected_products_df = pd.concat([selected_products_df, df[df['product_name'].isin(aisle_products['product_name'])]])

# Reset the index of the selected products DataFrame
selected_products_df.reset_index(drop=True, inplace=True)

# Create a DataFrame with the selected columns
selected_data = selected_products_df[['product_name', 'product_id', 'aisle_id', 'department_id']]

# Drop duplicate rows to keep only unique products
selected_data.drop_duplicates(inplace=True)

# Reset the index of the selected_data DataFrame
selected_data.reset_index(drop=True, inplace=True)

In [16]:
selected_data.head()

Unnamed: 0,product_name,product_id,aisle_id,department_id
0,"Smoothies, Strawberries Wild",38959,113,1
1,Pizza Dough,18908,58,1
2,Gluten Free Whole Grain Bread,7963,58,1
3,Organic Fillo Dough,45143,58,1
4,Dark Chocolate Covered Banana,43889,119,1


In [17]:
selected_data.shape

(100, 4)

In [25]:
test_df = pd.read_csv('test_frozen_df.csv')

#### Metric Calculation without accounting for substitutes for METHOD OF OTHERS

In [26]:
# calculate total unique orders in the test data
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

Total number of orders: 10782


In [27]:
# calculate the average number of items in each order
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

Average number of items in each order: 1.9004822852902987


In [28]:
# join the test data with my pick on 'product_id'
test_selected_data = test_df.merge(selected_data, on='product_id', how='left')

# group by 'order_id' and count the non-null values of 'product_name_y' column
metrics_tmp = test_selected_data[['order_id', 'product_name_y']].groupby('order_id').count().reset_index().rename(columns={'product_name_y': "num_prod_matching"})
metrics_tmp.head()

Unnamed: 0,order_id,num_prod_matching
0,11,0
1,12,0
2,26,1
3,27,2
4,32,2


In [29]:
# calculate metric 1
# count orders with at least a match
metric_1 = len(metrics_tmp[metrics_tmp['num_prod_matching']>0])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

Out of 10782 orders, 5859 of them utilize the in-aisle items. That's about 54.34%.


In [30]:
# calculate metric 2
metric_2 = metrics_tmp['num_prod_matching'].sum() / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

On average there are 1.90 items in each order, and 0.75 of them utilize in-aisle items. That's about 39.35%


#### Metric Calculation accounting for substitutes for METHOD OF OTHERS

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

selected_products=selected_data['product_name'].unique()
# Get unique product names from the entire dataset
unique_product_names = test_df['product_name'].unique()

# Vectorize the unique product names and selected products
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# Calculate the cosine similarity matrix between unique product names and selected products
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

# Create a dictionary to store num_prod_match for each order_id
order_id_dict = {}
similarity_threshold = 0.5
order_ids_checked = 0

# Iterate through unique order IDs in test_df
unique_order_ids = test_df['order_id'].unique()
for order_id in unique_order_ids:
    order_products = test_df[test_df['order_id'] == order_id]['product_name']
    num_prod_match = 0
    
    for product_name in order_products:
        product_index = np.where(unique_product_names == product_name)[0][0]  # Find the index of the current product
        similarities = similarity_matrix[product_index]  # Get similarities for the current product
        
        # Check if any similarity is above the threshold
        if any(similarities >= similarity_threshold):
            num_prod_match += 1
    
    order_id_dict[order_id] = num_prod_match
    order_ids_checked += 1
    print(f"Progress: {((order_ids_checked) / len(unique_order_ids)) * 100}%")

Progress: 0.009274717121127806%
Progress: 0.01854943424225561%
Progress: 0.02782415136338342%
Progress: 0.03709886848451122%
Progress: 0.04637358560563903%
Progress: 0.05564830272676684%
Progress: 0.06492301984789464%
Progress: 0.07419773696902245%
Progress: 0.08347245409015025%
Progress: 0.09274717121127805%
Progress: 0.10202188833240587%
Progress: 0.11129660545353368%
Progress: 0.12057132257466147%
Progress: 0.12984603969578928%
Progress: 0.13912075681691707%
Progress: 0.1483954739380449%
Progress: 0.1576701910591727%
Progress: 0.1669449081803005%
Progress: 0.17621962530142832%
Progress: 0.1854943424225561%
Progress: 0.19476905954368393%
Progress: 0.20404377666481174%
Progress: 0.21331849378593953%
Progress: 0.22259321090706735%
Progress: 0.23186792802819514%
Progress: 0.24114264514932293%
Progress: 0.25041736227045075%
Progress: 0.25969207939157857%
Progress: 0.26896679651270633%
Progress: 0.27824151363383415%
Progress: 0.28751623075496197%
Progress: 0.2967909478760898%
Progress: 0.

In [33]:
result_df = pd.DataFrame(order_id_dict.items(), columns=['order_id', 'num_prod_match'])
result_df.head()

Unnamed: 0,order_id,num_prod_match
0,11,0
1,12,0
2,26,1
3,27,2
4,32,2


In [34]:
# calculate metric 1
# count orders with at least a match
metric_sub_1 = len(result_df[result_df['num_prod_match']>0])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

Out of 10782 orders, 7828 of them utilize the in-aisle items. That's about 72.60%.


In [35]:
# calcualte metric 2
metric_sub_2 = result_df['num_prod_match'].sum() / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

On average there are 1.90 items in each order, and 1.08 of them utilize in-aisle items. That's about 56.79%


In [36]:
# Save the selected_data DataFrame to a CSV file
selected_data.to_csv('selected_products_frozen.csv', index=False)

In [37]:
selected_data.head()

Unnamed: 0,product_name,product_id,aisle_id,department_id
0,"Smoothies, Strawberries Wild",38959,113,1
1,Pizza Dough,18908,58,1
2,Gluten Free Whole Grain Bread,7963,58,1
3,Organic Fillo Dough,45143,58,1
4,Dark Chocolate Covered Banana,43889,119,1


### METHOD 3: THE ULTIMATE