In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.figure_factory as ff
import networkx as nx
import plotly.express as px
import matplotlib.patches as mpatches
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import Levenshtein

In [None]:
data = pd.read_csv('new_other_df.csv')
data.head()

In [None]:
data['department'].unique()

In [None]:
# Split at the order level: every row of a given order lands in exactly one set.
np.random.seed(42)  # fixed seed so the shuffle below is reproducible
unique_order_ids = data['order_id'].unique()
np.random.shuffle(unique_order_ids)  # shuffles the id array in place

# The first 70% of the shuffled order ids go to training, the remainder to testing
split_index = int(0.7 * len(unique_order_ids))
train_order_ids, test_order_ids = (
    unique_order_ids[:split_index],
    unique_order_ids[split_index:],
)

# Keep the rows whose order id belongs to the corresponding id set
in_train = data['order_id'].isin(train_order_ids)
in_test = data['order_id'].isin(test_order_ids)
train_other_df = data[in_train]
test_other_df = data[in_test]

In [None]:
# Persist both splits as CSV (index column omitted); the test split is read back
# from 'test_other_df.csv' by the metric cells further down.
train_other_df.to_csv('train_other_df.csv', index=False)
test_other_df.to_csv('test_other_df.csv', index=False)

In [None]:
# `df` is another name for the training split (same object, not a copy);
# the plots and product-selection methods below all work on `df`.
df = train_other_df
df.head()

### PLOTS

In [None]:
# Draw one sunburst chart per department, with the hierarchy aisle -> product_name.
unique_departments = df['department'].unique()

for department in unique_departments:
    # Rows belonging to the current department only
    department_df = df[df['department'] == department]

    # Build the chart for this department
    fig = px.sunburst(department_df, path=['aisle', 'product_name'])

    # Fixed-size figure with a title naming the department
    fig.update_layout(
        title=f"Sunburst Chart for Department: {department}",
        margin=dict(l=0, r=0, b=0, t=30),
        paper_bgcolor="white",
        height=600,  # Adjust the height
        width=800,   # Adjust the width
    )

    fig.show()

In [None]:
# "Sales" of a department-aisle pair = number of distinct orders touching it.
# The table is ordered from the best-selling pair downwards.
department_aisle_sales = (
    df.groupby(['department', 'aisle'])['order_id']
    .nunique()
    .reset_index(name='total_sales')
    .sort_values(by='total_sales', ascending=False)
)

# One horizontal bar per "department - aisle" label
bar_labels = department_aisle_sales['department'] + ' - ' + department_aisle_sales['aisle']

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_labels, department_aisle_sales['total_sales'], color='skyblue')
ax.set_xlabel('Total Sales')
ax.set_ylabel('Department - Aisle')
ax.set_title('Sales by Department-Aisle Combinations')
ax.invert_yaxis()  # highest sales at the top
fig.tight_layout()
plt.show()

In [None]:
# Number of distinct product ids per department-aisle pair, largest first
unique_products_per_combination = (
    df.groupby(['department', 'aisle'])['product_id']
    .nunique()
    .reset_index(name='total_products')
    .sort_values(by='total_products', ascending=False)
)

# Horizontal bar chart, one bar per "department - aisle" label
combination_labels = (
    unique_products_per_combination['department']
    + ' - '
    + unique_products_per_combination['aisle']
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(combination_labels, unique_products_per_combination['total_products'])
ax.set_xlabel('Number of Unique Products')
ax.set_ylabel('Department - Aisle Combination')
ax.set_title('Number of Unique Products per Department-Aisle Combination')
ax.invert_yaxis()  # largest count at the top
plt.xticks(rotation=45)
fig.tight_layout()

plt.show()

In [None]:
# Sales per product = number of distinct orders containing it, keyed by
# (department, aisle, product_name).
product_sales = df.groupby(['department', 'aisle', 'product_name'])['order_id'].nunique().reset_index()

# Top 100 products by sales. The explicit copy makes this an independent frame,
# so adding the 'color' column below cannot trigger SettingWithCopyWarning.
top_100_products = product_sales.sort_values(by='order_id', ascending=False).head(100).copy()

# One colour per department-aisle combination present in the top 100
unique_combinations = top_100_products[['department', 'aisle']].drop_duplicates()
color_palette = sns.color_palette("hsv", len(unique_combinations))
color_mapping = dict(zip(zip(unique_combinations['department'], unique_combinations['aisle']), color_palette))

# Colour of each product row, looked up through its (department, aisle) key
top_100_products['color'] = top_100_products.apply(
    lambda row: color_mapping[(row['department'], row['aisle'])], axis=1
)

# Horizontal bars, one per product; the palette is handed over as a plain list of colours
plt.figure(figsize=(12, 18))
ax = sns.barplot(
    x='order_id',
    y='product_name',
    data=top_100_products,
    palette=top_100_products['color'].tolist(),
)
plt.xlabel('Number of Unique Orders (Sales)')
plt.ylabel('Product Name')
plt.title('Top 100 Products by Sales with Associated Department-Aisle Combination (Color Coded by Department-Aisle)')
plt.xticks(rotation=0)
plt.tight_layout()

# Legend: one patch per department-aisle combination, placed outside the axes
legend_labels = [
    mpatches.Patch(color=value, label=f"{key[0]} - {key[1]}")
    for key, value in color_mapping.items()
]
plt.legend(handles=legend_labels, title='Department - Aisle', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Distinct products and distinct orders per department-aisle pair
department_aisle_stats = (
    df.groupby(['department', 'aisle'])[['product_id', 'order_id']]
    .nunique()
    .reset_index()
)

# Average sales per product = distinct orders / distinct products
department_aisle_stats['avg_per_product_sales'] = (
    department_aisle_stats['order_id'] / department_aisle_stats['product_id']
)

# Highest average first
department_aisle_stats = department_aisle_stats.sort_values(by='avg_per_product_sales', ascending=False)

# Single-colour horizontal bar chart, one bar per "department - aisle" label
stats_labels = department_aisle_stats['department'] + ' - ' + department_aisle_stats['aisle']

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(stats_labels, department_aisle_stats['avg_per_product_sales'], color='skyblue')
ax.set_xlabel('Average Per Product Sales')
ax.set_ylabel('Department - Aisle Combination')
ax.set_title('Average Per Product Sales by Department-Aisle Combination (Sorted by Sales)')
ax.invert_yaxis()  # highest average at the top
plt.xticks(rotation=45)
fig.tight_layout()

plt.show()


### METHOD 1: Selection based on Pure total sales of each product

In [None]:
# Sales per product name = number of distinct orders containing that name
product_sales = df.groupby('product_name')['order_id'].nunique().reset_index()

# Distinct (name, product id, aisle id, department id) rows used to enrich the ranking
product_details = df[['product_name', 'product_id', 'aisle_id', 'department_id']].drop_duplicates()

# Take the 9800 best-selling names, attach their ids, then drop the sales count.
# NOTE(review): a name that maps to several id rows yields several output rows,
# so the result may exceed 9800 rows - check with the .shape cell below.
top_9800_products = (
    product_sales
    .sort_values(by='order_id', ascending=False)
    .head(9800)
    .merge(product_details, on='product_name', how='left')
    .drop(columns=['order_id'])
)
top_9800_products.head()

In [None]:
top_9800_products.shape

In [None]:
# METHOD 1's pick: `selected_data` is the name the metric cells below read.
selected_data = top_9800_products

In [None]:
# Reload the held-out split from the CSV written earlier in this notebook.
test_df = pd.read_csv('test_other_df.csv')

#### Metric Calculation without accounting for substitutes for METHOD 1

In [None]:
# Number of distinct orders in the test data (denominator of metric 1)
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# Average number of test rows (items) per order: total rows / distinct orders
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# Left-join the test rows to the selected products on 'product_id'.
# Both frames carry 'product_name', so the selected side arrives as 'product_name_y'
# and is null for test rows whose product was not selected.
test_selected_products = test_df.merge(selected_data, on='product_id', how='left')

# Per order, count the non-null 'product_name_y' values = number of matched items
metrics_tmp = (
    test_selected_products
    .groupby('order_id')['product_name_y']
    .count()
    .reset_index()
    .rename(columns={'product_name_y': "num_prod_matching"})
)
metrics_tmp.head()

In [None]:
# Metric 1: number of test orders containing at least one selected product
orders_with_match = metrics_tmp['num_prod_matching'] > 0
metric_1 = len(metrics_tmp[orders_with_match])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2: average number of matched items per order
total_matched_items = metrics_tmp['num_prod_matching'].sum()
metric_2 = total_matched_items / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

#### Metric Calculation accounting for substitutes for METHOD 1

In [None]:
# Count, per test order, the items that either are a selected product or have a
# close textual substitute among the selected products (TF-IDF cosine >= threshold).
selected_products = selected_data['product_name'].unique()
# Distinct product names appearing in the test data
unique_product_names = test_df['product_name'].unique()

# Fit the TF-IDF vocabulary on the test names, then project the selected names onto it
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# Rows: distinct test product names; columns: selected products
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

similarity_threshold = 0.5

# One flag per distinct test product: is any selected product similar enough?
# Computed once instead of once per order line.
has_substitute = pd.Series(
    (similarity_matrix >= similarity_threshold).any(axis=1),
    index=unique_product_names,
)

# Look the flag up for every test row and add the flags up per order. sort=False
# keeps the orders in first-appearance order, matching test_df['order_id'].unique().
row_has_match = test_df['product_name'].map(has_substitute)
matches_per_order = row_has_match.groupby(test_df['order_id'], sort=False).sum().astype(int)

unique_order_ids = test_df['order_id'].unique()
# order_id -> number of items in the order with a match or substitute
order_id_dict = matches_per_order.to_dict()
order_ids_checked = len(order_id_dict)

In [None]:
# Turn the order_id -> match-count mapping into a two-column table
result_df = pd.DataFrame(order_id_dict.items(), columns=['order_id', 'num_prod_match'])
result_df.head()

In [None]:
result_df.shape

In [None]:
# Metric 1 (with substitutes): orders having at least one matched or substitutable item
orders_with_substitute = result_df['num_prod_match'] > 0
metric_sub_1 = len(result_df[orders_with_substitute])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2 (with substitutes): average number of matched or substitutable items per order
total_substitute_matches = result_df['num_prod_match'].sum()
metric_sub_2 = total_substitute_matches / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

In [None]:
# Append the METHOD 1 summary (plain metrics, then substitute-aware metrics) to the results file.
file_path = 'results_other.txt'

report_lines = [
    f"METHOD 1: Selection based on Pure total sales of each product\n",
    f"Metric Calculation without accounting for substitutes for METHOD 1\n",
    f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.\n",
    f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%\n",
    f"Metric Calculation accounting for substitutes for METHOD 1\n",
    f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.\n",
    f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%\n",
    "\n",
]

# Mode 'a' appends, so results from earlier runs are kept
with open(file_path, 'a') as file:
    file.writelines(report_lines)

### METHOD 2: Selection based on weightage distribution by total sales per department-aisle

Find the unique department-aisle combinations and the total sales (total count of order_ids) associated with each. The total should not be just the number of unique order_ids: count all the order_ids, even duplicates, associated with that department-aisle combination. Make a plot to show the distribution of total sales across the department-aisle combinations.
Now assign a 'number of products to select' to each department-aisle combination based on its total sales. Assign the numbers such that the total count of products selected (summed over all department-aisle combinations) is 100 and every department-aisle combination gets at least 1 product.
Now start picking unique products from every department-aisle combination. The number of products to be selected from each department-aisle combination is given by the column 'number of products to select'. However, no two products selected overall should have a similarity of more than 50%; similarity should be calculated using NLP techniques. Among the products that fulfil the similarity criterion, the selected ones should be those with the maximum total sales (the most unique order_ids associated with them).

In [None]:
# Function to calculate similarity
def calculate_similarity(product_names):
    """Return the pairwise cosine similarity between the given product names.

    The names are vectorised with a default-configured ``TfidfVectorizer`` fitted
    on ``product_names`` itself; entry ``[i, j]`` of the result is the cosine
    similarity between the TF-IDF vectors of names ``i`` and ``j``.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(product_names)
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
df = train_other_df

In [None]:
# Step 1: per aisle, total sales (distinct orders) and number of distinct products,
# ordered by total sales ascending.
aisle_info = (
    df.groupby(['aisle'])
    .agg(
        total_sales=('order_id', 'nunique'),
        total_unique_products=('product_id', 'nunique'),
    )
    .reset_index()
    .sort_values(by='total_sales', ascending=True)
    .reset_index(drop=True)
)

# Overall number of products to select
total_products_to_select = 9800

# Step 2: proportional quota per aisle = share of total sales * overall budget,
# truncated to an integer. The sales total is computed once, not once per aisle.
sales_share = aisle_info['total_sales'] / aisle_info['total_sales'].sum()
proportional_quota = (sales_share * total_products_to_select).astype(int)

# An aisle cannot contribute more products than it has
aisle_info['number of products to select'] = np.minimum(
    proportional_quota, aisle_info['total_unique_products']
)

# Products of the aisle that remain available after this first pass
aisle_info['unselected_products'] = (
    aisle_info['total_unique_products'] - aisle_info['number of products to select']
)

aisle_info = aisle_info[[
    'aisle', 'total_unique_products', 'total_sales',
    'number of products to select', 'unselected_products',
]]

# Budget not yet assigned (because of truncation and the per-aisle cap)
products_left_to_find = total_products_to_select - aisle_info['number of products to select'].sum()

# Second-pass candidates: aisles that still have unselected products. Their remaining
# capacity becomes the new 'total_unique_products'. drop/rename return new frames, so
# nothing is modified in place on a filtered slice (avoids SettingWithCopyWarning).
aisle_info_2 = (
    aisle_info[aisle_info['unselected_products'] != 0]
    .drop(columns=['total_unique_products', 'number of products to select'])
    .rename(columns={'unselected_products': 'total_unique_products'})
)


In [None]:
aisle_info.head()

In [None]:
 aisle_info_2.head()

In [None]:
products_left_to_find

In [None]:
# Second allocation pass: spread the still-unassigned budget over the aisles that
# have spare products, again proportionally to their total sales.
aisle_info_2 = aisle_info_2.sort_values(by='total_sales', ascending=True).reset_index(drop=True)

# The budget for this pass is whatever the first pass left over
total_products_to_select = products_left_to_find

# Share of sales among the remaining aisles (sum computed once), truncated to an integer
second_pass_share = aisle_info_2['total_sales'] / aisle_info_2['total_sales'].sum()
second_pass_quota = (second_pass_share * total_products_to_select).astype(int)

# Never assign more than the aisle's remaining capacity
second_pass_quota = np.minimum(second_pass_quota, aisle_info_2['total_unique_products'])

aisle_info_2 = aisle_info_2.assign(**{
    'number of products to select': second_pass_quota,
    # Remaining capacity after this pass, floored at 0
    'unselected_products': (aisle_info_2['total_unique_products'] - second_pass_quota).clip(lower=0),
})[[
    'aisle', 'total_unique_products', 'total_sales',
    'number of products to select', 'unselected_products',
]]

# Budget still unassigned after the second pass
products_left_to_find = total_products_to_select - aisle_info_2['number of products to select'].sum()



In [None]:
products_left_to_find

In [None]:
aisle_info_2.head()

In [None]:
# Create aisle_info_3 with the aisles that still have unselected products after the
# second pass; their remaining capacity becomes 'total_unique_products'.
# drop/rename return new frames instead of mutating a filtered slice in place.
aisle_info_3 = (
    aisle_info_2[aisle_info_2['unselected_products'] != 0]
    .drop(columns=['total_unique_products', 'number of products to select'])
    .rename(columns={'unselected_products': 'total_unique_products'})
)

In [None]:
aisle_info_3.shape

In [None]:
df['aisle'].nunique()

In [None]:
# Combine both allocation passes into one table with a single row per aisle.
# The aisle list is sorted so the row order is the same on every run; a bare
# list(set(...)) changes order between kernel sessions, which in turn changes how
# ties are broken when this table is sorted later.
unique_aisles = sorted(set(aisle_info['aisle']) | set(aisle_info_2['aisle']))

# Quota per aisle from each pass (an aisle missing from a pass contributes 0)
first_pass_quota = aisle_info.groupby('aisle')['number of products to select'].sum()
second_pass_totals = aisle_info_2.groupby('aisle')[
    ['number of products to select', 'unselected_products']
].sum()

final_aisle_info = pd.DataFrame({
    'aisle': unique_aisles,
    # Sum of the quotas assigned in pass 1 and pass 2
    'number of products to select': (
        first_pass_quota.reindex(unique_aisles, fill_value=0).to_numpy()
        + second_pass_totals['number of products to select'].reindex(unique_aisles, fill_value=0).to_numpy()
    ),
    # Remaining capacity comes from the second pass only; 0 if the aisle was not in it
    'unselected_products': (
        second_pass_totals['unselected_products'].reindex(unique_aisles, fill_value=0).to_numpy()
    ),
})
final_aisle_info.head()


In [None]:
final_aisle_info = final_aisle_info.sort_values(by='unselected_products', ascending=True)
final_aisle_info.head(10)

In [None]:
# Give one extra product to every aisle that still has unselected products:
# its quota goes up by 1 and its remaining capacity goes down by 1.
# Note: re-running this cell applies the increment again.
has_spare_products = final_aisle_info['unselected_products'] > 0
final_aisle_info.loc[has_spare_products, 'number of products to select'] += 1
final_aisle_info.loc[has_spare_products, 'unselected_products'] -= 1

# Budget still unassigned out of the 9800 products
products_left_to_select = 9800 - final_aisle_info['number of products to select'].sum()

In [None]:
products_left_to_select

In [None]:
# Hand out the remaining budget one product at a time, visiting aisles in order of
# increasing remaining capacity. The remainder is derived from the table instead of
# being hard-coded, so the cell stays correct if the data or the 9800 budget changes
# and does nothing when re-run after the budget is exhausted.
products_left_to_select = 9800 - final_aisle_info['number of products to select'].sum()
final_aisle_info = final_aisle_info.sort_values(by='unselected_products', ascending=True)

# Guard: with nothing left, the `== 0` exit below would never trigger
if products_left_to_select > 0:
    for index, row in final_aisle_info.iterrows():
        # Only aisles that still have unselected products can take one more
        if row['unselected_products'] > 0:
            final_aisle_info.at[index, 'number of products to select'] += 1
            final_aisle_info.at[index, 'unselected_products'] -= 1
            products_left_to_select -= 1
            if products_left_to_select == 0:
                break
    

In [None]:
products_left_to_select

In [None]:
final_aisle_info.head()

In [None]:
# Rows of final_aisle_info whose quota ended up at 0
nothing_to_select = final_aisle_info['number of products to select'] == 0
aisles_with_zero_products_to_select = final_aisle_info[nothing_to_select]

# Print the result
print("Aisle(s) with number of products to select equal to 0:")
print(aisles_with_zero_products_to_select)

In [None]:
# Sales per (aisle, product_name) = number of distinct orders containing the product
product_sales = (
    df.groupby(['aisle', 'product_name'])['order_id']
    .nunique()
    .reset_index()
    .rename(columns={'order_id': 'total_sales'})
)

# Within each aisle, best-selling products first
sorted_products = product_sales.sort_values(by=['aisle', 'total_sales'], ascending=[True, False])

# For every aisle take its top `number of products to select` products and collect
# the matching rows of df. The pieces are gathered in a list and concatenated once,
# instead of growing a DataFrame inside the loop (quadratic copying, and an empty
# object-dtype seed frame that can degrade the column dtypes).
selected_frames = []
for aisle, num_to_select in zip(final_aisle_info['aisle'], final_aisle_info['number of products to select']):
    aisle_products = sorted_products[sorted_products['aisle'] == aisle].head(int(num_to_select))
    selected_frames.append(df[df['product_name'].isin(aisle_products['product_name'])])

if selected_frames:
    selected_products_df = pd.concat(selected_frames, ignore_index=True)
else:
    # No aisles at all: keep df's columns, with zero rows
    selected_products_df = df.iloc[0:0].copy()

# One row per selected product; built as a new frame so no slice is mutated in place
selected_data = (
    selected_products_df[['product_name', 'product_id', 'aisle_id', 'department_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
selected_data.head()

In [None]:
# Reload the held-out split from disk (same file the METHOD 1 evaluation read).
test_df = pd.read_csv('test_other_df.csv')

#### Metric Calculation without accounting for substitutes for METHOD 2

In [None]:
# Number of distinct orders in the test data (denominator of metric 1)
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# Average number of test rows (items) per order: total rows / distinct orders
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# Left-join the test rows to the METHOD 2 pick on 'product_id'.
# Both frames carry 'product_name', so the selected side arrives as 'product_name_y'
# and is null for test rows whose product was not selected.
test_selected_data = test_df.merge(selected_data, on='product_id', how='left')

# Per order, count the non-null 'product_name_y' values = number of matched items
metrics_tmp = (
    test_selected_data
    .groupby('order_id')['product_name_y']
    .count()
    .reset_index()
    .rename(columns={'product_name_y': "num_prod_matching"})
)
metrics_tmp.head()

In [None]:
# Metric 1: number of test orders containing at least one selected product
orders_with_match = metrics_tmp['num_prod_matching'] > 0
metric_1 = len(metrics_tmp[orders_with_match])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2: average number of matched items per order
total_matched_items = metrics_tmp['num_prod_matching'].sum()
metric_2 = total_matched_items / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

#### Metric Calculation accounting for substitutes for METHOD 2

In [None]:
# Count, per test order, the items that either are a selected product or have a
# close textual substitute among the selected products (TF-IDF cosine >= threshold).
selected_products = selected_data['product_name'].unique()
# Distinct product names appearing in the test data
unique_product_names = test_df['product_name'].unique()

# Fit the TF-IDF vocabulary on the test names, then project the selected names onto it
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# Rows: distinct test product names; columns: selected products
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

similarity_threshold = 0.5

# One flag per distinct test product: is any selected product similar enough?
# Computed once instead of once per order line; this also removes the per-order
# progress print that flooded the cell output.
has_substitute = pd.Series(
    (similarity_matrix >= similarity_threshold).any(axis=1),
    index=unique_product_names,
)

# Look the flag up for every test row and add the flags up per order. sort=False
# keeps the orders in first-appearance order, matching test_df['order_id'].unique().
row_has_match = test_df['product_name'].map(has_substitute)
matches_per_order = row_has_match.groupby(test_df['order_id'], sort=False).sum().astype(int)

unique_order_ids = test_df['order_id'].unique()
# order_id -> number of items in the order with a match or substitute
order_id_dict = matches_per_order.to_dict()
order_ids_checked = len(order_id_dict)

In [None]:
# Turn the order_id -> match-count mapping into a two-column table
result_df = pd.DataFrame(order_id_dict.items(), columns=['order_id', 'num_prod_match'])
result_df.head()

In [None]:
# Metric 1 (with substitutes): orders having at least one matched or substitutable item
orders_with_substitute = result_df['num_prod_match'] > 0
metric_sub_1 = len(result_df[orders_with_substitute])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2 (with substitutes): average number of matched or substitutable items per order
total_substitute_matches = result_df['num_prod_match'].sum()
metric_sub_2 = total_substitute_matches / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

In [None]:
# Save the selected_data DataFrame to a CSV file
selected_data.to_csv('selected_products.csv', index=False)

In [None]:
selected_data.head()

In [None]:
# Append the METHOD 2 summary to the same results file METHOD 1 writes to.
# The earlier path ('results_refrigerated.txt') and the "Pure total sales" label were
# copy-paste leftovers: this notebook works on the *_other files, and METHOD 2 is the
# allocation weighted by total sales per department-aisle.
file_path = 'results_other.txt'

# Mode 'a' appends, so results from earlier runs and from METHOD 1 are kept
with open(file_path, 'a') as file:
    file.write("METHOD 2: Selection based on weightage distribution by total sales per department-aisle\n")
    file.write("Metric Calculation without accounting for substitutes for METHOD 2\n")
    file.write(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%\n")
    file.write("Metric Calculation accounting for substitutes for METHOD 2\n")
    file.write(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%\n")
    file.write("\n")

### METHOD 3: Selection based on weightage distribution by median per product sale per department-aisle keeping at least one product each


In [None]:
# Text similarity between product names via TF-IDF vectors and cosine similarity.
# (Same name and logic as the calculate_similarity defined earlier in this notebook;
# running this cell rebinds the name.)
def calculate_similarity(product_names):
    """Return the pairwise cosine-similarity matrix for ``product_names``.

    A ``TfidfVectorizer`` with default settings is fitted on the given names, and
    the cosine similarity of the resulting TF-IDF matrix with itself is returned,
    so entry ``[i, j]`` compares name ``i`` with name ``j``.
    """
    vectorizer = TfidfVectorizer()
    name_vectors = vectorizer.fit_transform(product_names)
    pairwise_similarity = cosine_similarity(name_vectors, name_vectors)
    return pairwise_similarity

In [None]:
# METHOD 3 allocation: every department-aisle combination gets one product, and
# the rest of the budget is handed out round-robin, highest median sale first.
total_products_to_select = 9800

# Sales of a product = number of distinct orders that contain it
product_sales = (
    df.groupby(['department', 'aisle', 'product_name'])['order_id']
    .nunique()
    .reset_index()
)

# Median per-product sale of each combination, ranked from highest to lowest.
# The index is reset AFTER sorting so that position i is also label i; without
# this, `.at[i, ...]` would address the pre-sort (alphabetical) row order and the
# ranking would have no influence on who receives the extra slots.
median_per_product_sale = (
    product_sales.groupby(['department', 'aisle'])['order_id']
    .median()
    .reset_index()
    .sort_values(by='order_id', ascending=False)
    .reset_index(drop=True)
)

# Guarantee one product for each unique department-aisle combination
median_per_product_sale['number_of_products_to_select'] = 1

total_combinations = len(median_per_product_sale)

# Budget that remains after the guaranteed one-per-combination
left_products_to_select = total_products_to_select - total_combinations

# Round-robin distribution in closed form: everybody gets `full_rounds` extra
# products and the `extras` best-ranked combinations get one more.
if left_products_to_select > 0 and total_combinations > 0:
    full_rounds, extras = divmod(left_products_to_select, total_combinations)
    rank = np.arange(total_combinations)
    median_per_product_sale['number_of_products_to_select'] += (
        full_rounds + (rank < extras).astype(int)
    )
    left_products_to_select = 0

# Now the 'median_per_product_sale' DataFrame contains the desired 'number_of_products_to_select'


In [None]:
median_per_product_sale.head()

In [None]:
median_per_product_sale['number_of_products_to_select'].sum()

In [None]:
# Initialize an empty DataFrame to store selected products
selected_products = pd.DataFrame(columns=['product_name', 'department', 'aisle'])

In [None]:
# Pick products for every department-aisle combination, best sellers first.
# Per-combination results are gathered in a list and concatenated once at the
# end, so re-running this cell rebuilds `selected_products` from scratch.
selected_parts = []

for _, row in median_per_product_sale.iterrows():
    department = row['department']
    aisle = row['aisle']
    num_to_select = int(row['number_of_products_to_select'])

    print("Working on department:", department, "aisle:", aisle)
    print("Number of products to select:", num_to_select)

    # Raw order lines of the current department-aisle combination
    products_subset = df[(df['department'] == department) & (df['aisle'] == aisle)]

    # One row per product with its sales (distinct orders), best sellers first
    product_subset = (
        products_subset.groupby('product_name')['order_id']
        .nunique()
        .reset_index(name='total_sales')
        .sort_values(by='total_sales', ascending=False)
    )

    similarity_threshold = 0.5
    num_selected = 0
    selected_indices = []  # positions into the sorted `product_subset`

    # The similarity matrix does not depend on the threshold, so it is computed
    # at most once per combination (lazily: not at all when nothing is requested).
    similarity_matrix = None

    # NOTE(review): on the first pass `selected_indices` is empty, so the
    # similarity test is vacuously true and the pass simply takes the top sellers;
    # candidates are never compared with each other inside a pass. The filter
    # therefore has no effect on the outcome - confirm whether a greedy
    # "not too similar to anything already picked" selection was intended.
    while num_selected < num_to_select and similarity_threshold >= 0:
        print("Current similarity threshold:", similarity_threshold)

        if similarity_matrix is None:
            similarity_matrix = calculate_similarity(product_subset['product_name'])

        already_selected = set(selected_indices)
        indices_to_add = [
            i for i in range(similarity_matrix.shape[0])
            if i not in already_selected
            and all(similarity_matrix[i, j] < similarity_threshold for j in selected_indices)
        ]
        # Never take more than what is still missing
        indices_to_add = indices_to_add[:num_to_select - num_selected]

        selected_indices.extend(indices_to_add)
        num_selected = len(selected_indices)

        similarity_threshold -= 0.1

        print("Number of products selected so far:", num_selected)

    if num_selected < num_to_select:
        # Fewer products exist than requested: report it, but keep what was found
        # instead of discarding the whole combination.
        print("Not enough products found for department:", department, "aisle:", aisle)

    # `selected_indices` are positions in the aggregated `product_subset`, NOT in
    # the raw order lines, so translate them to product names first and then keep
    # one representative raw row per chosen product (this preserves the df
    # columns that later cells rely on).
    chosen_names = product_subset['product_name'].iloc[selected_indices]
    selected_products_subset = (
        products_subset[products_subset['product_name'].isin(chosen_names)]
        .drop_duplicates(subset='product_name')
    )
    selected_parts.append(selected_products_subset)

# Single concat instead of growing a DataFrame inside the loop
selected_products = pd.concat(selected_parts) if selected_parts else df.iloc[0:0]

print("Number of total products selected:", len(selected_products))



In [None]:
# Discard the grouping/order columns that are no longer needed after selection
unneeded_columns = ["aisle", "department", "order_id"]
selected_products = selected_products.drop(columns=unneeded_columns)
selected_products.head()

In [None]:
selected_data = selected_products

In [None]:
test_df = pd.read_csv('test_other_df.csv')

#### Metric Calculation without accounting for substitutes for METHOD 3

In [None]:
# Number of distinct orders in the test split
test_order_ids = test_df['order_id']
tot_order = test_order_ids.nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# Mean basket size: test rows (one per ordered item) divided by distinct orders
n_test_rows = len(test_df)
avg_item = n_test_rows / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# Join the test data with the selection on 'product_id'.
# `selected_data` can contain the same product on several rows; a left join
# against such a table repeats every matching test row once per duplicate and
# inflates the per-order counts, so the selection is reduced to one row per
# product first. `validate` makes pandas raise if the right side is not unique.
selected_unique = selected_data.drop_duplicates(subset='product_id')
test_selected_data = test_df.merge(
    selected_unique, on='product_id', how='left', validate='many_to_one'
)

# Per order, count the non-null 'product_name_y' values, i.e. the order lines
# whose product is part of the selection
metrics_tmp = (
    test_selected_data.groupby('order_id')['product_name_y']
    .count()
    .reset_index()
    .rename(columns={'product_name_y': "num_prod_matching"})
)
metrics_tmp.head()

In [None]:
# Metric 1 (exact matches): number of orders containing at least one selected product
orders_with_match = metrics_tmp['num_prod_matching'] > 0
metric_1 = len(metrics_tmp[orders_with_match])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2 (exact matches): average number of selected products per order
total_matching_items = metrics_tmp['num_prod_matching'].sum()
metric_2 = total_matching_items / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

#### Metric Calculation accounting for substitutes for METHOD 3

In [None]:
# Count, per test order, the order lines whose product is "covered" by the
# selection, i.e. whose name has TF-IDF cosine similarity >= threshold with at
# least one selected product name.
# NOTE: `selected_products` is rebound here from a DataFrame to an array of names
# (kept for compatibility with the rest of the notebook).
selected_products = selected_data['product_name'].unique()
# Unique product names of the test data
unique_product_names = test_df['product_name'].unique()

# Fit the vocabulary on the test names, then project the selected names onto it
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# Rows: unique test product names, columns: selected products
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

similarity_threshold = 0.5

# Coverage is a property of the product name, not of the order, so it is
# evaluated once per unique name instead of once per order line.
has_substitute = (similarity_matrix >= similarity_threshold).any(axis=1)
covered_names = unique_product_names[has_substitute]

# Flag each order line, then add the flags up per order. `sort=False` keeps the
# orders in first-appearance order, matching `unique_order_ids`.
unique_order_ids = test_df['order_id'].unique()
line_is_covered = test_df['product_name'].isin(covered_names)
order_id_dict = (
    line_is_covered.groupby(test_df['order_id'], sort=False)
    .sum()
    .astype(int)
    .to_dict()
)
order_ids_checked = len(order_id_dict)
print(f"Orders checked: {order_ids_checked}")

In [None]:
# Tabulate the per-order substitute-match counts: one row per order
order_match_pairs = list(order_id_dict.items())
result_df = pd.DataFrame(order_match_pairs, columns=['order_id', 'num_prod_match'])
result_df.head()

In [None]:
# Metric 1 (substitutes allowed): number of orders with at least one matched item
orders_with_match = result_df['num_prod_match'] > 0
metric_sub_1 = len(result_df[orders_with_match])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2 (substitutes allowed): average number of matched items per order
total_matched_items = result_df['num_prod_match'].sum()
metric_sub_2 = total_matched_items / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

In [None]:
# Append the METHOD 3 metrics (with and without substitutes) to the results log.
# The file is opened in append mode, so re-running this cell adds a second copy.
# NOTE(review): the data files in this notebook are '*_other_df.csv' and methods
# 4 and 5 write to 'results_other.txt' - confirm this path is intended.
file_path = 'results_refrigerated.txt'

with open(file_path, 'a') as file:
    # The label previously described a different method ("Pure total sales");
    # it now matches the METHOD 3 section of this notebook.
    file.write("METHOD 3: Selection based on weightage distribution by median per product sale per department-aisle keeping at least one product each\n")
    file.write(f"Metric Calculation without accounting for substitutes for METHOD 3\n")
    file.write(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%\n")
    file.write(f"Metric Calculation accounting for substitutes for METHOD 3\n")
    file.write(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%\n")
    file.write("\n")

### METHOD 4: Selection based on weightage distribution by median per product sale per department-aisle with no extra condition

In [None]:
# Pairwise text similarity of product names (TF-IDF vectors + cosine similarity).
# NOTE(review): an identical definition already exists in the METHOD 3 section;
# this cell only re-binds the same function.
def calculate_similarity(product_names):
    """Return the square cosine-similarity matrix of the TF-IDF vectors of ``product_names``.

    The vectorizer is fitted on ``product_names`` itself; row/column ``k`` of the
    result corresponds to the ``k``-th name in iteration order.
    """
    name_vectors = TfidfVectorizer().fit_transform(product_names)
    return cosine_similarity(name_vectors, name_vectors)

In [None]:
# METHOD 4 allocation: nothing is guaranteed per combination; the whole budget
# is handed out round-robin, highest median sale first.
total_products_to_select = 9800

# Sales of a product = number of distinct orders that contain it
product_sales = (
    df.groupby(['department', 'aisle', 'product_name'])['order_id']
    .nunique()
    .reset_index()
)

# Median per-product sale of each combination, ranked from highest to lowest.
# The index is reset AFTER sorting so that position i is also label i; without
# this, `.at[i, ...]` would address the pre-sort (alphabetical) row order and the
# ranking would have no influence on who receives the extra slots.
median_per_product_sale = (
    product_sales.groupby(['department', 'aisle'])['order_id']
    .median()
    .reset_index()
    .sort_values(by='order_id', ascending=False)
    .reset_index(drop=True)
)

# No product is pre-assigned in this method
median_per_product_sale['number_of_products_to_select'] = 0

total_combinations = len(median_per_product_sale)

# Since nothing was pre-assigned, the full budget is still to be distributed
# (subtracting `total_combinations` here would leave the allocation short).
left_products_to_select = total_products_to_select

# Round-robin distribution in closed form: everybody gets `full_rounds`
# products and the `extras` best-ranked combinations get one more.
if left_products_to_select > 0 and total_combinations > 0:
    full_rounds, extras = divmod(left_products_to_select, total_combinations)
    rank = np.arange(total_combinations)
    median_per_product_sale['number_of_products_to_select'] += (
        full_rounds + (rank < extras).astype(int)
    )
    left_products_to_select = 0

# Now the 'median_per_product_sale' DataFrame contains the desired 'number_of_products_to_select'

In [None]:
median_per_product_sale.head()

In [None]:
median_per_product_sale['number_of_products_to_select'].sum()

In [None]:
# Initialize an empty DataFrame to store selected products
selected_products = pd.DataFrame(columns=['product_name', 'department', 'aisle'])

In [None]:
# Pick products for every department-aisle combination, best sellers first.
# Per-combination results are gathered in a list and concatenated once at the
# end, so re-running this cell rebuilds `selected_products` from scratch.
selected_parts = []

for _, row in median_per_product_sale.iterrows():
    department = row['department']
    aisle = row['aisle']
    num_to_select = int(row['number_of_products_to_select'])

    print("Working on department:", department, "aisle:", aisle)
    print("Number of products to select:", num_to_select)

    # Raw order lines of the current department-aisle combination
    products_subset = df[(df['department'] == department) & (df['aisle'] == aisle)]

    # One row per product with its sales (distinct orders), best sellers first
    product_subset = (
        products_subset.groupby('product_name')['order_id']
        .nunique()
        .reset_index(name='total_sales')
        .sort_values(by='total_sales', ascending=False)
    )

    similarity_threshold = 0.5
    num_selected = 0
    selected_indices = []  # positions into the sorted `product_subset`

    # The similarity matrix does not depend on the threshold, so it is computed
    # at most once per combination (lazily: not at all when nothing is requested).
    similarity_matrix = None

    # NOTE(review): on the first pass `selected_indices` is empty, so the
    # similarity test is vacuously true and the pass simply takes the top sellers;
    # candidates are never compared with each other inside a pass. The filter
    # therefore has no effect on the outcome - confirm whether a greedy
    # "not too similar to anything already picked" selection was intended.
    while num_selected < num_to_select and similarity_threshold >= 0:
        print("Current similarity threshold:", similarity_threshold)

        if similarity_matrix is None:
            similarity_matrix = calculate_similarity(product_subset['product_name'])

        already_selected = set(selected_indices)
        indices_to_add = [
            i for i in range(similarity_matrix.shape[0])
            if i not in already_selected
            and all(similarity_matrix[i, j] < similarity_threshold for j in selected_indices)
        ]
        # Never take more than what is still missing
        indices_to_add = indices_to_add[:num_to_select - num_selected]

        selected_indices.extend(indices_to_add)
        num_selected = len(selected_indices)

        similarity_threshold -= 0.1

        print("Number of products selected so far:", num_selected)

    if num_selected < num_to_select:
        # Fewer products exist than requested: report it, but keep what was found
        # instead of discarding the whole combination.
        print("Not enough products found for department:", department, "aisle:", aisle)

    # `selected_indices` are positions in the aggregated `product_subset`, NOT in
    # the raw order lines, so translate them to product names first and then keep
    # one representative raw row per chosen product (this preserves the df
    # columns that later cells rely on).
    chosen_names = product_subset['product_name'].iloc[selected_indices]
    selected_products_subset = (
        products_subset[products_subset['product_name'].isin(chosen_names)]
        .drop_duplicates(subset='product_name')
    )
    selected_parts.append(selected_products_subset)

# Single concat instead of growing a DataFrame inside the loop
selected_products = pd.concat(selected_parts) if selected_parts else df.iloc[0:0]

print("Number of total products selected:", len(selected_products))

In [None]:
# Discard the grouping/order columns that are no longer needed after selection
columns_to_drop = ['department', 'aisle', 'order_id']
selected_products = selected_products.drop(columns_to_drop, axis=1)

In [None]:
selected_data = selected_products

In [None]:
test_df = pd.read_csv('test_other_df.csv')

#### Metric Calculation without accounting for substitutes for METHOD 4

In [None]:
# calculate total unique orders in the test data
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# calculate the average number of items in each order
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# Join the test data with the selection on 'product_id'.
# `selected_data` can contain the same product on several rows; a left join
# against such a table repeats every matching test row once per duplicate and
# inflates the per-order counts, so the selection is reduced to one row per
# product first. `validate` makes pandas raise if the right side is not unique.
selected_unique = selected_data.drop_duplicates(subset='product_id')
test_selected_data = test_df.merge(
    selected_unique, on='product_id', how='left', validate='many_to_one'
)

# Per order, count the non-null 'product_name_y' values, i.e. the order lines
# whose product is part of the selection
metrics_tmp = (
    test_selected_data.groupby('order_id')['product_name_y']
    .count()
    .reset_index()
    .rename(columns={'product_name_y': "num_prod_matching"})
)
metrics_tmp.head()

In [None]:
# calculate metric 1
# count orders with at least a match
metric_1 = len(metrics_tmp[metrics_tmp['num_prod_matching']>0])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# calculate metric 2
metric_2 = metrics_tmp['num_prod_matching'].sum() / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

#### Metric Calculation accounting for substitutes for METHOD 4

In [None]:
# Count, per test order, the order lines whose product is "covered" by the
# selection, i.e. whose name has TF-IDF cosine similarity >= threshold with at
# least one selected product name.
# NOTE: `selected_products` is rebound here from a DataFrame to an array of names
# (kept for compatibility with the rest of the notebook).
selected_products = selected_data['product_name'].unique()
# Unique product names of the test data
unique_product_names = test_df['product_name'].unique()

# Fit the vocabulary on the test names, then project the selected names onto it
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# Rows: unique test product names, columns: selected products
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

similarity_threshold = 0.5

# Coverage is a property of the product name, not of the order, so it is
# evaluated once per unique name instead of once per order line.
has_substitute = (similarity_matrix >= similarity_threshold).any(axis=1)
covered_names = unique_product_names[has_substitute]

# Flag each order line, then add the flags up per order. `sort=False` keeps the
# orders in first-appearance order, matching `unique_order_ids`.
unique_order_ids = test_df['order_id'].unique()
line_is_covered = test_df['product_name'].isin(covered_names)
order_id_dict = (
    line_is_covered.groupby(test_df['order_id'], sort=False)
    .sum()
    .astype(int)
    .to_dict()
)
order_ids_checked = len(order_id_dict)
print(f"Orders checked: {order_ids_checked}")

In [None]:
result_df = pd.DataFrame(order_id_dict.items(), columns=['order_id', 'num_prod_match'])
result_df.head()

In [None]:
# calculate metric 1
# count orders with at least a match
metric_sub_1 = len(result_df[result_df['num_prod_match']>0])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

In [None]:
# calcualte metric 2
metric_sub_2 = result_df['num_prod_match'].sum() / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

In [None]:
# Append the METHOD 4 metrics (with and without substitutes) to the results log.
# The file is opened in append mode, so re-running this cell adds a second copy.
file_path = 'results_other.txt'

with open(file_path, 'a') as file:
    # The label previously described a different method ("Pure total sales");
    # it now matches the METHOD 4 section of this notebook.
    file.write("METHOD 4: Selection based on weightage distribution by median per product sale per department-aisle with no extra condition\n")
    file.write(f"Metric Calculation without accounting for substitutes for METHOD 4\n")
    file.write(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%\n")
    file.write(f"Metric Calculation accounting for substitutes for METHOD 4\n")
    file.write(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%\n")
    file.write("\n")

### METHOD 5: One of each aisle max and remaining using method 1 (top product sales)

In [None]:
# METHOD 5 selection: the best-selling product of every aisle, topped up with the
# overall best sellers among the remaining products until the budget is reached.
# `df` is only read here (never reassigned), so the cell can be re-run safely.
total_products_to_select = 9800

# Sales = number of distinct orders. (Summing the `order_id` values themselves
# would add up identifiers, which says nothing about how often something sold.)
aisle_sales = df.groupby('aisle')['order_id'].nunique().reset_index()
product_sales = df.groupby(['aisle', 'product_name'])['order_id'].nunique().reset_index()

# Best seller of each aisle
max_sale_products = product_sales.loc[product_sales.groupby('aisle')['order_id'].idxmax()]
aisle_top_products = max_sale_products['product_name'].drop_duplicates().tolist()

# Overall best sellers among the products that were not picked above
remaining_df = df[~df['product_name'].isin(aisle_top_products)]
remaining_budget = max(total_products_to_select - len(aisle_top_products), 0)
top_remaining_products = (
    remaining_df.groupby('product_name')['order_id']
    .nunique()
    .sort_values(ascending=False)
    .head(remaining_budget)
    .reset_index()
)

final_product_list = aisle_top_products + top_remaining_products['product_name'].tolist()

# Look the chosen products up in the UNFILTERED training data (the per-aisle
# winners are absent from `remaining_df`) and keep one row per product.
selected_products = (
    df.loc[
        df['product_name'].isin(final_product_list),
        ['product_name', 'product_id', 'aisle_id', 'department_id'],
    ]
    .drop_duplicates(subset='product_id')
    .reset_index(drop=True)
)

# The metric cells below read `selected_data`; without this assignment they
# would silently evaluate the previous method's selection.
selected_data = selected_products



#### Metric Calculation without accounting for substitutes for METHOD 5

In [None]:
# calculate total unique orders in the test data
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# calculate the average number of items in each order
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# Join the test data with the selection on 'product_id'.
# `selected_data` can contain the same product on several rows; a left join
# against such a table repeats every matching test row once per duplicate and
# inflates the per-order counts, so the selection is reduced to one row per
# product first. `validate` makes pandas raise if the right side is not unique.
selected_unique = selected_data.drop_duplicates(subset='product_id')
test_selected_data = test_df.merge(
    selected_unique, on='product_id', how='left', validate='many_to_one'
)

# Per order, count the non-null 'product_name_y' values, i.e. the order lines
# whose product is part of the selection
metrics_tmp = (
    test_selected_data.groupby('order_id')['product_name_y']
    .count()
    .reset_index()
    .rename(columns={'product_name_y': "num_prod_matching"})
)
metrics_tmp.head()

In [None]:
# calculate metric 1
# count orders with at least a match
metric_1 = len(metrics_tmp[metrics_tmp['num_prod_matching']>0])
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# calculate metric 2
metric_2 = metrics_tmp['num_prod_matching'].sum() / len(metrics_tmp)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

#### Metric Calculation accounting for substitutes for METHOD 5

In [None]:
# Count, per test order, the order lines whose product is "covered" by the
# selection, i.e. whose name has TF-IDF cosine similarity >= threshold with at
# least one selected product name.
# NOTE: `selected_products` is rebound here from a DataFrame to an array of names
# (kept for compatibility with the rest of the notebook).
selected_products = selected_data['product_name'].unique()
# Unique product names of the test data
unique_product_names = test_df['product_name'].unique()

# Fit the vocabulary on the test names, then project the selected names onto it
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# Rows: unique test product names, columns: selected products
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

similarity_threshold = 0.5

# Coverage is a property of the product name, not of the order, so it is
# evaluated once per unique name instead of once per order line.
has_substitute = (similarity_matrix >= similarity_threshold).any(axis=1)
covered_names = unique_product_names[has_substitute]

# Flag each order line, then add the flags up per order. `sort=False` keeps the
# orders in first-appearance order, matching `unique_order_ids`.
unique_order_ids = test_df['order_id'].unique()
line_is_covered = test_df['product_name'].isin(covered_names)
order_id_dict = (
    line_is_covered.groupby(test_df['order_id'], sort=False)
    .sum()
    .astype(int)
    .to_dict()
)
order_ids_checked = len(order_id_dict)
print(f"Orders checked: {order_ids_checked}")

In [None]:
result_df = pd.DataFrame(order_id_dict.items(), columns=['order_id', 'num_prod_match'])
result_df.head()

In [None]:
# calculate metric 1
# count orders with at least a match
metric_sub_1 = len(result_df[result_df['num_prod_match']>0])
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

In [None]:
# calcualte metric 2
metric_sub_2 = result_df['num_prod_match'].sum() / len(result_df)
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

In [None]:
# Append the METHOD 5 metrics (with and without substitutes) to the results log.
# The file is opened in append mode, so re-running this cell adds a second copy.
file_path = 'results_other.txt'

with open(file_path, 'a') as file:
    # Header line, then one block for exact matches (metric_1 / metric_2) and one
    # for matches that also accept substitutes (metric_sub_1 / metric_sub_2).
    file.write(f"METHOD 5: One of each aisle max and remaining using method 1 (top product sales)\n")
    file.write(f"Metric Calculation without accounting for substitutes for METHOD 5\n")
    file.write(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%\n")
    file.write(f"Metric Calculation accounting for substitutes for METHOD 5\n")
    file.write(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.\n")
    file.write(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%\n")
    # Blank separator line between method reports
    file.write("\n")

### METHOD 6: First take one max from each aisle and then method 2 for remaining (weightage based on total sales per aisle)

In [None]:
# METHOD 6 allocation: reserve the best seller of every aisle, then split the
# rest of the budget across department-aisle combinations in proportion to their
# sales.
total_products_to_select = 9800

# Sales = number of distinct orders. (Summing the `order_id` values themselves
# would add up identifiers, which says nothing about how often something sold.)
aisle_sales = df.groupby('aisle')['order_id'].nunique().reset_index()
product_sales = df.groupby(['aisle', 'product_name'])['order_id'].nunique().reset_index()

# Best seller of each aisle
max_sale_products = product_sales.loc[product_sales.groupby('aisle')['order_id'].idxmax()]
selected_products1 = max_sale_products['product_name'].tolist()

# Remove the reserved products so that the following cells pick from the rest.
# NOTE: this rebinds `df`, so re-running the cell reserves the next-best products.
df = df[~df['product_name'].isin(selected_products1)]

# Order lines per department-aisle combination
department_aisle_sales = df.groupby(['department', 'aisle'])['order_id'].count().reset_index()
department_aisle_sales['total_sales'] = department_aisle_sales['order_id']

# Budget left after the reserved per-aisle products. It is based on
# `selected_products1` (the list built above), not on whatever `selected_products`
# happens to hold from an earlier cell.
remaining_budget = max(total_products_to_select - len(selected_products1), 0)

sales_share = department_aisle_sales['total_sales'] / department_aisle_sales['total_sales'].sum()
department_aisle_sales['number_of_products_to_select'] = (
    (sales_share * remaining_budget).round().astype(int)
)

# Rounding can hand out a few products too many. Take them back one at a time
# from the largest allocations until the total fits the budget. `excess` is
# positive exactly when there is something to remove, so the loop always ends.
excess = int(department_aisle_sales['number_of_products_to_select'].sum()) - remaining_budget
while excess > 0:
    department_aisle_sales = department_aisle_sales.sort_values(
        by='number_of_products_to_select', ascending=False
    )
    for index in department_aisle_sales.index:
        if excess <= 0:
            break
        if department_aisle_sales.at[index, 'number_of_products_to_select'] > 0:
            department_aisle_sales.at[index, 'number_of_products_to_select'] -= 1
            excess -= 1



In [None]:
# Initialize an empty DataFrame to store selected products
selected_products = pd.DataFrame(columns=['product_name', 'product_id', 'aisle_id', 'department_id'])

In [None]:
# Pick products for every department-aisle combination, best sellers first.
# Per-combination results are gathered in a list and concatenated once at the
# end, so re-running this cell rebuilds `selected_products` from scratch.
selected_parts = []

for _, row in department_aisle_sales.iterrows():
    department = row['department']
    aisle = row['aisle']
    num_to_select = int(row['number_of_products_to_select'])

    print("Working on department:", department, "aisle:", aisle)
    print("Number of products to select:", num_to_select)

    # Raw order lines of the current department-aisle combination
    products_subset = df[(df['department'] == department) & (df['aisle'] == aisle)]

    # One row per product with its sales (distinct orders), best sellers first
    product_subset = (
        products_subset.groupby('product_name')['order_id']
        .nunique()
        .reset_index(name='total_sales')
        .sort_values(by='total_sales', ascending=False)
    )

    similarity_threshold = 0.5
    num_selected = 0
    selected_indices = []  # positions into the sorted `product_subset`

    # The similarity matrix does not depend on the threshold, so it is computed
    # at most once per combination (lazily: not at all when nothing is requested).
    similarity_matrix = None

    # NOTE(review): on the first pass `selected_indices` is empty, so the
    # similarity test is vacuously true and the pass simply takes the top sellers;
    # candidates are never compared with each other inside a pass. The filter
    # therefore has no effect on the outcome - confirm whether a greedy
    # "not too similar to anything already picked" selection was intended.
    while num_selected < num_to_select and similarity_threshold >= 0:
        print("Current similarity threshold:", similarity_threshold)

        if similarity_matrix is None:
            similarity_matrix = calculate_similarity(product_subset['product_name'])

        already_selected = set(selected_indices)
        indices_to_add = [
            i for i in range(similarity_matrix.shape[0])
            if i not in already_selected
            and all(similarity_matrix[i, j] < similarity_threshold for j in selected_indices)
        ]
        # Never take more than what is still missing
        indices_to_add = indices_to_add[:num_to_select - num_selected]

        selected_indices.extend(indices_to_add)
        num_selected = len(selected_indices)

        similarity_threshold -= 0.1

        print("Number of products selected so far:", num_selected)

    if num_selected < num_to_select:
        # Fewer products exist than requested: report it, but keep what was found
        # instead of discarding the whole combination.
        print("Not enough products found for department:", department, "aisle:", aisle)

    # `selected_indices` are positions in the aggregated `product_subset`, NOT in
    # the raw order lines, so translate them to product names first and then keep
    # one representative raw row per chosen product (this preserves the df
    # columns that later cells rely on).
    chosen_names = product_subset['product_name'].iloc[selected_indices]
    selected_products_subset = (
        products_subset[products_subset['product_name'].isin(chosen_names)]
        .drop_duplicates(subset='product_name')
    )
    selected_parts.append(selected_products_subset)

# Single concat instead of growing a DataFrame inside the loop
selected_products = pd.concat(selected_parts) if selected_parts else df.iloc[0:0]

In [None]:
# Combine the per-aisle best sellers (`selected_products1`) with the names of the
# products picked per department-aisle combination (`selected_products`)
final_product_list = selected_products1 + selected_products['product_name'].tolist()

In [None]:
# Look the final product list up in the ORIGINAL training data.
# By this point `df` has had the per-aisle best sellers (`selected_products1`)
# removed, so filtering `df` would silently drop them from the selection.
# NOTE(review): assumes `train_other_df` is still the unfiltered training frame
# that `df` was taken from at the top of the notebook - confirm.
product_columns = ['product_name', 'product_id', 'aisle_id', 'department_id']
selected_products = (
    train_other_df.loc[train_other_df['product_name'].isin(final_product_list), product_columns]
    # one row per product instead of one row per order line
    .drop_duplicates(subset='product_id')
    .reset_index(drop=True)
)

In [None]:
selected_data = selected_products

In [None]:
test_df = pd.read_csv('test_other_df.csv')

#### Metric Calculation without accounting for substitutes for METHOD 6

In [None]:
# calculate total unique orders in the test data
tot_order = test_df['order_id'].nunique()
print(f"Total number of orders: {tot_order}")

In [None]:
# calculate the average number of items in each order
avg_item = len(test_df) / tot_order
print(f"Average number of items in each order: {avg_item}")

In [None]:
# Join the test order lines with the selected products on 'product_id'.
#
# selected_data is a row-level filter of the training data, so the same
# product_id occurs in it once per training row. Merging against it directly
# would copy every matching test row once per occurrence and inflate the
# per-order match counts (and therefore metric 2). Collapse it to one row per
# product first so that each test row can match at most once.
selected_lookup = selected_data.drop_duplicates(subset='product_id')
test_selected_data = test_df.merge(selected_lookup, on='product_id', how='left')

# 'product_name_y' is the product_name column coming from the selected products;
# with a left join it is null for test rows that found no match. count() skips
# nulls, so the result is the number of matched rows per order.
metrics_tmp = (
    test_selected_data[['order_id', 'product_name_y']]
    .groupby('order_id')
    .count()
    .reset_index()
    .rename(columns={'product_name_y': "num_prod_matching"})
)
metrics_tmp.head()

In [None]:
# Metric 1: number of test orders with at least one row matched to a selected product.
order_has_match = metrics_tmp['num_prod_matching'] > 0
metric_1 = int(order_has_match.sum())
print(f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2: matched rows per order, averaged over every order in metrics_tmp,
# reported next to the average basket size.
total_matching_rows = metrics_tmp['num_prod_matching'].sum()
metric_2 = total_matching_rows / metrics_tmp.shape[0]
print(f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%")

#### Metric Calculation accounting for substitutes for METHOD 6

In [None]:
# Count, for every test order, how many of its rows are either a selected
# product or "close enough" to one by TF-IDF cosine similarity of the names.

# Distinct names of the selected products and of the products seen in the test data.
# NOTE: this rebinds selected_products from a DataFrame to an array of names.
selected_products = selected_data['product_name'].unique()
unique_product_names = test_df['product_name'].unique()

# Fit the vocabulary on the test product names, then project the selected
# product names into the same TF-IDF space.
tfidf_vectorizer = TfidfVectorizer()
product_name_matrix = tfidf_vectorizer.fit_transform(unique_product_names)
selected_products_matrix = tfidf_vectorizer.transform(selected_products)

# similarity_matrix[i, j] = cosine similarity between test product i and selected product j
similarity_matrix = cosine_similarity(product_name_matrix, selected_products_matrix)

similarity_threshold = 0.5

# Whether a product has a sufficiently similar selected product depends only on
# the product, not on the order, so decide it once per unique product name
# instead of once per order line.
has_substitute = pd.Series(
    (similarity_matrix >= similarity_threshold).any(axis=1),
    index=unique_product_names,
)

# Flag every test row through that lookup and add the flags up per order.
# sort=False keeps the orders in first-appearance order, matching
# test_df['order_id'].unique().
unique_order_ids = test_df['order_id'].unique()
row_is_match = test_df['product_name'].map(has_substitute).astype(int)
matches_per_order = row_is_match.groupby(test_df['order_id'], sort=False).sum()

# order_id -> number of rows in that order with a match at or above the threshold
order_id_dict = matches_per_order.to_dict()

# Report completion once rather than printing one progress line per order.
order_ids_checked = len(unique_order_ids)
if order_ids_checked:
    print(f"Progress: {((order_ids_checked) / len(unique_order_ids)) * 100}%")

In [None]:
# Turn the order_id -> match-count mapping into a two-column table
# (one row per order) and preview it.
result_df = pd.DataFrame({
    'order_id': list(order_id_dict.keys()),
    'num_prod_match': list(order_id_dict.values()),
})
result_df.head()

In [None]:
# Metric 1 (with substitutes): number of test orders with at least one row that
# matched a selected product under the similarity threshold.
order_has_sub_match = result_df['num_prod_match'] > 0
metric_sub_1 = int(order_has_sub_match.sum())
print(f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.")

In [None]:
# Metric 2 (with substitutes): matched rows per order, averaged over every
# order in result_df, reported next to the average basket size.
total_sub_matches = result_df['num_prod_match'].sum()
metric_sub_2 = total_sub_matches / result_df.shape[0]
print(f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%")

In [None]:
# Append the METHOD 6 summary (both metric variants) to the shared results file.
file_path = 'results_other.txt'

# Assemble the whole report first, then append it in a single write call.
report_lines = [
    f"METHOD 6: First take one max from each aisle and then method 2 for remaining (weightage based on total sales per aisle)\n",
    f"Metric Calculation without accounting for substitutes for METHOD 6\n",
    f"Out of {tot_order} orders, {metric_1} of them utilize the in-aisle items. That's about {metric_1/tot_order*100:.2f}%.\n",
    f"On average there are {avg_item:.2f} items in each order, and {metric_2:.2f} of them utilize in-aisle items. That's about {metric_2/avg_item*100:.2f}%\n",
    f"Metric Calculation accounting for substitutes for METHOD 6\n",
    f"Out of {tot_order} orders, {metric_sub_1} of them utilize the in-aisle items. That's about {metric_sub_1/tot_order*100:.2f}%.\n",
    f"On average there are {avg_item:.2f} items in each order, and {metric_sub_2:.2f} of them utilize in-aisle items. That's about {metric_sub_2/avg_item*100:.2f}%\n",
    "\n",
]

# Mode 'a' keeps whatever earlier methods already wrote to the file.
with open(file_path, 'a') as file:
    file.writelines(report_lines)