# Modeling 

In [None]:
# Importing libraries
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle
import os
from transformers import pipeline
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reading cleaned dataframes
df_amz = pd.read_csv('./data/clean_df_amz.csv')
df_ec = pd.read_csv('./data/clean_df_ec.csv')

In [3]:
df_ec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433938 entries, 0 to 433937
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     433938 non-null  object 
 1   order_id       433938 non-null  int64  
 2   product_id     433938 non-null  int64  
 3   category_id    433938 non-null  float64
 4   category_code  433938 non-null  object 
 5   brand          419890 non-null  object 
 6   price          433938 non-null  float64
 7   user_id        433938 non-null  float64
 8   event_year     433938 non-null  int64  
 9   year_month     433938 non-null  object 
 10  price_range    433938 non-null  object 
dtypes: float64(3), int64(3), object(5)
memory usage: 36.4+ MB


In [4]:
df_amz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8820 entries, 0 to 8819
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Product Name           8820 non-null   object 
 1   Category               8820 non-null   object 
 2   Selling Price          8820 non-null   float64
 3   About Product          8596 non-null   object 
 4   Product Specification  7277 non-null   object 
 5   Technical Details      8111 non-null   object 
 6   Image                  8820 non-null   object 
 7   Product Url            8820 non-null   object 
 8   has_range              8820 non-null   bool   
 9   price_range            8819 non-null   object 
dtypes: bool(1), float64(1), object(8)
memory usage: 628.9+ KB


## String Search Approach - Data Joining Process

Taking into account that in both of the datasets there is a category column, I will check if there are similarities searching for the same strings within the category column for both of the datasets. 

In [5]:
# Cleaning spaces and lowcase
df_ec["category_code"] = df_ec["category_code"].str.lower().str.strip()
df_amz["Category"] = df_amz["Category"].str.lower().str.strip()

# Extracting first category in df_ec
df_ec["main_category"] = df_ec["category_code"].apply(lambda x: x.split(".")[0] if isinstance(x, str) else x)

# # Extracting first category in df_amz
df_amz["main_category"] = df_amz["Category"].apply(lambda x: x.split("|")[0] if isinstance(x, str) else x)

# Searching for coincidences
unique_ec_categories = df_ec["main_category"].unique()
unique_amz_categories = df_amz["main_category"].unique()

# Both df intersection
matching_categories = set(unique_ec_categories) & set(unique_amz_categories)
matching_categories


set()

There are no direct matches between the categories in df_ec and df_amz. This suggests that:

* The category names may differ in structure or meaning.
* This apporach is not useful in this dataset. 

## Embedding Approach - Data Joining Process

Embedding models transform data (text, images, audio, etc.) into numerical representations in a high-dimensional vector space. These models generate the vectors so that similar elements are closer in that space. This approach will generate those embeddings and search related to the distance, if they are similar, so they can be matched.

In [3]:
# Embedding model 
model = SentenceTransformer("all-MiniLM-L6-v2")

# Unique categories
unique_ec_categories = df_ec["category_code"].dropna().unique()
unique_amz_categories = df_amz["Category"].dropna().unique()

# Embeddings process
ec_embeddings = model.encode(unique_ec_categories, convert_to_tensor=False)
amz_embeddings = model.encode(unique_amz_categories, convert_to_tensor=False)

In [4]:
# Cosine similarity
similarity_matrix = cosine_similarity(ec_embeddings, amz_embeddings)

# Finding better fit 
best_matches = similarity_matrix.argmax(axis=1)

# Creating mapping dictionary
category_mapping = {unique_ec_categories[i]: unique_amz_categories[best_matches[i]] for i in range(len(best_matches))}

# Some of the matches...
for ec_cat, amz_cat in list(category_mapping.items())[:10]:
    print(f"{ec_cat} -> {amz_cat}")


electronics.tablet -> Electronics | Computers & Accessories | Tablet Accessories | Bags, Cases & Sleeves | Cases
electronics.audio.headphone -> Electronics | Headphones | Over-Ear Headphones
furniture.kitchen.table -> Home & Kitchen | Furniture | Kids' Furniture | Tables & Chairs | Tables
electronics.smartphone -> Electronics | Camera & Photo | Video | Camcorders
appliances.kitchen.refrigerators -> Home & Kitchen | Kitchen & Dining | Kitchen Utensils & Gadgets | Kitchen Accessories | Refrigerator Magnets
appliances.personal.scales -> Home & Kitchen | Kitchen & Dining | Small Appliances | Ovens & Toasters | Toasters
electronics.video.tv -> Electronics | Camera & Photo | Video | Camcorders
computers.components.cpu -> Industrial & Scientific | Industrial Electrical | Passive Components | Capacitors
computers.notebook -> Office Products | Office & School Supplies | Paper | Notebooks & Writing Pads | Subject Notebooks
computers.peripherals.monitor -> Electronics | Camera & Photo | Binocular

In [5]:
# Mapping categories to Ecommerce dataset
df_ec["mapped_category"] = df_ec["category_code"].map(category_mapping)

# Succesfull assigned categories
print(df_ec["mapped_category"].notna().sum(), "Succesfull assigned categories")

# Results
df_ec[["category_code", "mapped_category"]].head(10)

433938 Succesfull assigned categories


Unnamed: 0,category_code,mapped_category
0,electronics.tablet,Electronics | Computers & Accessories | Tablet...
1,electronics.tablet,Electronics | Computers & Accessories | Tablet...
2,electronics.audio.headphone,Electronics | Headphones | Over-Ear Headphones
3,electronics.audio.headphone,Electronics | Headphones | Over-Ear Headphones
4,furniture.kitchen.table,Home & Kitchen | Furniture | Kids' Furniture |...
5,electronics.smartphone,Electronics | Camera & Photo | Video | Camcorders
6,electronics.smartphone,Electronics | Camera & Photo | Video | Camcorders
7,electronics.smartphone,Electronics | Camera & Photo | Video | Camcorders
8,electronics.smartphone,Electronics | Camera & Photo | Video | Camcorders
9,appliances.kitchen.refrigerators,Home & Kitchen | Kitchen & Dining | Kitchen Ut...


In [6]:
df_ec["mapped_category"] = df_ec["mapped_category"].str.strip().str.lower()
df_amz["Category"] = df_amz["Category"].str.strip().str.lower()

In [7]:
# Merge keeping all of the products in amazon dataset
df_merged = df_ec.merge(df_amz, left_on="mapped_category", right_on="Category", how="left")

# Display results
df_merged.head()

Unnamed: 0,event_time,order_id,product_id,category_id,category_code,brand,price,user_id,event_year,year_month,...,Product Name,Category,Selling Price,About Product,Product Specification,Technical Details,Image,Product Url,has_range,price_range_y
0,2020-04-24,2294359932054536986,1515966223509089906,2.268105e+18,electronics.tablet,samsung,162.01,1.515916e+18,2020,2020-04,...,Fisher Price Kid-Tough Apptivity Case for Kind...,electronics | computers & accessories | tablet...,26.19,Make sure this fits by entering your model num...,ProductDimensions:8.5x1x5.8inches|ItemWeight:1...,"Your child will be engaged, entertained and le...",https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Fisher-Kid-Tough-Apptiv...,False,17-30
1,2020-04-24,2294359932054536986,1515966223509089906,2.268105e+18,electronics.tablet,samsung,162.01,1.515916e+18,2020,2020-04,...,Fisher Price Kid-Tough Apptivity Case for Kind...,electronics | computers & accessories | tablet...,26.19,Make sure this fits by entering your model num...,ProductDimensions:8.5x1x5.8inches|ItemWeight:1...,"Your child will be engaged, entertained and le...",https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Fisher-Kid-Tough-Apptiv...,False,17-30
2,2020-04-24,2294444024058086220,2273948319057183658,2.268105e+18,electronics.audio.headphone,huawei,77.52,1.515916e+18,2020,2020-04,...,Trolls Poppy Kid Friendly Headphones with Buil...,electronics | headphones | over-ear headphones,17.99,Make sure this fits by entering your model num...,ProductDimensions:5.5x2.5x7inches|ItemWeight:5...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Friendly-Headphones-Lim...,False,17-30
3,2020-04-24,2294444024058086220,2273948319057183658,2.268105e+18,electronics.audio.headphone,huawei,77.52,1.515916e+18,2020,2020-04,...,Trolls Poppy Kid Friendly Headphones with Buil...,electronics | headphones | over-ear headphones,17.99,Make sure this fits by entering your model num...,ProductDimensions:5.5x2.5x7inches|ItemWeight:5...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Friendly-Headphones-Lim...,False,17-30
4,2020-04-26,2295716521449619559,1515966223509261697,2.268105e+18,furniture.kitchen.table,maestro,39.33,1.515916e+18,2020,2020-04,...,Flash Furniture 25''W x 45''L Trapezoid Red HP...,home & kitchen | furniture | kids' furniture |...,117.26,Collaborative Trapezoid Activity Table | Table...,ASIN:B00777W7GE|ShippingWeight:39pounds(Viewsh...,Go to your orders and start the return Select ...,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/Flash-Furniture-Trapezo...,False,100-500


In [10]:
df_merged.columns

Index(['event_time', 'order_id', 'product_id', 'category_id', 'category_code',
       'brand', 'price', 'user_id', 'event_year', 'year_month',
       'price_range_x', 'mapped_category', 'Product Name', 'Category',
       'Selling Price', 'About Product', 'Product Specification',
       'Technical Details', 'Image', 'Product Url', 'has_range',
       'price_range_y'],
      dtype='object')

In [11]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753092 entries, 0 to 753091
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   event_time             753092 non-null  object 
 1   order_id               753092 non-null  int64  
 2   product_id             753092 non-null  int64  
 3   category_id            753092 non-null  float64
 4   category_code          753092 non-null  object 
 5   brand                  719677 non-null  object 
 6   price                  753092 non-null  float64
 7   user_id                753092 non-null  float64
 8   event_year             753092 non-null  int64  
 9   year_month             753092 non-null  object 
 10  price_range_x          753092 non-null  object 
 11  mapped_category        753092 non-null  object 
 12  Product Name           753092 non-null  object 
 13  Category               753092 non-null  object 
 14  Selling Price          753092 non-nu

In [8]:
# Converting user id to str type
df_merged["user_id"] = df_merged["user_id"].apply(lambda x: str(int(float(x))) if 'e' in str(x) else str(x))

In [9]:
df_merged['user_id'].dtype

dtype('O')

In [10]:
print(df_merged["user_id"].head())

0    1515915625441993984
1    1515915625441993984
2    1515915625447879424
3    1515915625447879424
4    1515915625450382848
Name: user_id, dtype: object


In [11]:
# Dropping spaces
df_merged['user_id'] = df_merged['user_id'].str.strip()

In [12]:
df_merged['user_id']

0         1515915625441993984
1         1515915625441993984
2         1515915625447879424
3         1515915625447879424
4         1515915625450382848
                 ...         
753087    1515915625513809920
753088    1515915625514888704
753089    1515915625514891264
753090    1515915625514834176
753091    1515915625514834176
Name: user_id, Length: 753092, dtype: object

In [14]:
# Storing df_merged
df_merged.to_csv("./data/df_merged.csv", index=False) 

## Collaborative Filtering Approach - Recomendation System

Building an user-product matrix to apply a KNN algorythm over it. The only column will be product_id and the user_id will be the index. 


In [28]:
# User-product matriz for KNN 
user_product_matrix = df_merged.pivot_table(index="user_id", columns="product_id", aggfunc="size", fill_value=0)

# Spars Matrix 
sparse_matrix = csr_matrix(user_product_matrix)

In [29]:
# Initialize KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model with the sparse interaction matrix
knn.fit(sparse_matrix)

In [30]:
# Initialize KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model with the sparse interaction matrix
knn.fit(sparse_matrix)

def rs(user_id, n_recommendations=10):
    # Check if the user exists in the dataset
    if user_id not in user_product_matrix.index:
        print(f"⚠️ User {user_id} not found. Returning most popular products.")
        return user_product_matrix.sum(axis=0).sort_values(ascending=False).index[:n_recommendations].tolist()

    # Get the index of the user in the interaction matrix
    user_index = user_product_matrix.index.get_loc(user_id)

    # Find the 10 nearest neighbors using the KNN model
    distances, indices = knn.kneighbors(sparse_matrix[user_index], n_neighbors=10)

    # Get the products that the user has already purchased
    user_purchases = set(user_product_matrix.iloc[user_index].to_numpy().nonzero()[0])

    # Collect products purchased by similar users (neighbors)
    recommended_products = []
    for neighbor_index in indices.flatten():
        if neighbor_index == user_index:
            continue  # Skip self

        # Get purchases of the neighbor
        neighbor_purchases = user_product_matrix.iloc[neighbor_index].to_numpy().nonzero()[0]

        if len(neighbor_purchases) > 0:  # Ensure the neighbor has at least one purchase
            recommended_products.extend(neighbor_purchases)

    # Convert the list to a NumPy array for processing
    recommended_products = np.array(recommended_products)

    if recommended_products.size == 0:
        print(f"⚠️ No neighbor recommendations found for user {user_id}. Returning most popular products.")
        return user_product_matrix.sum(axis=0).sort_values(ascending=False).index[:n_recommendations].tolist()

    # Count the frequency of each recommended product
    unique, counts = np.unique(recommended_products, return_counts=True)

    # Sort products by frequency in descending order
    sorted_indices = unique[np.argsort(-counts)]

    # Convert indices to product names and filter out already purchased products
    final_recommendations = [user_product_matrix.columns[i] for i in sorted_indices if i not in user_purchases]

    # If not enough recommendations, add popular products as fallback
    if len(final_recommendations) < n_recommendations:
        print(f"⚠️ Only {len(final_recommendations)} recommendations found for user {user_id}. Adding most popular products.")
        top_products = user_product_matrix.sum(axis=0).sort_values(ascending=False).index
        final_recommendations.extend([p for p in top_products if p not in user_purchases][:n_recommendations - len(final_recommendations)])

    return final_recommendations[:n_recommendations]


In [31]:
# Testing function
test_users = df_merged["user_id"].astype(int).unique()[:5]  # Tomamos los primeros 3

for user in test_users:
    print(f"\n🔍 Recommendations for user {user}:")
    recommendations = rs(user, n_recommendations=5)
    print(recommendations)



🔍 Recommendations for user 1515915625441993984:
⚠️ Only 0 recommendations found for user 1515915625441993984. Adding most popular products.
[1515966223517846928, 1515966223509106786, 1515966223509088532, 1515966223509088613, 1515966223509088567]

🔍 Recommendations for user 1515915625447879424:
[np.int64(1515966223509088531), np.int64(1515966223509104979), np.int64(1515966223509088567), np.int64(1515966223509089269), np.int64(1515966223509089406)]

🔍 Recommendations for user 1515915625450382848:
⚠️ Only 3 recommendations found for user 1515915625450382848. Adding most popular products.
[np.int64(1515966223509130184), np.int64(1515966223527326232), np.int64(2273948219744453140), 1515966223517846928, 1515966223509106786]

🔍 Recommendations for user 1515915625448766464:
⚠️ Only 0 recommendations found for user 1515915625448766464. Adding most popular products.
[1515966223517846928, 1515966223509106786, 1515966223509088532, 1515966223509088613, 1515966223509088567]

🔍 Recommendations for u

As a lot of the users had 0 recommendations, I will try to debug this reason: 

In [None]:
print("Sample user IDs in user_product_matrix:", user_product_matrix.index[:10]) 
print("Test user IDs:", test_users)

# Checking if testing users are in the matrix
for user in test_users:
    print(f"User {user} in matrix: {user in user_product_matrix.index}")


Sample user IDs in user_product_matrix: Index([ 1.515915625439952e+18, 1.5159156254400384e+18, 1.5159156254400517e+18,
       1.5159156254400998e+18, 1.5159156254401216e+18, 1.5159156254408814e+18,
       1.5159156254409362e+18, 1.5159156254409367e+18,  1.515915625440937e+18,
       1.5159156254409372e+18],
      dtype='float64', name='user_id')
Test user IDs: [1515915625441993984 1515915625447879424 1515915625450382848]
User 1515915625441993984 in matrix: True
User 1515915625447879424 in matrix: True
User 1515915625450382848 in matrix: True


In [None]:
# Checking data types
print("Data type in df_merged:", df_merged["user_id"].dtype)
print("Data type in user_product_matrix:", user_product_matrix.index.dtype)

Data type in df_merged: float64
Data type in user_product_matrix: float64


In [None]:
# Checking interactions
print("Matrix shape:", user_product_matrix.shape)
print("Non-zero interactions:", user_product_matrix.to_numpy().nonzero()[0].size)

Matrix shape: (87683, 372)
Non-zero interactions: 349533


It seems that this approach is not effective taking into consideration that most of the users have purchased 1-2 articles, so there are not enough nearest neighbors to build a strong recommendation system, that is the reason why it is recommending the most purchased articles, not based in the algorythm. 

## Embedding Approach - Recomendation System

As the previous approach was not useful, I will use a embedding approach similar to the one used to map the categories. The main purpose of it is to be able to include text columns that handle information related to the products. 

In [15]:
df_merged = pd.read_csv('./data/df_merged.csv')
df_merged.columns

Index(['event_time', 'order_id', 'product_id', 'category_id', 'category_code',
       'brand', 'price', 'user_id', 'event_year', 'year_month',
       'price_range_x', 'mapped_category', 'Product Name', 'Category',
       'Selling Price', 'About Product', 'Product Specification',
       'Technical Details', 'Image', 'Product Url', 'has_range',
       'price_range_y'],
      dtype='object')

In [16]:
# Taking a look into the most relevant columns for this approach
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753092 entries, 0 to 753091
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   event_time             753092 non-null  object 
 1   order_id               753092 non-null  int64  
 2   product_id             753092 non-null  int64  
 3   category_id            753092 non-null  float64
 4   category_code          753092 non-null  object 
 5   brand                  719677 non-null  object 
 6   price                  753092 non-null  float64
 7   user_id                753092 non-null  int64  
 8   event_year             753092 non-null  int64  
 9   year_month             753092 non-null  object 
 10  price_range_x          753092 non-null  object 
 11  mapped_category        753092 non-null  object 
 12  Product Name           753092 non-null  object 
 13  Category               753092 non-null  object 
 14  Selling Price          753092 non-nu

In [18]:
df_merged['user_id'] = df_merged['user_id'].astype(str)
df_merged['user_id']

0         1515915625441993984
1         1515915625441993984
2         1515915625447879424
3         1515915625447879424
4         1515915625450382848
                 ...         
753087    1515915625513809920
753088    1515915625514888704
753089    1515915625514891264
753090    1515915625514834176
753091    1515915625514834176
Name: user_id, Length: 753092, dtype: object

In [19]:
# Dropping spaces
df_merged['user_id'] = df_merged['user_id'].str.strip()

'About Product' Column

In [20]:
df_merged['About Product'][1]

'Make sure this fits by entering your model number. | Certified "Made for Kindle" Accessory for Kindle Fire(will not fit Kindle Fire HD 7" or 8.9") | Fisher-Price Kid Tough Kindle Fire Apptivity Case lets kids enjoy their very own apps while protecting mom or dad\'s device | Durable case sized just right for kids to hold | Sturdy outer shell | Download free Fisher-Price apps'

In [35]:
df_merged['About Product'].nunique()

61

'Technical Details' Column

In [36]:
df_merged['Technical Details'][1]

"Your child will be engaged, entertained and learning every time he or she uses it. View larger. Fisher-Price® Kid-Tough™ Apptivity™ Case for Kindle Fire How can you show your little one an ‘appsolutely’ fun time using your Kindle Fire with ‘appsolutely’ no worries? With the Fisher-Price® Kid-Tough™ Apptivity™ Case built just for Kindle Fire. This sturdy case (that also comes in pink) is built Kid-Tough™! So it will protect your Kindle Fire from drips and sticky little fingers and whatever else your toddler can dish out! You’ll love the protection, and your child will love the FREE Fisher-Price® learning apps you can download from the Kindle Appstore—no purchase necessary! (Kindle Fire device not included.) The Fisher-Price® Imaginext® Dino-Tech App is a great way for kids to interact with and learn about their favorite dinosaurs! View larger. Kid-Tough™.\xa0Kid Friendly. With a Kid-Tough™ rubberized casing and a clear film over the screen to protect against drips and sticky fingers, t

In [37]:
df_merged['Technical Details'].nunique()

58

'Product Specification' Column

In [38]:
df_merged['Product Specification'][1]

'ProductDimensions:8.5x1x5.8inches|ItemWeight:10.1ounces|ShippingWeight:12.8ounces(Viewshippingratesandpolicies)|DomesticShipping:ItemcanbeshippedwithinU.S.|InternationalShipping:Thisitemisnoteligibleforinternationalshipping.LearnMore|ASIN:B009RH9OJ0|Itemmodelnumber:Y3501|Manufacturerrecommendedage:3yearsandup|Discontinuedbymanufacturer:Yes'

In [39]:
df_merged['Product Specification'].nunique()

49

'Product Name' 

In [40]:
df_merged['Product Name'][1]

'Fisher Price Kid-Tough Apptivity Case for Kindle Fire, Blue (will not fit HD models)'

In [93]:
df_merged['mapped_category'].nunique()

62

In [21]:
# Creating a merged column with all of the text relevant information
df = df_merged

df["product_info"] = (
    df["Product Name"].astype(str) + " " +
    df["Category"].astype(str) + " " +
    df["brand"].astype(str) + " " +
    df["About Product"].astype(str) + " " +
    df["Product Specification"].astype(str)
)


In [43]:
df['product_info'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 753092 entries, 0 to 753091
Series name: product_info
Non-Null Count   Dtype 
--------------   ----- 
753092 non-null  object
dtypes: object(1)
memory usage: 5.7+ MB


In [44]:
# Storing embeddings
def generate_and_save_embeddings(df, filename="embeddings.pkl"):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    unique_product_info = df["product_info"].unique()
    embeddings_list = model.encode(list(unique_product_info))
    embeddings_dict = dict(zip(unique_product_info, embeddings_list))
    with open(filename, "wb") as f:
        pickle.dump(embeddings_dict, f)
    return embeddings_dict

In [46]:
# Loading or generating embeddings
embeddings_file = "./data/embeddings.pkl"
if os.path.exists(embeddings_file):
    with open(embeddings_file, "rb") as f:
        embeddings_dict = pickle.load(f)
else:
    embeddings_dict = generate_and_save_embeddings(df, embeddings_file)

# Mapping embeddings to the dataframe using product_id as the key
df["embeddings"] = df["product_info"].map(embeddings_dict)

# Getting most purchased articules
popular_products = df["product_id"].value_counts().index[:10].tolist()
popular_recommendations = df[df["product_id"].isin(popular_products)][["product_id", "Product Name", "price"]].drop_duplicates()

# Converting embeddings into array 
embedding_matrix = np.stack(df["embeddings"].dropna().values)

In [50]:
# Storing matrix
np.save("./data/embedding_matrix.npy", embedding_matrix)

In [None]:
# Recommendation system
def recommend_products(user_id, df, popular_recommendations, top_n=10):
    # Get the user's purchase history and remove products with missing embeddings
    user_products = df[df["user_id"] == user_id].dropna(subset=["embeddings"])
    print(f"User {user_id} has purchased {len(user_products)} products.")

    # If the user has no purchase history, return diverse popular recommendations
    if user_products.empty:
        available_popular = popular_recommendations.drop_duplicates(subset=["Product Name"])
        return available_popular.sample(n=min(top_n, len(available_popular)), replace=False).to_dict(orient="records")

    # Extract user embeddings from their purchased products
    user_embeddings = np.stack(user_products["embeddings"].values)

    # Get valid embeddings (products with embeddings)
    valid_embeddings = df.dropna(subset=["embeddings"])
    if valid_embeddings.empty:
        available_popular = popular_recommendations.drop_duplicates(subset=["Product Name"])
        return available_popular.sample(n=min(top_n, len(available_popular)), replace=False).to_dict(orient="records")

    embedding_matrix = np.stack(valid_embeddings["embeddings"].values)

    # Compute cosine similarity between user embeddings and all product embeddings
    similarities = cosine_similarity(user_embeddings, embedding_matrix).mean(axis=0)

    # Assign similarity scores to each product
    valid_embeddings = valid_embeddings.copy()
    valid_embeddings["similarity"] = similarities  

    # Remove already purchased products
    purchased_products = set(user_products["product_id"].values)
    df_filtered = valid_embeddings[~valid_embeddings["product_id"].isin(purchased_products)]

    # Sort by similarity (highest first)
    df_filtered = df_filtered.sort_values(by="similarity", ascending=False)

    # Select top products while ensuring variety
    recommended = df_filtered.groupby("Product Name").head(3).reset_index(drop=True).head(top_n)

    # If there are not enough recommendations, fill up with popular products
    if len(recommended) < top_n:
        remaining = top_n - len(recommended)
        available_popular = popular_recommendations.drop_duplicates(subset=["Product Name"])
        additional_products = available_popular.sample(n=min(remaining, len(available_popular)), replace=False)
        recommended = pd.concat([recommended, additional_products])
        print(f"After adding popular products: {len(recommended)} total recommendations.")

    # Ensure we are using only the first image URL (split by '|')
    recommended["Image"] = recommended["Image"].apply(lambda x: x.split('|')[0] if isinstance(x, str) else "")

    # Filter to ensure there are no null values in the required columns
    recommended = recommended.dropna(subset=["Image", "Product Url"])

    # Return the top recommended products with relevant details
    return recommended[["product_id", "Product Name", "price", "Selling Price", "Image", "Product Url", "similarity"]].head(top_n).to_dict(orient="records")


In [58]:
# Testing
user_id = df["user_id"].unique()[5] 
top_recommendations = recommend_products(user_id=user_id, df=df, popular_recommendations=popular_recommendations)
print(top_recommendations)


User 1515915625446798336 has purchased 1 products.
[{'product_id': 1515966223509127751, 'Product Name': 'Uncanny Brands Star Wars Darth Vader Elite 2-Slice Toaster- Star Wars Icon Logo onto Your Toast', 'price': 21.97, 'Selling Price': 42.99, 'Image': 'https://images-na.ssl-images-amazon.com/images/I/41DS9pw8VwL.jpg', 'Product Url': 'https://www.amazon.com/Star-Wars-Darth-Vader-Toaster/dp/B00JFFH1NA', 'similarity': 1.0}, {'product_id': 2273948319115903999, 'Product Name': 'Uncanny Brands Star Wars Darth Vader Elite 2-Slice Toaster- Star Wars Icon Logo onto Your Toast', 'price': 43.96, 'Selling Price': 42.99, 'Image': 'https://images-na.ssl-images-amazon.com/images/I/41DS9pw8VwL.jpg', 'Product Url': 'https://www.amazon.com/Star-Wars-Darth-Vader-Toaster/dp/B00JFFH1NA', 'similarity': 1.0}, {'product_id': 1515966223509088574, 'Product Name': 'Uncanny Brands Star Wars Darth Vader Elite 2-Slice Toaster- Star Wars Icon Logo onto Your Toast', 'price': 15.02, 'Selling Price': 42.99, 'Image': 'h

In [32]:
top_recommendations = recommend_products(user_id='1515915625446798336', df=df, popular_recommendations=popular_recommendations)

User 1515915625446798336 has purchased 1 products.


In [None]:
# Printing some users for testing purposes
for val in df_merged['user_id'].head(10):
    print(int(val)) 

1515915625441993984
1515915625441993984
1515915625447879424
1515915625447879424
1515915625450382848
1515915625450382848
1515915625450382848
1515915625450382848
1515915625450382848
1515915625450382848


In [61]:
top_recommendations = recommend_products(user_id='1515915625441993984', df=df, popular_recommendations=popular_recommendations)
print(top_recommendations)

User 1515915625441993984 has purchased 2 products.
[{'product_id': 1515966223509123398, 'Product Name': 'Fisher Price Kid-Tough Apptivity Case for Kindle Fire, Blue (will not fit HD models)', 'price': 69.42, 'Selling Price': 26.19, 'Image': 'https://images-na.ssl-images-amazon.com/images/I/51LHHv8aC3L.jpg', 'Product Url': 'https://www.amazon.com/Fisher-Kid-Tough-Apptivity-Kindle-models/dp/B009RH9OJ0', 'similarity': 1.0}, {'product_id': 1515966223509089673, 'Product Name': 'Fisher Price Kid-Tough Apptivity Case for Kindle Fire, Blue (will not fit HD models)', 'price': 208.31, 'Selling Price': 26.19, 'Image': 'https://images-na.ssl-images-amazon.com/images/I/51LHHv8aC3L.jpg', 'Product Url': 'https://www.amazon.com/Fisher-Kid-Tough-Apptivity-Kindle-models/dp/B009RH9OJ0', 'similarity': 1.0}, {'product_id': 1515966223509130229, 'Product Name': 'Fisher Price Kid-Tough Apptivity Case for Kindle Fire, Blue (will not fit HD models)', 'price': 601.83, 'Selling Price': 26.19, 'Image': 'https://im

**Future Enhancements:**

* Finding new categories based on text columns: 
For this purpose, it is recommended to use an LLM trained for classification purposes. 

* Define client audiences based on the output of the recommender system.
Categorize clients into different segments based on the recommendations generated by the system, allowing for more targeted insights and actions. These clusters may be defined by selected criteria, such as the prices ranges. 

* Implement a validation approach to compare purchases influenced by different embedding models.
Develop a method to assess and compare how various embedding models impact the categorization of products, by evaluating the purchases that the recommendation system influences.

* Create an API and HTML interface for visualization and testing.
Build an API to serve the recommendation model, accompanied by an HTML interface for visualizing and testing the results interactively.