In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import ast
import re
from datetime import datetime

Import data

In [2]:
# load the dataset of PCA normalized embeddings: 
file_path = rf'.\..\data\embeddings_dim_reduction\df_user_normalized_PCA_merged20250101_162402.csv' # adjust the file_path with the proper name of the file
df_user_normalized_PCA = pd.read_csv(file_path) 

In [4]:
# Verify the number of unique users:
unique_values_user = df_user_normalized_PCA.user_ID.nunique()
unique_values_user

630476

In [5]:
df_user_normalized_PCA['user_ID'].value_counts()

user_ID
AG73BVBKUOH22USSFJA5ZWL7AKXA      165
AEZP6Z2C5AVQDZAJECQYZWQRNG3Q      146
AEMP3A7IKW37CMWFXNKXWW6HGJHA_1    115
AGZUJTI7A3JFKB4FP5JOH6NVAJIQ_1     87
AFDYIK3FNPY2JFBQYUWC6GSBMIRQ_2     83
                                 ... 
AFSMCZTEUW3TI2BSPE25BD5GKXLA        1
AGKNUO4XOIPCSIKDRHO56UQDPXVQ        1
AEJQRDONU2O5LSOD5OC77XO43DFA        1
AFFFHL7GG5FLD2TSUGU65HTN6FMA        1
AGIYQU6RK6TBKBCMWKVPBPBMMJNA        1
Name: count, Length: 630476, dtype: int64

In [6]:
df_user_normalized_PCA.shape

(692536, 312)

In [7]:
unique_combinations = df_user_normalized_PCA[['user_ID', 'product_ID']].value_counts()
unique_combinations

user_ID                       product_ID
AGWOOXMW2IXPKZOWAIWNMCXY7LBQ  B09NS1VG4L    2
AE222BBOVZIF42YOOPNBXL4UUMYA  B013HR1A92    1
AGPGHQIMPLOJD3FR3ODRDJFYSJBQ  B079D87KKM    1
AGPGGDJBP4W2D3QJ2WN3NWHSPA7Q  B08791HQXG    1
AGPGGF3KFAOMNATUGFSZEMRJ6PVQ  B07TXYVLPS    1
                                           ..
AFENAWCNZDSJANL43HMAQDOIN5QQ  B07D33K512    1
AFENAYIMKNX6PGBHATFCTZS2SAAQ  B008QSM704    1
AFENB2HA5MVZWNKRICDWRXR5PCDA  B001E76F6G    1
AFENBUWI2IGQ5ZBTH4XE36QRIDLA  B07FGFWKXM    1
AHZZZSOTVOVACVK2WWXL4ITEAPIA  B00R1TAN7I    1
Name: count, Length: 692535, dtype: int64

## 1. Create User and Product Vectors

* Aggregating the embeddings to compute a single user vector per user_ID and single product vector per product_ID.
* To create a vector for each user, we can average all the review embeddings corresponding to that user. This works because averaging retains a general sense of the user's overall preferences.
* Similarly, we can aggregate the embeddings for each product. For example, average all the embeddings corresponding to each product ID.

### 1.1 Aggregate User Vectors

In [8]:
# Select relevant columns: user_ID and embedding dimensions
vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_user_vectors = df_user_normalized_PCA.groupby('user_ID')[vector_cols].mean().reset_index()

# Rename the columns of vectors to indicate these are user vectors:
df_user_vectors.columns = ['user_ID'] + [f'user_vec_{i}' for i in range(300)]

df_user_vectors.shape

(630476, 301)

In [9]:
df_user_vectors.head()

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,-0.242854,-0.03092,0.021083,0.178935,0.150784,0.079952,0.032998,0.050808,-0.031869,...,-0.006418,0.001271,0.001262,0.004396,-0.009805,0.001213,-0.006011,0.009523,0.011974,-0.002636
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,-0.092024,-0.088514,-0.154551,0.02428,0.105873,0.005067,0.049238,0.053021,-0.016423,...,-0.002328,0.003291,-0.005497,0.001255,-0.009301,-0.001881,-0.004,-0.004674,-0.002821,0.003872
2,AE222X475JC6ONXMIKZDFGQ7IAUA,0.023666,-0.125203,0.135714,-0.089329,0.014381,0.002513,-0.012603,0.089307,-0.085177,...,0.006148,-0.000662,0.002959,0.001447,-0.003389,-0.003515,-0.005261,-0.000508,0.004328,-0.012459
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,0.082812,0.107795,-0.105254,0.095356,0.157562,-0.068382,-0.022698,0.028111,0.003332,...,-0.003291,0.01017,-0.006862,0.012727,-0.00743,-0.011743,-0.009849,-0.014741,0.008599,0.012631
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,0.030655,0.000936,-0.003937,0.162101,0.020501,-0.131889,0.038334,-0.006152,-0.062543,...,-0.008195,-0.013804,0.008862,0.000283,-0.007259,-0.018776,-0.004691,0.001994,-0.011627,0.005342


In [10]:
# Save the new DataFrame with user vectors
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

file_path_user_vec = rf'.\..\data\text_analysis\user_vectors\user_vectors_{timestamp}.csv'
df_user_vectors.to_csv(file_path_user_vec, index=False)

### 1.2 Aggregate Product Vectors

In [11]:
#product_vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_product_vectors = df_user_normalized_PCA.groupby('product_ID')[vector_cols].mean().reset_index()

df_product_vectors.columns = ['product_ID'] + [f'product_vec_{i}' for i in range(300)]
df_product_vectors.shape

(115576, 301)

In [12]:
df_product_vectors.head()

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,5946468,0.193159,0.052318,-0.032258,-0.019251,-0.081948,0.152584,0.109134,-0.013073,-0.004725,...,-0.011916,0.001801,-0.006467,0.012443,-0.002346,-0.010619,-0.006551,0.000536,-0.011857,0.006535
1,123034892,-0.27244,0.014165,0.016354,-0.073234,-0.019927,0.082716,0.084306,-0.058808,0.063041,...,0.000892,-0.004755,-0.009786,-0.002935,0.003419,-0.003759,0.002519,-0.002213,-0.006096,-0.000439
2,124784577,0.103324,0.088912,0.035171,-0.031267,-0.079123,0.035487,0.028678,-0.053064,0.054158,...,-0.005667,0.005014,0.00527,-0.002312,0.004746,0.006099,0.002386,0.009515,-0.002692,0.003523
3,515059560,0.020084,0.272879,-0.159341,-0.101028,0.011918,0.025192,-0.101624,0.057154,0.056128,...,0.005814,-0.003401,0.003856,0.007813,0.013138,-0.004439,0.00631,-0.002885,-0.017636,0.008292
4,615675026,0.242773,-0.23121,-0.03051,-0.112652,-0.020769,0.076741,-0.05815,0.020834,0.002965,...,-0.001046,-0.009334,0.000204,-0.001505,0.002284,-0.009739,0.003949,-0.004986,8e-06,-0.00492


In [13]:
# Save the new DataFrame with product vectors
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

file_path_product_vec = rf'.\..\data\text_analysis\product_vectors\product_vectors_{timestamp}.csv'
df_product_vectors.to_csv(file_path_product_vec, index=False)

## 2. Semantic Analysis

### Compare Reviews - Similarity Research with COS Similarity

* Extract the user and product vectors.
* Compute the cosine similarity between each user and all products.
* Rank products for each user based on similarity scores.

Create a DataFrame that stores all items that user purchased, to remove those from the recommendations

In [14]:
user_item_df = df_user_normalized_PCA.groupby('user_ID')['product_ID'].apply(list).reset_index()
user_item_df

Unnamed: 0,user_ID,product_ID
0,AE222BBOVZIF42YOOPNBXL4UUMYA,[B013HR1A92]
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,[B0BTT658PQ]
2,AE222X475JC6ONXMIKZDFGQ7IAUA,[B00PBDMRES]
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,[B00012FPSO]
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,[B07QNPXBLH]
...,...,...
630471,AHZZYVEU6QFMPFZ2HJUWR22SNK4A,[B07JJ9NFFH]
630472,AHZZZAK24AJ3JNBDUZJGHHWSRVAA,[B00KXFD75M]
630473,AHZZZJP24QUSB5XWW6MAXYBZZZSQ,[B00IG0677G]
630474,AHZZZL7YQJA3RSA6PYK3WMFACYIQ,[B091TKH1JF]


In [15]:
# Extract user vectors
user_ids = df_user_vectors['user_ID'].values
user_vectors = df_user_vectors.iloc[:, 1:].values

# Extract product vectors
product_ids = df_product_vectors['product_ID'].values
product_vectors = df_product_vectors.iloc[:, 1:].values

In [16]:
product_ids

array(['0005946468', '0123034892', '0124784577', ..., 'B0CBXM7WHY',
       'B0CCPDTRK7', 'B0CFZKJ4KY'], dtype=object)

In [17]:
print(f"Shape of User Vectors: {user_vectors.shape}")
print(f"Shape of Product Vectors: {product_vectors.shape}")

Shape of User Vectors: (630476, 300)
Shape of Product Vectors: (115576, 300)


In [18]:
# # Define the number of closest products to compute
# top_n_products = 10
# # Fit the NearestNeighbors model on the product vectors
# nbrs = NearestNeighbors(n_neighbors=top_n_products, metric='cosine').fit(product_vectors)

# # Retrieve the top-N most similar products for each user
# distances, indices = nbrs.kneighbors(user_vectors)

# # `distances` contains the cosine distances (1 - similarity)
# # Convert distances to similarity scores
# similarity_scores = 1 - distances

In [19]:
# def recommend_top_n_products_by_user_id(user_id, user_ids, similarity_scores, indices, product_ids, user_item_df, top_n=5):
#     """
#     Recommend top N products for a given user based on precomputed top-N cosine similarity.

#     Parameters:
#     - user_id: User ID for whom to generate recommendations
#     - user_ids: List of user IDs corresponding to rows in similarity data
#     - similarity_scores: Precomputed top-N similarity scores (users x top-N products)
#     - indices: Indices of the top-N products for each user
#     - product_ids: List of product IDs corresponding to the product vectors
#     - user_item_df: DataFrame containing user-product interactions
#     - top_n: Number of top recommendations to return

#     Returns:
#     - List of (product_id, similarity_score) tuples
#     """

#     # Find the index of the user_ID
#     if user_id not in user_ids:
#         raise ValueError(f"User ID '{user_id}' not found in the user data.")
    
#     user_index = np.where(user_ids == user_id)[0][0]  # Locate the index of user_id

#     # Get top-N similarity scores and product indices for this user
#     user_similarities = similarity_scores[user_index]
#     user_product_indices = indices[user_index]

#     # Retrieve the list of already purchased products for the user
#     purchased_products = user_item_df.loc[user_item_df['user_ID'] == user_id, 'product_ID']
#     if not purchased_products.empty:
#         # Flatten and handle lists or non-hashable elements in purchased_products
#         if purchased_products.apply(lambda x: isinstance(x, list)).any():
#             purchased_products = set([item for sublist in purchased_products for item in sublist])
#         else:
#             purchased_products = set(purchased_products)
#     else:
#         purchased_products = set()

#     # Filter and sort recommendations
#     recommendations = []
#     for i, product_index in enumerate(user_product_indices):
#         product = product_ids[product_index]
#         if product not in purchased_products:
#             recommendations.append((product, user_similarities[i]))
#         if len(recommendations) >= top_n:  # Stop when we have enough recommendations
#             break
    
#     return recommendations

# # Example usage
# user_id_input = "AGKHLEW2SOWHNMFQIJGBECAF7INQ"  # Replace with user_ID
# top_n = 5

# try:
#     recommendations = recommend_top_n_products_by_user_id(
#         user_id_input, 
#         user_ids, 
#         similarity_scores, 
#         indices, 
#         product_ids, 
#         user_item_df, 
#         top_n
#     )

#     print(f"Top-{top_n} Recommendations for User '{user_id_input}':")
#     for product_id, score in recommendations:
#         print(f"Product ID: {product_id}, Similarity Score: {score:.4f}")
# except ValueError as e:
#     print(e)


In [28]:

def recommend_top_n_products_by_user_id(user_id, user_ids, product_ids, product_vectors, user_vectors, user_item_df, top_n=5):
    """
    Recommend top N products for a given user based on precomputed top-N cosine similarity.

    Parameters:
    - user_id: User ID for whom to generate recommendations
    - user_ids: List of user IDs corresponding to rows in similarity data
    - product_ids: List of product IDs corresponding to the product vectors
    - product_vectors: List of product vectors (embeddings of the products)
    - user_vectors: User vectors for calculating similarity (representations of user preferences)
    - user_item_df: DataFrame containing user-product interactions
    - top_n: Number of top recommendations to return

    Returns:
    - List of (product_id, similarity_score) tuples
    """
    # Find the index of the user_ID
    if user_id not in user_ids:
        raise ValueError(f"User ID '{user_id}' not found in the user data.")
    
    user_index = np.where(user_ids == user_id)[0][0]  # Locate the index of user_id

    # Retrieve the list of already purchased products (product_ID) for the user
    purchased_products = user_item_df.loc[user_item_df['user_ID'] == user_id, 'product_ID']
    if not purchased_products.empty:
        # Flatten and handle lists or non-hashable elements in purchased_products (nested structures)
        if purchased_products.apply(lambda x: isinstance(x, list)).any():
            purchased_products = set([item for sublist in purchased_products for item in sublist])
        else:
            purchased_products = set(purchased_products)
    else:
        purchased_products = set()

    # Filter out the purchased product indices and product vectors
    non_purchased_product_indices = [i for i, product_id in enumerate(product_ids) if product_id not in purchased_products]
    non_purchased_product_vectors = [product_vectors[i] for i in non_purchased_product_indices]

    # Fit the Nearest Neighbors model on the non-purchased products
    nbrs = NearestNeighbors(n_neighbors=top_n, metric='cosine').fit(non_purchased_product_vectors)

    # Retrieve the top-N most similar products for this user from the non-purchased products
    distances, indices = nbrs.kneighbors(user_vectors[user_index].reshape(1, -1))  # Assuming user_vectors is already preprocessed

    # Convert distances to similarity scores
    similarity_scores = 1 - distances

    # Map the indices back to product IDs
    recommended_products = [(product_ids[non_purchased_product_indices[i]], similarity_scores[0][i]) for i in range(top_n)]

    return recommended_products

# Example usage
user_id_input = "AEVTGJFLW22HVSWOJLJCBJUN46WA"  # Replace with user_ID AFQLNQNQYFWQZPJQZS6V3NZU4QBQ & AGKHLEW2SOWHNMFQIJGBECAF7INQ
top_n = 5

try:
    recommendations = recommend_top_n_products_by_user_id(
        user_id_input, 
        user_ids, 
        product_ids, 
        product_vectors, 
        user_vectors, 
        user_item_df, 
        top_n
    )

    print(f"Top-{top_n} Recommendations for User '{user_id_input}':")
    for product_id, score in recommendations:
        print(f"Product ID: {product_id}, Similarity Score: {score:.4f}")
except ValueError as e:
    print(e)

Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [21]:
duplicated_user_ids = df_user_normalized_PCA['user_ID'][df_user_normalized_PCA['user_ID'].duplicated()].unique()
duplicated_user_ids

array(['AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
       'AFSKPY37N3C43SOI5IEXEK5JSIYA', ...,
       'AFV3EYFZLLLBWIXWRZUSRJOHLNBA', 'AEVTGJFLW22HVSWOJLJCBJUN46WA',
       'AHURE3VT2MLCTARMYI7JA7KKDYAA'], dtype=object)

In [29]:
df_PCA_single_user_check = df_user_normalized_PCA[df_user_normalized_PCA['user_ID']=='AEVTGJFLW22HVSWOJLJCBJUN46WA']
df_PCA_single_user_check

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_PCA290,dim_norm_PCA291,dim_norm_PCA292,dim_norm_PCA293,dim_norm_PCA294,dim_norm_PCA295,dim_norm_PCA296,dim_norm_PCA297,dim_norm_PCA298,dim_norm_PCA299
692429,5,Combine with wardrobe.,LOVE THE COLORS AND THE BEDAZZLED.,[],B08LBVL2D6,B08LBVL2D6,AEVTGJFLW22HVSWOJLJCBJUN46WA,2021-02-03 03:25:55.427,0,1,...,,,,,,,,,,
692430,5,Coordinate with wardrobe.,I like the colors and material.,[],B08MVCBZGJ,B08MVCBZGJ,AEVTGJFLW22HVSWOJLJCBJUN46WA,2021-02-03 03:24:41.143,0,1,...,,,,,,,,,,


Analyse recommended product

In [23]:
product_ids_to_filter = ['B0170FP8CC', 'B0BQWTXV2Q','B08JQS9FVP','B09TQ2SDKK','B085TBXF1Z']  # List of product IDs you want to filter
filtered_df = df_user_normalized_PCA[df_user_normalized_PCA['product_ID'].isin(product_ids_to_filter)]
filtered_df

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_PCA290,dim_norm_PCA291,dim_norm_PCA292,dim_norm_PCA293,dim_norm_PCA294,dim_norm_PCA295,dim_norm_PCA296,dim_norm_PCA297,dim_norm_PCA298,dim_norm_PCA299
2397,4,Super Gentle,My skin is really sensitive.<br /><br />I also...,[],B0BQWTXV2Q,B0BQWTXV2Q,AENH6LSB6BM7XLPEYUL43WBOD6JA,2023-02-06 18:45:25.633,0,0,...,-0.000454,-0.015631,0.001277,0.004874,-0.007868,0.003418,-0.006919,0.011494,0.003974,-0.008224
3380,4,Strong scent but works great,Really happy with this concentrate. I did not ...,[],B08JQS9FVP,B08JQS9FVP,AFPAGWUQX3ELC4PWOOATIP7EDMOA,2021-02-12 20:57:00.886,3,1,...,0.020239,-0.022755,-0.008902,-0.013440,-0.010192,0.007216,0.002646,-0.009904,-0.002265,0.002968
4325,3,Strongly scented,The smell is really strong. Not at all subtle....,[],B0170FP8CC,B0170FP8CC,AE5S2ACTELDBCIGS2M5377BIAB7Q,2017-10-13 22:26:41.602,1,1,...,-0.000909,-0.000427,0.002373,0.001671,-0.007052,0.003842,-0.005304,-0.002918,-0.015488,0.012445
8247,5,Great product in a convenient package with ing...,I admit I ordered this product because I loved...,[],B0170FP8CC,B0170FP8CC,AE7XIIPNUJEEMT45BAXWXGOFMSKA,2016-04-24 14:12:44.000,3,1,...,0.003952,-0.011688,-0.007810,0.007134,0.010604,-0.000062,-0.008174,0.011181,0.003260,-0.000943
9333,3,"OK for a dry shampoo, but scent is overpowering",Smells like a grandma's powder room. I really ...,[],B0170FP8CC,B0170FP8CC,AFT65K7NEYPNCN6HT6TRPIEUUWQQ,2016-02-29 15:46:45.000,6,1,...,-0.005798,-0.005635,0.014930,0.012903,-0.006345,-0.000534,-0.011890,0.005047,-0.007495,0.007666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670283,5,Weirdly Effective!,Weirdly effective! I used this product with ze...,[],B0170FP8CC,B0170FP8CC,AHCUZFPTPNAPHD3SAUTBHDJIZQ4A,2016-03-24 18:54:44.000,1,1,...,-0.003829,-0.003003,-0.000603,0.008080,0.008632,-0.002895,0.003991,-0.005007,-0.004113,0.010167
670286,3,Not a bad product.,Nice product. The scent was not as pleasant as...,[],B0170FP8CC,B0170FP8CC,AFOUOYV4PNP24FUUQDTTVY6NPWTA,2016-05-03 00:58:01.000,2,1,...,0.006500,-0.006868,0.000709,0.008141,0.006725,0.005752,0.006550,0.005116,0.000646,-0.003040
670287,5,A natural dry shampoo that does the trick!,Spent a while looking for an all-natural dry s...,[],B0170FP8CC,B0170FP8CC,AGFBBZAQFCJD5VAWDHGGBNX6Z5JQ,2016-03-24 00:44:26.000,8,1,...,-0.003949,-0.027030,0.000594,-0.009969,0.004161,-0.002310,0.019797,0.012089,-0.018807,-0.010738
670288,5,My search is over! Glory be!,I have spent some time trying out dry shampoos...,[],B0170FP8CC,B0170FP8CC,AHGECQRFUZBVECMYIUYU42F6BBHA,2016-03-21 15:59:35.000,2,1,...,0.006149,-0.002670,0.015323,0.001841,0.003552,0.000461,-0.002923,0.003506,-0.006146,-0.007352


In [24]:
pd.set_option('display.max_colwidth', None)

# Display the cleaned_text column
print(filtered_df['cleaned_text'])

2397           my skin is really sensitive.i also dont generally like allinone products. you should see my bathroom cabinets. trying to consolidate, but i get different rashes in different places, and no one thing works for everything.this is very gentle and did not make my skin or face or scalp break out or rash up at all. its not super moisturizing, but it wasnt drying, either. the scent is very light, and the cleanser has a little mintytingly kick. this would have worked for me in my teens or 20s when i couldnt wait to get out of the shower and on with my life.
3380                                                                                                                                            really happy with this concentrate. i did not expect it to work well but am very thankful to be wrong. it thickens to the normal consistency of conditioners after it cools. its very moisturizing for my thick curly hair. my only issue was the scent. very strong herbal scent. it was not