In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import ast
import re
from datetime import datetime

Import data

In [2]:
# Load the dataset of PCA-reduced, normalized review embeddings.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated form only resolved on Windows.
file_path = '../data/embeddings_dim_reduction/df_user_normalized_PCA20241226_190558.csv'  # adjust the filepath with the proper file
# Pin the ID columns to str so that purely numeric-looking IDs
# (e.g. '0005946468') can never be parsed as integers and lose leading zeros.
df_user_normalized_PCA = pd.read_csv(
    file_path,
    dtype={'product_ID': str, 'parent_ID': str, 'user_ID': str},
)

In [3]:
# Sanity check: count the distinct user IDs present in the review data
unique_values_user = df_user_normalized_PCA['user_ID'].nunique()
unique_values_user

630476

In [4]:
# Preview the first two review rows (metadata columns plus embedding dimensions)
df_user_normalized_PCA.iloc[:2]

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_PCA290,dim_norm_PCA291,dim_norm_PCA292,dim_norm_PCA293,dim_norm_PCA294,dim_norm_PCA295,dim_norm_PCA296,dim_norm_PCA297,dim_norm_PCA298,dim_norm_PCA299
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,-0.00131,8.4e-05,0.009012,-0.004501,0.001905,0.007937,0.005959,-0.003202,-0.004059,0.00202
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,-0.007619,-0.012466,0.01296,0.008836,-0.01119,-0.022729,0.000907,0.000792,-0.001111,-0.001824


## 1. Create User and Product Vectors

* Aggregating the embeddings to compute a single user vector per user_ID and single product vector per product_ID.
* To create a vector for each user, we can average all the review embeddings corresponding to that user. This works because averaging retains a general sense of the user's overall preferences.
* Similarly, we can aggregate the embeddings for each product. For example, average all the embeddings corresponding to each product ID.

### 1.1 Aggregate User Vectors

In [5]:
# Names of the 300 embedding columns: dim_norm_PCA0 ... dim_norm_PCA299
vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]

# One row per user: the mean of every review embedding written by that user.
# Each embedding column is relabelled user_vec_<i> to mark it as a user vector.
df_user_vectors = (
    df_user_normalized_PCA
    .groupby('user_ID')[vector_cols]
    .mean()
    .rename(columns=dict(zip(vector_cols, [f'user_vec_{i}' for i in range(300)])))
    .reset_index()
)

df_user_vectors.shape

(630476, 301)

In [6]:
# Preview the first five aggregated user vectors
df_user_vectors.head(n=5)

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,0.157177,-0.047737,-0.016334,-0.216434,-0.053552,-0.098437,-0.042217,0.004862,0.124219,...,7.4e-05,-0.003814,0.016722,-0.00391,0.001206,0.005939,-0.018628,0.000775,-0.005081,-0.010397
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,0.04597,-0.018998,0.164447,-0.194383,0.170237,0.089281,-0.069243,0.064173,-0.06123,...,0.012602,-0.01437,0.013525,0.015043,0.007685,0.017838,0.014043,0.019642,-0.018238,0.001468
2,AE222X475JC6ONXMIKZDFGQ7IAUA,0.410081,0.025667,-0.12193,0.184316,0.0051,-0.151829,0.046592,0.091186,0.147147,...,0.013997,0.025863,-0.006677,-0.014606,0.005337,0.008045,0.014762,0.001563,0.011351,0.010184
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,-0.204608,-0.027967,-0.074068,0.101387,0.089039,-0.18417,-0.045104,-0.112713,0.019092,...,-0.007948,0.013191,-0.006841,-0.000236,-0.015392,-0.00255,-0.005537,0.006153,0.022717,-0.003635
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,0.306584,-0.059949,0.359815,-0.206749,0.43388,-0.091246,0.383815,0.030456,-0.088499,...,0.000389,-0.002188,0.011391,-0.011571,0.013657,0.018282,-0.001572,0.000488,0.012923,-0.009286


In [7]:
# Save the new DataFrame with user vectors.
# The second-resolution timestamp in the file name gives each export its own file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated form only resolved on Windows.
file_path_user_vec = f'../data/text_analysis/user_vectors/user_vectors_{timestamp}.csv'
df_user_vectors.to_csv(file_path_user_vec, index=False)

### 1.2 Aggregate Product Vectors

In [8]:
# One row per product: the mean of every review embedding written about it.
# `vector_cols` is the embedding column list defined for the user vectors;
# each embedding column is relabelled product_vec_<i>.
df_product_vectors = (
    df_user_normalized_PCA
    .groupby('product_ID')[vector_cols]
    .mean()
    .rename(columns=dict(zip(vector_cols, [f'product_vec_{i}' for i in range(300)])))
    .reset_index()
)
df_product_vectors.shape

(115576, 301)

In [9]:
# Preview the first five aggregated product vectors
df_product_vectors.head(n=5)

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,5946468,0.763349,0.091675,0.117928,0.099401,0.132281,0.0709,-0.223043,-0.012843,-0.009337,...,-0.000546,0.003127,0.005684,0.003426,-0.006655,-0.001777,-0.009676,0.00057,-0.001297,0.007379
1,123034892,0.578087,-0.060032,-0.136238,-0.096158,-0.007847,-0.121033,-0.108942,0.071681,0.014017,...,-0.002423,-0.005551,0.002831,-0.002318,0.00319,0.003064,-0.000896,-0.000299,-0.010328,0.001244
2,124784577,0.055595,-0.048237,0.045403,0.001712,0.206265,0.038459,0.181421,0.045733,-0.04182,...,-0.006233,-0.002329,0.004678,0.007249,-0.001499,0.003474,0.001119,0.008357,-0.00644,0.007834
3,515059560,-0.141502,0.079938,0.102094,-0.057478,-0.011861,-0.127184,-0.016523,-0.101494,-0.038372,...,-0.004293,-0.011067,0.000231,0.007716,-0.00315,0.018886,-0.001149,-0.00464,0.000215,0.00194
4,615675026,-0.187639,0.00682,0.150007,-0.152691,0.051482,-0.066133,-0.137043,-0.074076,0.021345,...,-0.000969,0.008569,-0.00359,-0.01526,0.002899,-0.004042,0.004385,-0.012813,0.003399,-0.00738


In [10]:
# Save the new DataFrame with product vectors.
# The second-resolution timestamp in the file name gives each export its own file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated form only resolved on Windows.
file_path_product_vec = f'../data/text_analysis/product_vectors/product_vectors_{timestamp}.csv'
df_product_vectors.to_csv(file_path_product_vec, index=False)

## 2. Semantic Analysis

### Compare Reviews - Similarity Search with Cosine Similarity

* Extract the user and product vectors.
* Compute the cosine similarity between each user and all products.
* Rank products for each user based on similarity scores.

Create a DataFrame that stores all the items each user has already purchased, so that they can be removed from that user's recommendations.

In [11]:
# For every user, gather the IDs of all products they reviewed into one list
user_item_df = (
    df_user_normalized_PCA
    .groupby('user_ID')['product_ID']
    .apply(list)
    .reset_index()
)
user_item_df

Unnamed: 0,user_ID,product_ID
0,AE222BBOVZIF42YOOPNBXL4UUMYA,[B013HR1A92]
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,[B0BTT658PQ]
2,AE222X475JC6ONXMIKZDFGQ7IAUA,[B00PBDMRES]
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,[B00012FPSO]
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,[B07QNPXBLH]
...,...,...
630471,AHZZYVEU6QFMPFZ2HJUWR22SNK4A,[B07JJ9NFFH]
630472,AHZZZAK24AJ3JNBDUZJGHHWSRVAA,[B00KXFD75M]
630473,AHZZZJP24QUSB5XWW6MAXYBZZZSQ,[B00IG0677G]
630474,AHZZZL7YQJA3RSA6PYK3WMFACYIQ,[B091TKH1JF]


In [26]:
# Number of leading rows (users and products) used for the small-scale
# experiment below. Defined once so the four subsets cannot drift apart.
SUBSET_SIZE = 1000

# Extract user vectors: IDs from the first column, embeddings from the rest
user_ids = df_user_vectors['user_ID'].values
user_vectors = df_user_vectors.iloc[:, 1:].values

# Extract product vectors: IDs from the first column, embeddings from the rest
product_ids = df_product_vectors['product_ID'].values
product_vectors = df_product_vectors.iloc[:, 1:].values

# Subsets: slice the arrays extracted above instead of converting the
# DataFrames to NumPy a second time. Row i of each *_vectors_subset still
# corresponds to entry i of the matching *_ids_subset.
product_vectors_subset = product_vectors[:SUBSET_SIZE]
product_ids_subset = product_ids[:SUBSET_SIZE]

user_vectors_subset = user_vectors[:SUBSET_SIZE]
user_ids_subset = user_ids[:SUBSET_SIZE]

In [23]:
# Confirm the subset dimensions: (number of products, embedding size)
np.shape(product_vectors_subset)

(1000, 300)

In [13]:
# Inspect the product ID array extracted from df_product_vectors
product_ids

array(['0005946468', '0123034892', '0124784577', ..., 'B0CBXM7WHY',
       'B0CCPDTRK7', 'B0CFZKJ4KY'], dtype=object)

In [14]:
# Report the (rows, dimensions) of the full user and product matrices
print(
    f"Shape of User Vectors: {user_vectors.shape}",
    f"Shape of Product Vectors: {product_vectors.shape}",
    sep="\n",
)

Shape of User Vectors: (630476, 300)
Shape of Product Vectors: (115576, 300)


In [15]:
# NOTE(review): presumably left disabled because the dense all-users x
# all-products matrix would hold 630,476 x 115,576 (about 7.3e10) scores,
# which is unlikely to fit in memory - confirm before re-enabling.
# # Compute cosine similarity between all users and all products
# cosine_similarities = cosine_similarity(user_vectors, product_vectors)

# print(f"Cosine Similarity Matrix Shape: {cosine_similarities.shape}")
# # Rows correspond to users, columns correspond to products

In [24]:
# How many nearest products to retrieve for every user
top_n_products = 10

# Build the neighbour index over the product vectors using cosine distance
nbrs = NearestNeighbors(n_neighbors=top_n_products, metric='cosine')
nbrs.fit(product_vectors_subset)

# For each user row: the cosine distances to, and the row positions of,
# its top_n_products closest product vectors
distances, indices = nbrs.kneighbors(user_vectors_subset)

# Cosine distance is defined as 1 - cosine similarity,
# so subtracting from 1 turns the distances back into similarity scores
similarity_scores = 1 - distances

In [28]:
def recommend_top_n_products_by_user_id(user_id, user_ids, similarity_scores, indices, product_ids, user_item_df, top_n=5):
    """
    Recommend top N products for a given user based on precomputed top-N cosine similarity.

    Parameters:
    - user_id: User ID for whom to generate recommendations
    - user_ids: Sequence (list or NumPy array) of user IDs; position i corresponds
      to row i of `similarity_scores` and `indices`
    - similarity_scores: Precomputed top-N similarity scores (users x top-N products)
    - indices: Indices of the top-N products for each user (positions into `product_ids`)
    - product_ids: Sequence of product IDs corresponding to the product vectors
    - user_item_df: DataFrame with 'user_ID' and 'product_ID' columns; a
      'product_ID' cell may hold a single ID or a list of IDs
    - top_n: Maximum number of recommendations to return; values <= 0 give an empty list

    Returns:
    - List of (product_id, similarity_score) tuples, in the order the candidates
      appear in `indices`, with already purchased products left out. May be
      shorter than `top_n` if too few unpurchased candidates are available.

    Raises:
    - ValueError: if `user_id` does not occur in `user_ids`
    """

    # Coerce to an array so that a plain Python list (as documented) supports
    # the elementwise comparison below; `list == str` would just be False.
    user_ids = np.asarray(user_ids)
    matching_rows = np.flatnonzero(user_ids == user_id)
    if matching_rows.size == 0:
        raise ValueError(f"User ID '{user_id}' not found in the user data.")
    user_index = matching_rows[0]  # first occurrence of user_id

    # Nothing to collect when no recommendations are requested
    if top_n <= 0:
        return []

    # Get top-N similarity scores and product indices for this user
    user_similarities = similarity_scores[user_index]
    user_product_indices = indices[user_index]

    # Collect the products the user already purchased. Each cell is handled on
    # its own, so list cells are flattened while scalar IDs are added as-is
    # (a string ID is never split into characters).
    purchased_rows = user_item_df.loc[user_item_df['user_ID'] == user_id, 'product_ID']
    purchased_products = set()
    for entry in purchased_rows:
        if isinstance(entry, (list, tuple, set, np.ndarray)):
            purchased_products.update(entry)
        else:
            purchased_products.add(entry)

    # Walk the candidates in their given order, skipping purchased products,
    # and stop as soon as enough recommendations have been gathered
    recommendations = []
    for product_index, score in zip(user_product_indices, user_similarities):
        product = product_ids[product_index]
        if product in purchased_products:
            continue
        recommendations.append((product, score))
        if len(recommendations) >= top_n:
            break

    return recommendations

# Example usage: recommend products for one user from the 1000-row subsets
user_id_input = "AE222BBOVZIF42YOOPNBXL4UUMYA"  # Replace with user_ID
top_n = 5

try:
    # Keyword arguments make it explicit which array feeds which parameter
    recommendations = recommend_top_n_products_by_user_id(
        user_id=user_id_input,
        user_ids=user_ids_subset,
        similarity_scores=similarity_scores,
        indices=indices,
        product_ids=product_ids_subset,
        user_item_df=user_item_df,
        top_n=top_n,
    )

    print(f"Top-{top_n} Recommendations for User '{user_id_input}':")
    for product_id, score in recommendations:
        print(f"Product ID: {product_id}, Similarity Score: {score:.4f}")
except ValueError as e:
    # Raised when the requested user is not among the supplied user IDs
    print(e)


Top-5 Recommendations for User 'AE222BBOVZIF42YOOPNBXL4UUMYA':
Product ID: B000HTM14W, Similarity Score: 0.7946
Product ID: B000FQT4AQ, Similarity Score: 0.7718
Product ID: B00032A9TM, Similarity Score: 0.6903
Product ID: B0000Y3DSM, Similarity Score: 0.5904
Product ID: B000143M46, Similarity Score: 0.5413


In [17]:
# Users with more than one review: every repeat occurrence of a user_ID is
# flagged by duplicated(), then reduced to the distinct IDs
duplicated_user_ids = df_user_normalized_PCA.loc[
    df_user_normalized_PCA['user_ID'].duplicated(), 'user_ID'
].unique()
duplicated_user_ids

array(['AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
       'AFSKPY37N3C43SOI5IEXEK5JSIYA', ...,
       'AFV3EYFZLLLBWIXWRZUSRJOHLNBA', 'AEVTGJFLW22HVSWOJLJCBJUN46WA',
       'AHURE3VT2MLCTARMYI7JA7KKDYAA'], dtype=object)

In [18]:
# Spot check: pull every review row belonging to one user with several reviews
df_PCA_single_user_check = df_user_normalized_PCA.loc[
    df_user_normalized_PCA['user_ID'].eq('AGKHLEW2SOWHNMFQIJGBECAF7INQ')
]
df_PCA_single_user_check

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_PCA290,dim_norm_PCA291,dim_norm_PCA292,dim_norm_PCA293,dim_norm_PCA294,dim_norm_PCA295,dim_norm_PCA296,dim_norm_PCA297,dim_norm_PCA298,dim_norm_PCA299
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,-0.00131,8.4e-05,0.009012,-0.004501,0.001905,0.007937,0.005959,-0.003202,-0.004059,0.00202
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,-0.007619,-0.012466,0.01296,0.008836,-0.01119,-0.022729,0.000907,0.000792,-0.001111,-0.001824


Analyse recommended product

In [19]:
# List of product IDs you want to filter
product_ids_to_filter = [
    'B0170FP8CC',
    'B0BQWTXV2Q',
    'B08JQS9FVP',
    'B09TQ2SDKK',
    'B085TBXF1Z',
]
# Keep only the review rows that belong to one of those products
filtered_df = df_user_normalized_PCA.loc[
    df_user_normalized_PCA['product_ID'].isin(product_ids_to_filter)
]
filtered_df

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_PCA290,dim_norm_PCA291,dim_norm_PCA292,dim_norm_PCA293,dim_norm_PCA294,dim_norm_PCA295,dim_norm_PCA296,dim_norm_PCA297,dim_norm_PCA298,dim_norm_PCA299
2397,4,Super Gentle,My skin is really sensitive.<br /><br />I also...,[],B0BQWTXV2Q,B0BQWTXV2Q,AENH6LSB6BM7XLPEYUL43WBOD6JA,2023-02-06 18:45:25.633,0,0,...,0.007063,0.002446,-0.006040,-0.008231,0.004288,-0.006763,-0.001727,0.002089,0.000214,-0.000074
3380,4,Strong scent but works great,Really happy with this concentrate. I did not ...,[],B08JQS9FVP,B08JQS9FVP,AFPAGWUQX3ELC4PWOOATIP7EDMOA,2021-02-12 20:57:00.886,3,1,...,0.006424,0.001414,-0.003435,0.002831,0.008546,-0.006090,0.004720,0.003691,0.002900,0.001315
4325,3,Strongly scented,The smell is really strong. Not at all subtle....,[],B0170FP8CC,B0170FP8CC,AE5S2ACTELDBCIGS2M5377BIAB7Q,2017-10-13 22:26:41.602,1,1,...,-0.002320,-0.002075,0.002487,0.003864,0.003495,-0.001506,-0.006819,0.005816,0.004416,0.002107
8247,5,Great product in a convenient package with ing...,I admit I ordered this product because I loved...,[],B0170FP8CC,B0170FP8CC,AE7XIIPNUJEEMT45BAXWXGOFMSKA,2016-04-24 14:12:44.000,3,1,...,0.003089,0.004819,0.013711,0.003629,-0.001966,-0.014495,-0.001875,0.001959,0.000782,0.002939
9333,3,"OK for a dry shampoo, but scent is overpowering",Smells like a grandma's powder room. I really ...,[],B0170FP8CC,B0170FP8CC,AFT65K7NEYPNCN6HT6TRPIEUUWQQ,2016-02-29 15:46:45.000,6,1,...,-0.006272,-0.007236,-0.002280,-0.006038,0.007961,-0.003946,-0.006548,-0.006539,-0.007910,-0.001555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670283,5,Weirdly Effective!,Weirdly effective! I used this product with ze...,[],B0170FP8CC,B0170FP8CC,AHCUZFPTPNAPHD3SAUTBHDJIZQ4A,2016-03-24 18:54:44.000,1,1,...,-0.001899,-0.003507,-0.000793,0.009144,0.004677,-0.002173,-0.004628,-0.012225,0.005429,-0.000884
670286,3,Not a bad product.,Nice product. The scent was not as pleasant as...,[],B0170FP8CC,B0170FP8CC,AFOUOYV4PNP24FUUQDTTVY6NPWTA,2016-05-03 00:58:01.000,2,1,...,-0.003165,0.005141,0.005009,-0.015252,0.009643,-0.014977,0.008718,-0.012248,0.011041,0.010497
670287,5,A natural dry shampoo that does the trick!,Spent a while looking for an all-natural dry s...,[],B0170FP8CC,B0170FP8CC,AGFBBZAQFCJD5VAWDHGGBNX6Z5JQ,2016-03-24 00:44:26.000,8,1,...,-0.000411,-0.005750,-0.002548,-0.000776,0.007142,0.004554,-0.001083,-0.003726,-0.002694,-0.008319
670288,5,My search is over! Glory be!,I have spent some time trying out dry shampoos...,[],B0170FP8CC,B0170FP8CC,AHGECQRFUZBVECMYIUYU42F6BBHA,2016-03-21 15:59:35.000,2,1,...,-0.003751,-0.006232,-0.000384,0.002274,-0.003372,-0.001389,-0.011212,0.001119,0.007097,-0.005818


In [20]:
# Lift the column-width limit so the review texts are printed in full
# (this changes the pandas display option for the rest of the session)
pd.options.display.max_colwidth = None

# Display the cleaned_text column
print(filtered_df.loc[:, 'cleaned_text'])

2397           my skin is really sensitive.i also dont generally like allinone products. you should see my bathroom cabinets. trying to consolidate, but i get different rashes in different places, and no one thing works for everything.this is very gentle and did not make my skin or face or scalp break out or rash up at all. its not super moisturizing, but it wasnt drying, either. the scent is very light, and the cleanser has a little mintytingly kick. this would have worked for me in my teens or 20s when i couldnt wait to get out of the shower and on with my life.
3380                                                                                                                                            really happy with this concentrate. i did not expect it to work well but am very thankful to be wrong. it thickens to the normal consistency of conditioners after it cools. its very moisturizing for my thick curly hair. my only issue was the scent. very strong herbal scent. it was not