In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import ast
import re
from datetime import datetime

Import data

In [2]:
# load the dataset of PCA normalized embeddings: 
file_path = rf'.\..\data\embeddings_dim_reduction\df_user_normalized_PCA.csv' # adjust the filepath with the proper file 
df_user_normalized_PCA = pd.read_csv(file_path) 

In [3]:
# Verify the number of unique users:
unique_values_user = df_user_normalized_PCA.user_ID.nunique()
unique_values_user

630476

In [4]:
df_user_normalized_PCA.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_PCA290,dim_norm_PCA291,dim_norm_PCA292,dim_norm_PCA293,dim_norm_PCA294,dim_norm_PCA295,dim_norm_PCA296,dim_norm_PCA297,dim_norm_PCA298,dim_norm_PCA299
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,-0.00131,8.4e-05,0.009012,-0.004501,0.001905,0.007937,0.005959,-0.003202,-0.004059,0.00202
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,-0.007619,-0.012466,0.01296,0.008836,-0.01119,-0.022729,0.000907,0.000792,-0.001111,-0.001824


## 1. Create User and Product Vectors

* Aggregating the embeddings to compute a single user vector per user_ID and single product vector per product_ID.
* To create a vector for each user, we can average all the review embeddings corresponding to that user. This works because averaging retains a general sense of the user's overall preferences.
* Similarly, we can aggregate the embeddings for each product. For example, average all the embeddings corresponding to each product ID.

### 1.1 Aggregate User Vectors

In [5]:
# Select relevant columns: user_ID and embedding dimensions
vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_user_vectors = df_user_normalized_PCA.groupby('user_ID')[vector_cols].mean().reset_index()

# Rename the columns of vectors to indicate these are user vectors:
df_user_vectors.columns = ['user_ID'] + [f'user_vec_{i}' for i in range(300)]

df_user_vectors.shape

(630476, 301)

In [6]:
df_user_vectors.head()

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,0.157177,-0.047737,-0.016334,-0.216434,-0.053552,-0.098437,-0.042217,0.004862,0.124219,...,7.4e-05,-0.003814,0.016722,-0.00391,0.001206,0.005939,-0.018628,0.000775,-0.005081,-0.010397
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,0.04597,-0.018998,0.164447,-0.194383,0.170237,0.089281,-0.069243,0.064173,-0.06123,...,0.012602,-0.01437,0.013525,0.015043,0.007685,0.017838,0.014043,0.019642,-0.018238,0.001468
2,AE222X475JC6ONXMIKZDFGQ7IAUA,0.410081,0.025667,-0.12193,0.184316,0.0051,-0.151829,0.046592,0.091186,0.147147,...,0.013997,0.025863,-0.006677,-0.014606,0.005337,0.008045,0.014762,0.001563,0.011351,0.010184
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,-0.204608,-0.027967,-0.074068,0.101387,0.089039,-0.18417,-0.045104,-0.112713,0.019092,...,-0.007948,0.013191,-0.006841,-0.000236,-0.015392,-0.00255,-0.005537,0.006153,0.022717,-0.003635
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,0.306584,-0.059949,0.359815,-0.206749,0.43388,-0.091246,0.383815,0.030456,-0.088499,...,0.000389,-0.002188,0.011391,-0.011571,0.013657,0.018282,-0.001572,0.000488,0.012923,-0.009286


In [7]:
# Save the new DataFrame with user vectors
file_path_user_vec = rf'.\..\data\text_analysis\user_vectors\user_vectors.csv'
df_user_vectors.to_csv(file_path_user_vec, index=False)

### 1.2 Aggregate Product Vectors

In [8]:
#product_vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_product_vectors = df_user_normalized_PCA.groupby('product_ID')[vector_cols].mean().reset_index()

df_product_vectors.columns = ['product_ID'] + [f'product_vec_{i}' for i in range(300)]
df_product_vectors.shape

(115576, 301)

In [9]:
df_product_vectors.head()

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,5946468,0.763349,0.091675,0.117928,0.099401,0.132281,0.0709,-0.223043,-0.012843,-0.009337,...,-0.000546,0.003127,0.005684,0.003426,-0.006655,-0.001777,-0.009676,0.00057,-0.001297,0.007379
1,123034892,0.578087,-0.060032,-0.136238,-0.096158,-0.007847,-0.121033,-0.108942,0.071681,0.014017,...,-0.002423,-0.005551,0.002831,-0.002318,0.00319,0.003064,-0.000896,-0.000299,-0.010328,0.001244
2,124784577,0.055595,-0.048237,0.045403,0.001712,0.206265,0.038459,0.181421,0.045733,-0.04182,...,-0.006233,-0.002329,0.004678,0.007249,-0.001499,0.003474,0.001119,0.008357,-0.00644,0.007834
3,515059560,-0.141502,0.079938,0.102094,-0.057478,-0.011861,-0.127184,-0.016523,-0.101494,-0.038372,...,-0.004293,-0.011067,0.000231,0.007716,-0.00315,0.018886,-0.001149,-0.00464,0.000215,0.00194
4,615675026,-0.187639,0.00682,0.150007,-0.152691,0.051482,-0.066133,-0.137043,-0.074076,0.021345,...,-0.000969,0.008569,-0.00359,-0.01526,0.002899,-0.004042,0.004385,-0.012813,0.003399,-0.00738


In [10]:
# Save the new DataFrame with product vectors
file_path_product_vec = rf'.\..\data\text_analysis\product_vectors\product_vectors.csv'
df_product_vectors.to_csv(file_path_product_vec, index=False)

## 2. Calculate COS similarity matrix for 300 closest products for each user

In [12]:
# Extract user vectors
user_ids = df_user_vectors['user_ID'].values
user_vectors = df_user_vectors.iloc[:, 1:].values

# Extract product vectors
product_ids = df_product_vectors['product_ID'].values
product_vectors = df_product_vectors.iloc[:, 1:].values

In [14]:
product_ids

array(['0005946468', '0123034892', '0124784577', ..., 'B0CBXM7WHY',
       'B0CCPDTRK7', 'B0CFZKJ4KY'], dtype=object)

In [15]:
print(f"Shape of User Vectors: {user_vectors.shape}")
print(f"Shape of Product Vectors: {product_vectors.shape}")

Shape of User Vectors: (630476, 300)
Shape of Product Vectors: (115576, 300)


In [23]:
# Define the number of closest products to compute
top_n_products = 300 # find the top 300 closest products for each user instead of computing all pairwise similarities.
# Fit the NearestNeighbors model on the product vectors
nbrs = NearestNeighbors(n_neighbors=top_n_products, metric='cosine').fit(product_vectors)

# Retrieve the top-N most similar products for each user
distances, indices = nbrs.kneighbors(user_vectors)

# `distances` contains the cosine distances (1 - similarity)
# Convert distances to similarity scores
similarity_scores = 1 - distances

In [64]:
# Save the indices and similarity scores
np.savez(r'.\..\data\cos_similarity\recommendations_300.npz', indices=indices, similarity_scores=similarity_scores)