In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re
from datetime import datetime

Import data

In [2]:
# Load the DataFrame that holds the BERT embeddings (columns dim_0 ... dim_767)
# alongside the review metadata.
# NOTE: the path is relative and uses Windows-style separators.
file_path_embeddings = r'.\..\data\embeddings_output\df_user_embeddings_BERT_merged.csv' # Update this with your file path
df_user_embeddings = pd.read_csv(file_path_embeddings)

In [3]:
# Preview the first two rows to confirm the metadata and dim_* columns loaded as expected.
df_user_embeddings.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,-0.002711,-0.460343,0.018323,-0.174986,0.113852,0.09699,-0.149028,-0.27807,-0.180932,0.246443
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,-0.179098,-0.336104,0.01637,0.050976,0.144952,-0.161408,-0.334452,-0.221662,-0.220744,0.071765


In [4]:
# (rows, columns) of the embeddings frame. Note the row count: these rows are later
# paired by position with the rows of df_user.
df_user_embeddings.shape

(692331, 785)

In [5]:
# Names of the raw BERT embedding columns: dim_0 ... dim_767
columns_of_interest = ['dim_%d' % i for i in range(768)]

# A row is flagged when at least one of its embedding values is NaN; sum the flags
num_rows_with_nan = df_user_embeddings.loc[:, columns_of_interest].isnull().any(axis='columns').sum()

# Show how many rows are affected
print(num_rows_with_nan)

0


In [6]:
# Load the cleaned user/review metadata that the embeddings are merged with below.
# NOTE: the path is relative and uses Windows-style separators.
file_path_user_df = r'.\..\data\data_clean\merged_user_meta_df.csv'  # Update this with your file path
df_user = pd.read_csv(file_path_user_df)
df_user.shape

(693941, 14)

In [7]:
# Per-column count of missing values in the user/review metadata
missing_values = df_user.isna().sum()
missing_values

rating                           0
review_title                     0
text_review                      0
user_images                      0
product_ID                       0
parent_ID                        0
user_ID                          0
timestamp                        0
helpful_review_vote              0
user_purchase_verification       0
year                             0
cleaned_text                  1405
parent_asin                      0
cleaned_title                  205
dtype: int64

In [8]:
# Remove rows with a missing value in any column. A new frame is assigned rather than
# mutating in place, so the cell's effect is explicit and chaining-friendly.
df_user = df_user.dropna()

# The cleaned rows are later concatenated *by position* with the embedding rows.
# A length mismatch would silently pair reviews with the wrong vectors (or pad with NaN),
# so fail loudly here instead.
assert len(df_user) == len(df_user_embeddings), (
    f"Row count mismatch: df_user has {len(df_user)} rows after dropna, "
    f"but df_user_embeddings has {len(df_user_embeddings)} rows"
)
df_user.shape

(692331, 14)

### 1. Normalization of BERT embeddings:

In [9]:
# Pick out the embedding columns (`dim_0`, `dim_1`, ..., `dim_767`) by their shared prefix
embedding_columns = list(filter(lambda name: name.startswith('dim_'), df_user_embeddings.columns))
# Extract them as a 2-D NumPy array with one row per DataFrame row
embedding_matrix = df_user_embeddings.loc[:, embedding_columns].to_numpy()
embedding_matrix

array([[ 0.17056495,  0.04877963,  0.4597311 , ..., -0.2780695 ,
        -0.18093206,  0.24644315],
       [ 0.26909658,  0.2322717 ,  0.4237466 , ..., -0.22166176,
        -0.22074397,  0.07176462],
       [ 0.1310498 , -0.05716905,  0.7067867 , ..., -0.32598308,
        -0.50271666, -0.17449133],
       ...,
       [ 0.10488835, -0.18813547,  0.3528914 , ..., -0.28464743,
        -0.16916603, -0.12779067],
       [ 0.05978642,  0.00406198,  0.50121933, ..., -0.30922773,
         0.14436744, -0.17629299],
       [-0.03877427,  0.00091828,  0.28811532, ..., -0.2674641 ,
        -0.12735286,  0.24892004]])

In [10]:
# Euclidean (L2) length of every row; keepdims=True yields shape (n_rows, 1)
l2_norms = np.linalg.norm(embedding_matrix, ord=2, axis=1, keepdims=True)

In [11]:
# Scale each row to unit L2 length; the (n_rows, 1) norms broadcast across the columns
normalized_embeddings = np.divide(embedding_matrix, l2_norms)
normalized_embeddings

array([[ 0.01986643,  0.00568157,  0.05354686, ..., -0.03238795,
        -0.02107394,  0.02870429],
       [ 0.03096874,  0.02673078,  0.04876649, ..., -0.02550974,
        -0.02540412,  0.00825897],
       [ 0.01324259, -0.00577694,  0.07142085, ..., -0.03294062,
        -0.05079956, -0.01763236],
       ...,
       [ 0.01123646, -0.02015455,  0.03780449, ..., -0.03049366,
        -0.01812239, -0.01368994],
       [ 0.00654949,  0.00044498,  0.05490762, ..., -0.03387531,
         0.01581518, -0.01931256],
       [-0.00473396,  0.00011211,  0.03517608, ..., -0.03265477,
        -0.01554855,  0.03039072]])

Generate the output DataFrame with normalized embeddings:

In [12]:
# One output column per embedding dimension: dim_norm_0, dim_norm_1, ...
num_dimensions = normalized_embeddings.shape[1]
norm_embedding_columns_names = ['dim_norm_' + str(i) for i in range(num_dimensions)]

In [13]:
# Wrap the normalized matrix in a DataFrame with one dim_norm_<i> column per dimension.
embeddings_norm = pd.DataFrame(normalized_embeddings, columns=norm_embedding_columns_names)

In [14]:
# Row count of the cleaned metadata; it is concatenated column-wise with embeddings_norm next.
df_user.shape

(692331, 14)

In [15]:
# Attach the normalized embeddings to the cleaned metadata. The metadata index is reset first
# so that both frames share a 0..n-1 index and rows are paired by position.
df_normalized_embeddings = pd.concat(
    objs=[df_user.reset_index(drop=True), embeddings_norm],
    axis='columns',
)
df_normalized_embeddings.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_758,dim_norm_759,dim_norm_760,dim_norm_761,dim_norm_762,dim_norm_763,dim_norm_764,dim_norm_765,dim_norm_766,dim_norm_767
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,-0.000316,-0.053618,0.002134,-0.020381,0.013261,0.011297,-0.017358,-0.032388,-0.021074,0.028704
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,-0.020611,-0.03868,0.001884,0.005867,0.016682,-0.018575,-0.03849,-0.02551,-0.025404,0.008259


In [16]:
# Per-column count of missing values after merging metadata with the normalized embeddings
missing_values = df_normalized_embeddings.isna().sum()
missing_values

rating          0
review_title    0
text_review     0
user_images     0
product_ID      0
               ..
dim_norm_763    0
dim_norm_764    0
dim_norm_765    0
dim_norm_766    0
dim_norm_767    0
Length: 782, dtype: int64

In [17]:
# Names of the normalized embedding columns: dim_norm_0 ... dim_norm_767
columns_of_interest = ['dim_norm_%d' % i for i in range(768)]

# A row is flagged when at least one of its normalized values is NaN; sum the flags
num_rows_with_nan_norm = df_normalized_embeddings.loc[:, columns_of_interest].isnull().any(axis='columns').sum()

# Show how many rows are affected
print(num_rows_with_nan_norm)

0


### 2. Dimensionality reduction with PCA

* Similar steps, but here the output is a final user DataFrame with 300-dimensional vectors. (Initially I chose only 10 components because that is what was done in the article, but the explained variance dropped to 45%, so for now I use 300, which retains ~95%.) We can always adjust this.

In [18]:
pd.options.display.max_colwidth = 50  # Adjust column width
pd.options.display.width = 100        # Adjust width of display

In [19]:
# Perform dimensionality reduction to 300 components using PCA.
# random_state pins the randomized/arpack SVD solvers (which scikit-learn may select
# automatically for data of this size), so repeated runs produce identical components.
pca = PCA(n_components=300, random_state=42)

In [20]:
# Fit PCA on the L2-normalized embeddings and project them onto the components in one step.
pca_embeddings = pca.fit_transform(normalized_embeddings)

In [21]:
# Expect (n_rows, 300): one 300-component vector per input row.
print(pca_embeddings.shape)

(692331, 300)


In [22]:
# Display the projected embeddings (NumPy abbreviates large arrays).
pca_embeddings

array([[-1.93043088e-01, -9.40082550e-05, -3.13472032e-02, ...,
         2.58044285e-03,  2.19500385e-02,  5.14330075e-03],
       [-1.04640375e-01, -1.21517591e-02, -3.88902963e-02, ...,
         4.06126707e-03,  3.28373620e-03, -4.92526029e-04],
       [ 8.56322562e-02,  2.08808937e-01, -4.26144971e-02, ...,
         1.43494994e-02,  5.83178027e-05, -4.30649823e-03],
       ...,
       [ 1.97261689e-02,  2.71948796e-01,  7.76883388e-02, ...,
         2.98825706e-04,  1.00408370e-02, -1.17193217e-02],
       [-5.53435766e-02, -2.48316794e-02,  2.53586109e-01, ...,
        -7.00086893e-03, -2.41649081e-03, -1.48632799e-02],
       [-1.86513871e-01, -3.51712366e-02, -4.08929099e-02, ...,
        -1.86971775e-02,  3.35082544e-03,  2.00230557e-03]])

### 3. PCA - Variance analysis

* Verify that the reduced embeddings retain enough of the variance to still preserve the meaning of the textual data

In [23]:
# Shape of the normalized (pre-PCA) embeddings frame, for comparison with the reduced output.
embeddings_norm.shape

(692331, 768)

In [24]:
# Reuse the PCA model that was already fitted on the normalized embeddings when
# pca_embeddings was produced. Refitting an identical 300-component PCA on the same data
# only repeats an expensive computation, and with a randomized solver it could report the
# variance of a different fit than the one actually used for the saved projections.

# Fraction of the total variance captured by each retained component
explained_variance = pca.explained_variance_ratio_
total_variance = explained_variance.sum()

print("Explained Variance Ratio for 300 Dimensions:", explained_variance)
print("Total Explained Variance Captured by 300 Dimensions:", total_variance)

Explained Variance Ratio for 300 Dimensions: [0.09388509 0.06763783 0.05461393 0.04048936 0.03222465 0.02656826
 0.02530867 0.02390561 0.02212245 0.02004999 0.01728898 0.01607446
 0.01496901 0.01451433 0.01337311 0.01198212 0.01150465 0.01016104
 0.00972864 0.00959086 0.00907417 0.00862593 0.00843039 0.0080386
 0.0078605  0.00746758 0.00727066 0.00710616 0.00694043 0.00666061
 0.0062848  0.00609209 0.0057726  0.00557458 0.00550601 0.00524735
 0.00503372 0.00495834 0.00489018 0.00459477 0.00455589 0.00445912
 0.00426084 0.00419414 0.00413557 0.00410876 0.00378109 0.00371074
 0.0036158  0.00358189 0.00345707 0.00339095 0.00333523 0.00324629
 0.00319223 0.00313115 0.0030613  0.00295225 0.0029167  0.00284382
 0.00278852 0.00276716 0.00272038 0.00267801 0.00258487 0.00254854
 0.00249327 0.00244202 0.00242671 0.00239366 0.00237287 0.00229181
 0.00228183 0.00225001 0.00219681 0.00214667 0.00212555 0.00209668
 0.00206847 0.00205754 0.00199271 0.0019618  0.00195612 0.00191887
 0.00188277 0.0018

### 4. Generate a merged DataFrame from df_user, including columns for embedding dimensions.

In [25]:
# Column names for the normalized embeddings after dimensionality reduction:
# dim_norm_PCA0, dim_norm_PCA1, ...
num_dimensions_pca = pca_embeddings.shape[1]
normalized_columns_PCA = ['dim_norm_PCA' + str(i) for i in range(num_dimensions_pca)]

In [26]:
# Wrap the PCA output in a DataFrame using the dim_norm_PCA<i> column names.
embeddings_df_PCA = pd.DataFrame(pca_embeddings, columns=normalized_columns_PCA)

In [27]:
# Attach the PCA-reduced embeddings to the cleaned metadata; the metadata index is reset
# so that both frames share a 0..n-1 index and rows are paired by position.
df_user_normalized_PCA = pd.concat(
    objs=[df_user.reset_index(drop=True), embeddings_df_PCA],
    axis='columns',
)

In [28]:
# Preview the merged frame: metadata columns followed by the PCA embedding columns.
df_user_normalized_PCA.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_norm_PCA290,dim_norm_PCA291,dim_norm_PCA292,dim_norm_PCA293,dim_norm_PCA294,dim_norm_PCA295,dim_norm_PCA296,dim_norm_PCA297,dim_norm_PCA298,dim_norm_PCA299
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,0.004485,0.005451,-0.002474,-0.003272,-0.000586,0.001671,0.002865,0.00258,0.02195,0.005143
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,0.001461,0.001386,0.003221,-0.005483,0.003578,0.006443,-0.008957,0.004061,0.003284,-0.000493
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,...,0.002023,-0.00431,-0.00386,0.007212,0.001286,0.007776,0.003396,0.014349,5.8e-05,-0.004306
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,...,-0.006163,-0.016397,-0.000758,-0.007233,0.005454,0.000667,-0.012256,-0.00658,0.010938,-0.017094
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,...,0.005643,0.012471,0.005314,-0.001744,0.010407,-0.01262,-0.008812,-0.009284,0.008086,0.001676


In [None]:
# Check every PCA column rather than a hard-coded subset: the previous range(294) skipped
# dim_norm_PCA294 ... dim_norm_PCA299, so NaNs in the last six components went unchecked.
columns_of_interest_PCA = [
    col for col in df_user_normalized_PCA.columns if col.startswith('dim_norm_PCA')
]

# Count rows with NaN in these columns
num_rows_with_nan_norm_PCA = df_user_normalized_PCA[columns_of_interest_PCA].isna().any(axis=1).sum()

# Display the count
print(num_rows_with_nan_norm_PCA)

0


Save the output DataFrame:

In [30]:
# Persist the metadata together with the normalized, PCA-reduced embeddings as CSV
# (the DataFrame index is not written).
file_path = r'.\..\data\embeddings_dim_reduction\df_user_normalized_PCA_merged.csv'
df_user_normalized_PCA.to_csv(path_or_buf=file_path, index=False)