In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re

Import data

In [83]:
# Relative, Windows-style path to the cleaned user-review CSV; the whole file is read into df_user.
file_path = r'.\..\data\data_clean\user_clean_data_ecommerce.csv'  # Update this with your file path
df_user = pd.read_csv(file_path)

In [84]:
df_user['cleaned_text'][0]

'this spray is really nice. it smells really good, goes on really fine, and does the trick. i will say it feels like you need a lot of it though to get the texture i want. i have a lot of hair, medium thickness. i am comparing to other brands with yucky chemicals so im gonna stick with this. try it!'

In [85]:
df_user.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice. it smells really go...
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just..."
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,"smells good, feels great!"
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it


In [86]:
# Drop every row that has a missing value in any column. This modifies df_user in place,
# so the frame displayed by earlier cells no longer matches df_user after this cell runs.
df_user.dropna(inplace=True)
df_user.shape

(692536, 12)

In [87]:
# Character count of every cleaned review, stored as a new column.
df_user['text_length'] = df_user['cleaned_text'].map(len)

# Longest review and mean review length, both measured in characters.
max_length, average_length = (
    df_user['text_length'].max(),
    df_user['text_length'].mean(),
)

print(f'Max text length (characters): {max_length}')
print(f'Average text length (characters): {average_length}')

Max text length (characters): 14692
Average text length (characters): 170.48680357411024


Split the dataset

In [88]:
# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle so the split is repeatable.
train_data, test_data = train_test_split(df_user, test_size=0.2, random_state=42)

In [89]:
print(train_data.shape)
print(test_data.shape)

(554028, 13)
(138508, 13)


Create the embeddings for the first 50 000 rows of the training dataset:

In [90]:
train_subset_data_01 = train_data[:50000].copy()

In [91]:
train_subset_data_unique_users = train_subset_data_01.user_ID.nunique()
train_subset_data_unique_users

49334

### Embeddings with BERT

* BertTokenizer:
This is a tokenizer class from the transformers library provided by Hugging Face. It converts raw text into token IDs that a BERT model can process.
* BERT base uncased:
"Base": Refers to the smaller version of BERT with 12 layers, 768 hidden units, and 110 million parameters.
"Uncased": The text is lowercased before tokenization, so the model does not distinguish between upper and lower case.

In [137]:
# Pick the compute device: the first CUDA GPU ("cuda:0") when torch reports that CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the pretrained 'bert-base-uncased' tokenizer and the matching BERT encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')



In [138]:
# Move the model to the GPU
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### Old function for creating the embeddings:

In [58]:
# # Function to get BERT embeddings
# def get_bert_embeddings(text):
#     # Tokenize input text
#     inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512) # for now I'm gonna keep 512 (so it is max that BERT can take, because some sentences are very long)
    
#     # Move input tensors to the same device as the model (GPU or CPU)
#     inputs = {key: value.to(device) for key, value in inputs.items()}
    
#     # Forward pass through the model
#     outputs = model(**inputs)
    
#     # Get the embeddings (mean of the last hidden state)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy()  # Move the result back to CPU if needed

### Updated function by chunking for long text:

In [139]:
# Updated function to get BERT embeddings with chunking
def get_bert_embeddings(text, max_length=512, overlap=50):
    """
    Generate a mean-pooled BERT embedding for ``text``, chunking long inputs.

    The text is split into WordPiece tokens once. Texts that fit into a single
    model input (``max_length`` positions including the [CLS] and [SEP] special
    tokens) are embedded directly. Longer texts are cut into overlapping windows
    of tokens, each window is embedded separately, and the per-window embeddings
    are averaged.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device`` objects.

    Parameters:
    - text: The review text to embed.
    - max_length: Maximum number of positions per model input, special tokens
      included (512 is the limit of 'bert-base-uncased').
    - overlap: Number of tokens shared by two consecutive windows.

    Returns:
    - 1-D numpy array holding the mean of the last hidden state over all tokens
      (and over all windows for long texts).

    Raises:
    - ValueError: If ``max_length`` leaves no room for content tokens, or if
      ``overlap`` is negative or not smaller than the window size.
    """
    if max_length <= 2:
        raise ValueError("max_length must leave room for the [CLS] and [SEP] tokens.")

    # Two positions of every input are taken by [CLS] and [SEP], so only
    # max_length - 2 content tokens fit; using the full max_length here would
    # make the tokenizer silently truncate the end of each window.
    content_length = max_length - 2
    if not 0 <= overlap < content_length:
        raise ValueError("overlap must be non-negative and smaller than max_length - 2.")
    stride = content_length - overlap

    tokens = tokenizer.tokenize(text)

    def _embed(token_chunk):
        """Embed one window of WordPiece tokens and mean-pool the last hidden state."""
        # Build the ids straight from the tokens instead of converting back to a
        # string and re-tokenizing, which could split a window that starts on a
        # '##' continuation piece differently.
        ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(token_chunk)
        )
        input_ids = torch.tensor([ids], device=device)
        attention_mask = torch.ones_like(input_ids)

        # Inference only: skip autograd bookkeeping so no graph is kept per review.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        return outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()

    chunk_embeddings = []
    start = 0
    while True:
        chunk_embeddings.append(_embed(tokens[start:start + content_length]))
        # Stop as soon as a window reaches the end of the text; a further window
        # would only repeat tokens that are already covered.
        if start + content_length >= len(tokens):
            break
        start += stride

    if len(chunk_embeddings) == 1:
        return chunk_embeddings[0]

    # Aggregate window embeddings (mean pooling across all windows).
    return np.mean(np.stack(chunk_embeddings), axis=0)

In [140]:
print(torch.cuda.is_available())

True


### Creating embeddings with BERT - run for the selected data 

* Generate embeddings for the users' reviews with BERT.
* For each review text, a vector of 768 dimensions is generated and stored in a column called 'embeddings'.
* The generated embeddings are then expanded from lists of dimensions into separate columns (one column per dimension), so 768 columns are created.

In [141]:
# Embed every review of the training subset; Series.apply calls get_bert_embeddings once per row
# and the returned vector is stored in the new 'embeddings' column.
train_subset_data_01['embeddings'] = train_subset_data_01['cleaned_text'].apply(get_bert_embeddings)

# test_data['embeddings'] = test_data['cleaned_text'].apply(get_bert_embeddings)

In [61]:
# Verify if the column with 'embeddings' was created:
train_subset_data_01.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,text_length,embeddings
308320,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,2022,videoidcfdecdbdcfade the look of this is very ...,751,"[-0.33231324, 0.0543172, 0.4494611, 0.07581194..."
41091,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,2015,this stuff is pretty interesting im enjoying t...,109,"[0.049274728, 0.12549251, 0.12806682, 0.248745..."


In [62]:
# From the lists of embeddings generate an array:
train_subset_data_stack = np.stack(train_subset_data_01['embeddings'].values)

In [106]:
train_subset_data_stack

array([[-0.33231324,  0.0543172 ,  0.4494611 , ..., -0.21085577,
         0.05723506,  0.13091712],
       [ 0.04927473,  0.12549251,  0.12806682, ..., -0.23682863,
        -0.08841495, -0.03180593],
       [-0.100202  ,  0.00652486,  0.3701543 , ..., -0.37329656,
        -0.02400355,  0.27764842],
       ...,
       [ 0.10768944, -0.30886057,  0.5483214 , ..., -0.4442686 ,
         0.0786995 , -0.23316805],
       [-0.05036072, -0.18728136,  0.4251778 , ..., -0.37866172,
        -0.20116793,  0.00801033],
       [ 0.23054314,  0.20489387,  0.37537226, ..., -0.38489497,
        -0.10621028, -0.05856288]], dtype=float32)

In [64]:
# Restrict to the first 50,000 rows. This is the same positional slice of train_data that
# train_subset_data_01 was copied from, but without the 'embeddings' column added later:
train_data_50000 = train_data.iloc[:50000].copy()

In [65]:
# Create column names for embedding dimensions:
num_dimensions = train_subset_data_stack.shape[1]
embedding_columns_names = [f'dim_{i}' for i in range(num_dimensions)]

In [66]:
# Create the DataFrame of embeddings where each column contain one dimension of vector:
embedding_df = pd.DataFrame(train_subset_data_stack, columns=embedding_columns_names)

In [67]:
# Add the generated embedding columns to the original user rows. Only the 50,000-row subset is used,
# and its index is reset so that it lines up with the default index of embedding_df:
df_user_embedding_01 = pd.concat([train_data_50000.reset_index(drop=True), embedding_df], axis=1)

In [68]:
df_user_embedding_01.shape

(50000, 781)

In [69]:
df_user_embedding_01.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,...,-0.067687,-0.20555,0.072889,-0.180823,0.028919,0.010474,-0.138984,-0.210856,0.057235,0.130917
1,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,...,-0.303478,0.09138,0.122427,-0.18641,-0.027961,-0.25528,0.025104,-0.236829,-0.088415,-0.031806


In [70]:
# Save the original data frame with generated embeddings (new version with chunks):
df_user_embedding_01.to_csv(r'.\..\data\embeddings_output\df_user_embedding_fullsize_01_ch.csv', index=False)
# df_user_embedding_01.to_csv(r'.\..\data\embeddings_output\df_user_embedding_fullsize_01.csv', index=False)

## Next steps - dimensionality reduction with PCA

* Similar steps, but here the output is a final user dataframe with vectors of 300 dimensions. (Initially I chose only 10 components because that is what was done in the article, but the explained variance dropped to 45%, so for now I have set it to 300, which gives ~95%.) We can always adjust it.

In [23]:
# load the data - if not run just now:

# train_subset_data_ = pd.read_csv(r'.\..\data\embeddings_output\train_subset_embeddings_02.csv') #converters={"embeddings": str})

In [72]:
train_subset_data_01.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,text_length,embeddings
308320,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,2022,videoidcfdecdbdcfade the look of this is very ...,751,"[-0.33231324, 0.0543172, 0.4494611, 0.07581194..."
41091,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,2015,this stuff is pretty interesting im enjoying t...,109,"[0.049274728, 0.12549251, 0.12806682, 0.248745..."


In [73]:
# pca = PCA(n_components=10) # I tried 10 components as specified in the article, but the explained variance dropped significantly (to about 45%).
# Reduce the 768-dimensional embeddings to 300 components. random_state pins the result
# whenever scikit-learn selects a randomized SVD solver, so reruns give the same projection.
pca = PCA(n_components=300, random_state=42)
# train_subset_data = np.stack(train_subset_data_01['embeddings'].values)

In [74]:
train_subset_data_stack

array([[-0.33231324,  0.0543172 ,  0.4494611 , ..., -0.21085577,
         0.05723506,  0.13091712],
       [ 0.04927473,  0.12549251,  0.12806682, ..., -0.23682863,
        -0.08841495, -0.03180593],
       [-0.100202  ,  0.00652486,  0.3701543 , ..., -0.37329656,
        -0.02400355,  0.27764842],
       ...,
       [ 0.10768944, -0.30886057,  0.5483214 , ..., -0.4442686 ,
         0.0786995 , -0.23316805],
       [-0.05036072, -0.18728136,  0.4251778 , ..., -0.37866172,
        -0.20116793,  0.00801033],
       [ 0.23054314,  0.20489387,  0.37537226, ..., -0.38489497,
        -0.10621028, -0.05856288]], dtype=float32)

In [75]:
train_pca_subset = pca.fit_transform(train_subset_data_stack)
# test_pca = pca.transform(test_embeddings)

In [76]:
print(train_pca_subset.shape)

(50000, 300)


In [77]:
train_pca_subset

array([[-2.2205727 ,  0.8206018 , -1.1507324 , ..., -0.01921606,
        -0.05422936,  0.02481445],
       [-0.85250354,  0.78876585,  0.3090128 , ..., -0.08624018,
         0.07009531, -0.09616351],
       [-0.02784091,  0.02324333, -2.0407574 , ...,  0.02132656,
        -0.06447431, -0.0215942 ],
       ...,
       [-1.5288411 , -0.0258608 , -0.570478  , ..., -0.07366744,
        -0.14109671,  0.07452551],
       [-1.1219059 , -0.31601486, -1.5046704 , ...,  0.0679103 ,
         0.05569949,  0.0265916 ],
       [ 0.43417406, -0.77137023,  0.14073612, ..., -0.15397538,
         0.01573911, -0.05586669]], dtype=float32)

Create a dataframe with UserID and embedding dimensions columns:

In [78]:
# Create column names for embedding dimensions
num_dimensions = train_pca_subset.shape[1]
embedding_columns_PCA = [f'dim_{i}' for i in range(num_dimensions)]

In [79]:
embedding_df_PCA = pd.DataFrame(train_pca_subset, columns=embedding_columns_PCA)

In [80]:
final_df_user_PCA = pd.concat([train_data_50000.reset_index(drop=True), embedding_df_PCA], axis=1)

In [81]:
# Save the PCA version with the 'chunk' text modification (hence the _ch suffix in the file name):
final_df_user_PCA.to_csv(r'.\..\data\embeddings_dim_reduction\df_final_user_PCA_ch.csv', index=False)
# final_df_user_PCA.to_csv(r'.\..\data\embeddings_dim_reduction\df_final_user_PCA.csv', index=False)

In [96]:
final_df_user_PCA.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_290,dim_291,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299
0,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,...,0.023845,0.057696,0.002842,0.000553,-0.082094,0.000306,-0.032792,-0.019216,-0.054229,0.024814
1,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,...,-0.094351,0.018936,-0.10471,-0.104361,0.007777,0.098594,-0.002512,-0.08624,0.070095,-0.096164
2,1,Every negative review here is true,Too big and strangely shaped. Can’t use for po...,[],B08YJY6QXX,B08YJY6QXX,AGC3FCMCABBZEG5LLFG2AZSFGLQA,2022-02-21 18:05:21.190,2,1,...,0.050436,0.040444,-0.043215,0.171432,-0.053496,0.150505,-0.002388,0.021327,-0.064474,-0.021594
3,4,Simple,These are okay they give your lips a little ti...,[],B09F74KLTZ,B09F74KLTZ,AFIV7RJS375WQLJNNF7OPBLGIHOQ,2021-10-04 17:17:35.117,0,0,...,0.022718,-0.02746,0.217338,-0.05678,0.092378,-0.008888,0.106759,-0.007923,0.0065,0.012411
4,4,Good Clip Ins,I used these clip ins on a recent 5-day trip a...,[],B07FNPN49W,B07FNPN49W,AHNQJWY7VTX2GIHBKNGD7MS2XMUA,2018-09-06 17:50:02.665,0,1,...,0.080945,-0.065782,0.11769,0.131351,-0.188997,0.006255,0.102548,0.056575,-0.051365,0.026357


In [25]:
# Pairwise cosine similarity between all rows of the PCA-reduced embeddings.
# Requires train_pca_subset from the PCA cell to exist in the current kernel.
# NOTE(review): for the (50000, 300) float32 input this is a dense 50000 x 50000 matrix
# (about 10 GB) — confirm that enough memory is available before running.
similarity_matrix = cosine_similarity(train_pca_subset)

NameError: name 'train_pca_subset' is not defined

In [84]:
similarity_matrix[0][1]

0.10673274

In [87]:
# Cosine similarity between the reduced embeddings of rows 12 and 10 (each passed as a 1-row matrix).
similarity = cosine_similarity(train_pca_subset[[12]], train_pca_subset[[10]])
print("Similarity:", similarity[0][0])

Similarity: 0.30784366


## PCA - Variance analysis

* Verify if the size of reduced embeddings is big enough to still keep the meaning of the textual data

Original embeddings:

In [26]:
user_embeddings_original = pd.read_csv(r'.\..\data\embeddings_output\df_user_embedding_fullsize_01_ch.csv')
user_embeddings_original.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,4,good for the price,I only got 19 and 1 was missing a jewel and I ...,[],B00L62HMQG,B00L62HMQG,AF3BP6BP64XWBLQ6GTZKASW33EEQ,2015-08-05 09:04:21.000,0,1,...,-0.130055,-0.093996,0.183146,-0.123054,-0.168095,0.001715,-0.0283,-0.136207,0.255881,-0.052698
1,5,Five Stars,Really great blush for the price.,[],B00BMW24TU,B00BMW24TU,AGNRWZQ345LQCQ2M2X67JHJPK6OA,2015-08-22 01:13:49.000,0,1,...,-0.115743,-0.310988,-0.029846,-0.399967,0.149936,0.052017,-0.098667,-0.185217,0.117753,-0.354692


In [43]:
user_embeddings_original.shape

(50000, 781)

In [44]:
unique_values_user_original = user_embeddings_original.user_ID.nunique()
unique_values_user_original

49334

In [29]:
original_embeddings = user_embeddings_original[[f"dim_{i}" for i in range(768)]]

In [30]:
# Fit a fresh 300-component PCA on the original 768-dimensional embeddings.
# NOTE(review): this rebinds the name `pca`, replacing any PCA object fitted earlier in the same kernel.
pca = PCA(n_components=300)
pca.fit(original_embeddings)

# Per-component share of the variance, and the total share retained by all fitted components.
explained_variance = pca.explained_variance_ratio_
total_variance = sum(explained_variance)

print("Explained Variance Ratio for 300 Dimensions:", explained_variance)
print("Total Explained Variance Captured by 300 Dimensions:", total_variance)

Explained Variance Ratio for 300 Dimensions: [0.15770545 0.04569084 0.04178757 0.0338528  0.03102464 0.02787922
 0.02636755 0.02194665 0.02034436 0.0183385  0.01677296 0.01598295
 0.01471035 0.01364272 0.01254376 0.01161625 0.01046816 0.01018101
 0.0099389  0.00890585 0.0086756  0.0080092  0.00779689 0.00759841
 0.00722639 0.00686972 0.00682086 0.00650492 0.0063136  0.00610409
 0.00603635 0.00575361 0.00541211 0.0052985  0.00505044 0.00496439
 0.00490285 0.0046469  0.00454809 0.00427476 0.00419961 0.00404253
 0.00397964 0.00389777 0.0037465  0.00370564 0.00361358 0.00351666
 0.00347613 0.00337534 0.00336742 0.00329549 0.00314648 0.00312515
 0.00297752 0.00290766 0.00289047 0.00285901 0.00279358 0.00277922
 0.00272794 0.0026373  0.00260225 0.00255192 0.00245783 0.00243023
 0.00236718 0.0023391  0.00229463 0.00223147 0.00219138 0.00214752
 0.00213039 0.0020896  0.00206988 0.00203307 0.0020144  0.00195577
 0.00191488 0.00186871 0.00184891 0.00182126 0.00179657 0.00174935
 0.00173724 0.001

# Once all embeddings are calculated we can go to the next step (so far calculations are made only for the sample of 50000 rows of the training dataset)

## Create User and Product Vectors

* Aggregating the embeddings to compute a single user vector per user_ID and single product vector per product_ID.
* To create a vector for each user, you can average all the review embeddings corresponding to that user. This works because averaging retains a general sense of the user's overall preferences.
* Similarly, you can aggregate the embeddings for each product. For example, average all the embeddings corresponding to each product ID.

In [41]:
# load the dataset of PCA embeddings: 
final_df_user_PCA = pd.read_csv(r'.\..\data\embeddings_dim_reduction\df_final_user_PCA_ch.csv')

In [None]:
# Verify the number of unique users:
unique_values_user = final_df_user_PCA.user_ID.nunique()
unique_values_user

49334

In [79]:
final_df_user_PCA.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_290,dim_291,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299
0,4,good for the price,I only got 19 and 1 was missing a jewel and I ...,[],B00L62HMQG,B00L62HMQG,AF3BP6BP64XWBLQ6GTZKASW33EEQ,2015-08-05 09:04:21.000,0,1,...,0.023684,-0.200487,-0.199311,-0.138646,0.029778,0.006494,-0.068038,0.025784,0.145278,-0.015662
1,5,Five Stars,Really great blush for the price.,[],B00BMW24TU,B00BMW24TU,AGNRWZQ345LQCQ2M2X67JHJPK6OA,2015-08-22 01:13:49.000,0,1,...,-0.117988,0.022417,0.060439,0.083243,-0.167081,-0.018106,-0.091886,-0.027956,-0.079749,0.010902
2,5,Good,Very nice,[],B094VMYQJP,B094VMYQJP,AGKNJHA77X4LVIXVP56MHF4C22SA,2021-11-11 06:12:08.453,0,1,...,-0.048496,-0.075755,0.01743,0.068834,0.051646,0.043402,-0.035908,0.04453,-0.11166,-0.062793
3,5,Great Value and Easy to Use !!,I bought these bottles to add my pre-mixed pai...,[],B0758V2K3K,B0758V2K3K,AFADUDWDAFJK7YSILOUBAHSR7I5Q,2022-03-06 21:20:40.790,0,1,...,0.008855,0.045144,-0.019571,0.043039,-0.053009,0.051053,0.041208,-0.015779,-0.011635,0.013147
4,5,Best ever,This is the best curling iron I ever had in my...,[],B001MP0QH4,B01N4885PD,AHHVPEX3YEH6Z6Q5PDIJY45SOLKQ,2019-12-06 17:24:50.705,9,1,...,-0.122009,0.052409,-0.013907,0.069467,-0.043739,-0.085013,0.055296,-0.097808,-0.022167,-0.02381


Create a dataframe that stores all items that the user purchased, to remove those from the recommendations

In [76]:
# One row per user, holding the list of every product_ID that appears in that user's reviews.
user_item_df = (
    final_df_user_PCA
    .groupby('user_ID')['product_ID']
    .agg(list)
    .reset_index()
)
user_item_df

Unnamed: 0,user_ID,product_ID
0,AE222BBOVZIF42YOOPNBXL4UUMYA,[B013HR1A92]
1,AE223UUOHC3V2XF4JOTTDDSBODSQ,[B06Y5Y3R5L]
2,AE225T2ZALMW2LGAWZGONMYD2VJA,[B09Y1BWYCR]
3,AE225THXPLSO5QQ3PHQP7SPETM2A,[B07RVPC6X5]
4,AE226AADLLWDPLLWOEGKUCBNZH4A,[B007MAJFD4]
...,...,...
49329,AHZZOHEFU5EO466C2BAPPVV7CEJA,[B08PVXSM6Z]
49330,AHZZOMWUCOOYQWZUIK4QQPKQE46Q,[B08KHCMN8X]
49331,AHZZPMMB4Y5DHXAXOTBFALIKPU7A,[B09GFTS277]
49332,AHZZSYRRKQ4QAOTMYS5RHYWJU4HA,[B00BLWWY7S]


### 1. Aggregate User Vectors

In [33]:
# Average the 300 PCA dimensions of all reviews written by the same user,
# then relabel the dimension columns so they read as user vectors.
user_vector_cols = [f'dim_{i}' for i in range(300)]
df_user_vectors = (
    final_df_user_PCA
    .groupby('user_ID')[user_vector_cols]
    .mean()
    .reset_index()
    .rename(columns={col: col.replace('dim_', 'user_vec_', 1) for col in user_vector_cols})
)

df_user_vectors.shape

In [14]:
df_user_vectors.head()

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,1.193942,-0.031829,-0.545995,1.647325,-1.807717,0.085607,-1.147378,0.331433,1.707212,...,0.152986,0.04813,-0.123529,-0.185242,-0.26888,0.062162,0.003835,-0.081791,-0.077197,-0.119425
1,AE223UUOHC3V2XF4JOTTDDSBODSQ,7.270741,0.962162,0.227079,-1.170762,0.693367,-2.156547,-0.341253,-0.047256,0.562146,...,-0.077601,0.075907,0.050284,0.149839,0.057757,0.048207,-0.0325,-0.105583,-0.046921,-0.064304
2,AE225T2ZALMW2LGAWZGONMYD2VJA,0.174955,-0.948436,1.882202,-0.447089,-0.092247,0.112225,-0.193614,-0.125204,0.054014,...,0.191429,0.162861,0.145655,0.075837,-0.109375,0.035572,0.023073,-0.105869,0.081911,-0.139805
3,AE225THXPLSO5QQ3PHQP7SPETM2A,-1.935537,0.112038,-0.131342,0.424573,0.312248,-0.921699,-0.190502,-0.187693,0.513127,...,-0.038536,0.039262,-0.007145,-0.011664,0.028899,-0.078002,-0.131282,-0.067536,0.03016,-0.014879
4,AE226AADLLWDPLLWOEGKUCBNZH4A,-1.642184,0.869897,0.106127,-0.015727,-0.932361,-0.748179,-0.607914,0.023849,-0.454445,...,-0.019929,0.009594,0.11243,0.026911,0.091934,0.016892,-0.052036,-0.014126,-0.083984,0.025487


In [53]:
# Save the vectors
df_user_vectors.to_csv(r'.\..\data\text_analysis\user_vectors\user_vectors_ch.csv', index=False)

### 2. Aggregate Product Vectors

In [78]:
# Average the 300 PCA dimensions of all reviews written about the same product,
# then relabel the dimension columns so they read as product vectors.
product_vector_cols = [f'dim_{i}' for i in range(300)]
df_product_vectors = (
    final_df_user_PCA
    .groupby('product_ID')[product_vector_cols]
    .mean()
    .reset_index()
    .rename(columns={col: col.replace('dim_', 'product_vec_', 1) for col in product_vector_cols})
)
df_product_vectors.shape

(25915, 301)

In [80]:
df_product_vectors.head()

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,0005946468,6.73088,0.135069,1.12349,-1.27947,0.91243,-1.771424,-0.112034,0.214289,0.129676,...,0.063355,0.001758,-0.136104,0.019494,0.016054,-0.045845,-0.029626,-0.109935,0.105087,0.086668
1,069267599X,-0.525602,0.345588,1.282988,-0.358174,-0.505468,-0.233787,-0.778394,0.085404,-0.501695,...,0.017629,-0.023244,-0.071321,-0.105923,-0.049329,-0.067786,0.009429,-0.019416,0.064645,0.040169
2,0816091846,0.029595,0.861594,0.458656,-0.14302,0.075874,-0.613368,-1.10383,-0.114904,-0.403521,...,-0.053765,0.016102,0.232078,0.120994,0.037491,-0.009946,-0.099035,-0.017921,0.088834,0.049036
3,0977217213,3.834755,0.29167,-0.880154,-0.309399,0.787637,-0.309833,0.748977,-0.470044,0.135276,...,-0.043916,-0.193804,0.018712,0.435845,-0.007382,0.115578,0.043922,-0.301531,-0.078577,-0.266278
4,1421790432,-1.453551,0.149348,1.918752,-0.578319,-0.207081,-0.307189,-1.363799,-0.655058,-0.021827,...,-0.019989,-0.017782,0.097501,0.073597,0.113653,-0.119419,-0.181618,0.002024,0.007175,-0.008132


In [56]:
# Save product vectors
df_product_vectors.to_csv(r'.\..\data\text_analysis\product_vectors\product_vectors_ch.csv', index=False)

# Semantic Analysis

## Compare Reviews - Similarity Research with COS Similarity

* Extract the user and product vectors.
* Compute the cosine similarity between each user and all products.
* Rank products for each user based on similarity scores.

In [58]:
# User side: the ID column on its own, and every remaining column as the vector matrix.
user_ids = df_user_vectors.user_ID.values
user_vectors = df_user_vectors.drop(columns='user_ID').values

# Product side: same split between the ID column and the vector columns.
product_ids = df_product_vectors.product_ID.values
product_vectors = df_product_vectors.drop(columns='product_ID').values

In [60]:
print(f"Shape of User Vectors: {user_vectors.shape}")
print(f"Shape of Product Vectors: {product_vectors.shape}")

Shape of User Vectors: (49334, 300)
Shape of Product Vectors: (25915, 300)


In [61]:
# Compute cosine similarity between all users and all products.
# NOTE(review): the result is a dense users x products matrix; at the (49334, 25915) shape
# printed below that is roughly 10 GB if the values are float64 — confirm memory headroom.
cosine_similarities = cosine_similarity(user_vectors, product_vectors)

print(f"Cosine Similarity Matrix Shape: {cosine_similarities.shape}")
# Rows correspond to users, columns correspond to products

Cosine Similarity Matrix Shape: (49334, 25915)


In [None]:
def recommend_top_n_products_by_user_id(user_id, user_ids, similarity_matrix, product_ids, top_n=5,
                                        purchase_history=None):
    """
    Recommend the top N not-yet-purchased products for a user, ranked by similarity.

    Parameters:
    - user_id: ID of the user to recommend for
    - user_ids: Sequence of user IDs; position i labels row i of similarity_matrix
    - similarity_matrix: Cosine similarity matrix (users x products)
    - product_ids: Product IDs corresponding to the columns of similarity_matrix
    - top_n: Maximum number of recommendations to return; values <= 0 yield an empty list
    - purchase_history: Optional DataFrame with 'user_ID' and 'product_ID' columns that
      lists what each user already bought. A 'product_ID' cell may hold a single ID
      string or a collection of IDs. When omitted, the notebook-level `user_item_df`
      is used.

    Returns:
    - List of (product_id, similarity_score) tuples, highest score first. The list is
      shorter than top_n when fewer unpurchased products are available.

    Raises:
    - ValueError: if user_id does not occur in user_ids
    """

    # Locate the user's row with a single scan; asarray lets plain lists work too.
    matches = np.flatnonzero(np.asarray(user_ids) == user_id)
    if matches.size == 0:
        raise ValueError(f"User ID '{user_id}' not found in the user data.")
    user_index = matches[0]  # first occurrence if the ID is listed more than once

    # Similarity of this user to every product (one row of the matrix)
    user_similarities = similarity_matrix[user_index]

    # Fall back to the notebook-level purchase table when none is passed in
    if purchase_history is None:
        purchase_history = user_item_df

    # Collect everything the user already bought, across all of their rows.
    # A plain string is one product ID; calling set() on it would split it
    # into single characters and nothing would ever be filtered out.
    purchased_rows = purchase_history.loc[purchase_history['user_ID'] == user_id, 'product_ID']
    purchased_products = set()
    for entry in purchased_rows:
        if isinstance(entry, str):
            purchased_products.add(entry)
        else:
            purchased_products.update(entry)

    recommendations = []
    if top_n <= 0:
        # Nothing requested; avoids returning one item before the length check fires
        return recommendations

    # Walk products from most to least similar, skipping already purchased ones
    sorted_indices = user_similarities.argsort()[::-1]
    for i in sorted_indices:
        product = product_ids[i]
        if product in purchased_products:
            continue
        recommendations.append((product, user_similarities[i]))
        if len(recommendations) >= top_n:  # Stop when we have enough recommendations
            break

    return recommendations
# Demo: look up recommendations for one user and print them, best match first.
user_id_input = "AGGS22XG63AQTYBR3APBQA25HAYQ"  # Replace with any valid user_ID
top_n = 5

try:
    recommendations = recommend_top_n_products_by_user_id(
        user_id_input,
        user_ids,
        cosine_similarities,
        product_ids,
        top_n,
    )
except ValueError as e:
    # Raised by the recommender when the user ID is unknown
    print(e)
else:
    print(f"Top-{top_n} Recommendations for User '{user_id_input}':")
    for product_id, score in recommendations:
        print(f"Product ID: {product_id}, Similarity Score: {score:.4f}")

Top-5 Recommendations for User 'AGGS22XG63AQTYBR3APBQA25HAYQ':
Product ID: B0B8Q3WTGP, Similarity Score: 0.6704
Product ID: B01N41MYDC, Similarity Score: 0.6363
Product ID: B08GWZKPPL, Similarity Score: 0.6258
Product ID: B00069Q306, Similarity Score: 0.6242
Product ID: B00XFGNPE4, Similarity Score: 0.6240


In [70]:
# User IDs that occur on more than one row of the review-level frame
# (duplicated() flags every occurrence after the first).
duplicate_user_ids = final_df_user_PCA.loc[
    final_df_user_PCA['user_ID'].duplicated(), 'user_ID'
].unique()
duplicate_user_ids

array(['AEZP6Z2C5AVQDZAJECQYZWQRNG3Q', 'AGGS22XG63AQTYBR3APBQA25HAYQ',
       'AERDBP26RSGAGWWORPE6MQADNR2Q', 'AHVQR7U4PJ7VTG36DQMRSIHK7O6A',
       'AFWVN52MRBWOTIK7UGXBWGOY4HBA', 'AHGQDSOBAIVAAKJFIVEGS344MSXA',
       'AFPBV45MYM2HLBT2AH2JJ3FUADUQ', 'AHY2TURQPNIDXZGH2CMQLZ343YMQ',
       'AHYOSWORVZFXM5QMRIAW3JTTFFIQ', 'AE5UWYX65MMYGGAHNP2UJONMRRXQ',
       'AGI627XXI5DPCMEHBJKZU7WPQK6Q', 'AECQQBG6YRYCOJL2NCB2H3V6LD6Q',
       'AE5IMGWRBJA7JQFBQTBK25HDYGVA', 'AG7JCEMC64AM7JPATDVGP6YZOTXA',
       'AH6YII7DVISC5SBK62F5POULSI6A', 'AGOYO44Q3YUXRQ5N2YZRVSYTYEWQ',
       'AFRJTKBKVBAHKPIV3OIYAGS6AFRQ', 'AGZUJTI7A3JFKB4FP5JOH6NVAJIQ_1',
       'AG6PJGM3ZTNOH6RY4G6GKPCMHT5A', 'AGFTTUBGWZRDVULR3NRZYQNS5IHQ',
       'AF3XZ6UDOR4V6SMVUKWQ3IZ3JO5Q', 'AEUEEZXVXRWGO5LWPIS25R2LCKKA',
       'AG73BVBKUOH22USSFJA5ZWL7AKXA', 'AGMSR3KN2YEMEKFWWXEPIZMJKYAA',
       'AGCAJ2SSP4VUHIX3CHMEGFEYPD4Q', 'AFXF3EGQTQDXMRLDWFU7UBFQZB7Q',
       'AGBLLVBF7XRK6LIZW52ATUFQBGQA', 'AEAXAJACFMXIAAH4WOHRMXPSZWFA',
    

In [82]:
# All review rows written by the user used in the recommendation example
df_PCA_single_user_check = final_df_user_PCA.loc[
    final_df_user_PCA['user_ID'].eq('AGGS22XG63AQTYBR3APBQA25HAYQ')
]
df_PCA_single_user_check

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_290,dim_291,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299
175,3,"Does not hold a lot of hair, even though ...","Does not hold a lot of hair, even though the d...",[],B000X20Y4C,B000X20Y4C,AGGS22XG63AQTYBR3APBQA25HAYQ,2016-04-29 02:18:14.000,2,1,...,-0.005839,-0.046676,-0.107566,0.079659,0.033594,0.008136,-0.009612,-0.064018,-0.000251,0.001041
2347,5,Love it,When the item arrived to my house and I was ab...,[],B00DPNKDR4,B0CDNZ7F2V,AGGS22XG63AQTYBR3APBQA25HAYQ,2016-04-29 00:54:59.000,8,1,...,0.069342,0.064106,-0.045113,-0.067857,-0.002163,-0.066787,-0.015612,0.014508,-0.040077,0.008095


Analyse the recommended products

In [106]:
# Product IDs to inspect (the five products recommended in the example above)
product_ids_to_filter = [
    'B00XFGNPE4',
    'B01N41MYDC',
    'B08GWZKPPL',
    'B0B8Q3WTGP',
    'B00069Q306',
]
# Keep only the review rows that belong to one of those products
filtered_df = final_df_user_PCA.loc[final_df_user_PCA['product_ID'].isin(product_ids_to_filter)]
filtered_df

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_290,dim_291,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299
8375,4,Epil Stop 'n Spray -- worked for me!,"After reading all the negative reviews, I wasn...",[],B00069Q306,B00069Q306,AFIQ3YH27NICN4VXZ5C32K5S75IA,2007-05-16 19:06:02.000,5,0,...,-0.092526,-0.04631,-0.027638,0.01068,-0.064729,0.015496,-0.007137,-0.024726,-0.068805,0.013464
22425,5,i finally found a great non-toxic styling prod...,"I rarely use hair products, because a) I don't...",[],B00XFGNPE4,B00XFGNPE4,AFXIGVQQTHA3WEZNZZ73MNI6XZQA,2015-07-26 12:36:52.000,0,0,...,-0.040314,0.013792,0.004826,-0.162115,-0.019986,-0.07666,0.069831,0.072245,0.011234,0.037644
39088,5,"Lightweight, Works Great!",I have naturally curly hair and blow dry it st...,[{'small_image_url': 'https://m.media-amazon.c...,B0B8Q3WTGP,B0B8Q3WTGP,AEITAF3YU6VVHFTOZZLPRFBLJE4A,2022-08-29 12:33:13.546,0,0,...,0.062836,-0.031361,-0.06878,-0.07106,-0.037476,-0.03698,0.009345,0.015377,-0.003733,0.005879
43411,2,Disappointed,I was disappointed in this pair of cuticle cli...,[],B01N41MYDC,B01N41MYDC,AGWADILSGQN3BRNWRRYCHDXTJOKQ,2017-04-26 17:38:21.000,0,1,...,-0.083816,0.046867,0.032366,-0.120519,0.104655,0.002889,-0.01391,-0.004298,0.036815,-0.027696
49737,3,Horrible chemical smell when it heats,What I hate most about certain products is tha...,[],B08GWZKPPL,B08GWZKPPL,AFWUV4IQC7OFBDAXTPNIKGX5VXDA,2020-12-11 02:20:18.437,0,1,...,-0.006793,-0.011986,-0.045659,-0.059339,-0.024505,0.078402,0.060712,0.020307,0.023315,-0.052604


In [114]:
# Lift the column-width limit so review texts are printed in full.
# This changes the display option for the rest of the session.
pd.options.display.max_colwidth = None

# Show the cleaned review text of the recommended products
print(filtered_df.loc[:, 'cleaned_text'])

8375                                                          after reading all the negative reviews, i wasnt expecting epil stop n spray to be very effective. boy, was i surprised.i did not have to spray it on multiple times to remove hair. one layer worked fine. i let it set in for about 10 minutes, applied a reasonable amount of pressure during the removal process and all the hair came off!many people have reported that it burned or irritated their skin. not so in my case. the only complaint i have is that the spray really does smell. but so does nair. no big deal.this actually worked better than nair and most of the hair removal products ive used.i dont know how effective the rollon or wipes are, but after seeing how well the spray worked, i just might try those too.
22425                                                                                                                                                       i rarely use hair products, because a i dont know how to use t

# Sentiment Analysis

* Goal: classify customer reviews into sentiment categories such as positive, neutral, or negative.
* Fine-tune BERT (Optional):

Train a BERT-based model on your labeled sentiment data (e.g., reviews with labels like "positive," "negative").
This improves sentiment prediction accuracy for domain-specific language.

* Direct Embedding-Based Classification:

Alternatively, use pre-trained embeddings and train a simple classifier (e.g., logistic regression) on top.

* Output:

Predict the sentiment for each review and quantify overall customer satisfaction.