In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re

Import data

In [43]:
# Forward slashes are accepted by Windows as well as POSIX systems, so the
# relative path no longer only resolves on Windows.
file_path = '../data/data_clean/user_clean_data_ecommerce.csv'  # Update this with your file path
df_user = pd.read_csv(file_path)

In [45]:
# Show the cleaned text of the row labelled 0 as a sanity check.
df_user.loc[0, 'cleaned_text']

'this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium thickness i am comparing to other brands with yucky chemicals so im gonna stick with this try it'

In [48]:
# Preview the first five rows of the loaded frame.
df_user.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice it smells really goo...
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,this product does what i need it to do i just ...
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,smells good feels great
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it


In [50]:
# Drop every row that contains a missing value, then report the remaining size.
# inplace=True mutates df_user itself, so earlier cells re-run after this one
# will see the already-filtered frame.
df_user.dropna(inplace=True)
df_user.shape

(692383, 12)

In [51]:
# Count the missing values left in each column.
df_user.isnull().sum()

rating                        0
review_title                  0
text_review                   0
user_images                   0
product_ID                    0
parent_ID                     0
user_ID                       0
timestamp                     0
helpful_review_vote           0
user_purchase_verification    0
year                          0
cleaned_text                  0
dtype: int64

In [52]:
# Character count of every cleaned review, stored as a new column.
df_user['text_length'] = df_user['cleaned_text'].map(len)

# Longest review and mean review length, both in characters.
text_lengths = df_user['text_length']
max_length, average_length = text_lengths.max(), text_lengths.mean()

print(f'Max text length (characters): {max_length}')
print(f'Average text length (characters): {average_length}')

Max text length (characters): 14084
Average text length (characters): 166.36590875281456


Split the dataset

In [53]:
# Hold out 20% of the rows as test data; random_state fixes the shuffle so the split is repeatable.
train_data, test_data = train_test_split(df_user, test_size=0.2, random_state=42)

In [54]:
# Report (rows, columns) of the training and test partitions, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(553906, 13)
(138477, 13)


Create the embeddings for the first 50 000 rows of the training dataset:

In [55]:
# Work on the first 50,000 training rows only; .copy() detaches the subset from train_data.
train_subset_data_01 = train_data.iloc[:50000].copy()

### Embeddings with BERT

* BertTokenizer:
This is a tokenizer class from the transformers library provided by Hugging Face. It converts raw text into token IDs that a BERT model can process.
* BERT base uncased:
"Base" refers to the smaller version of BERT, with 12 layers, 768 hidden units, and 110 million parameters. "Uncased" means the text is lowercased before tokenization, so the model does not distinguish between upper and lower case.

In [56]:
# Run on the first CUDA GPU ("cuda:0") when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertModel.from_pretrained(pretrained_name)



In [57]:
# Move the model to the selected device (the GPU when one is available, otherwise the CPU).
# As the last expression of the cell, the call also displays the model architecture.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [58]:
# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Embed ``text`` with the notebook's BERT model using mean pooling.

    Parameters
    ----------
    text : str or list of str
        Text handed to the module-level ``tokenizer``. Inputs longer than
        512 tokens are truncated.

    Returns
    -------
    numpy.ndarray
        Average of the last hidden state over the real (non-padding) tokens,
        moved to the CPU. Size-1 dimensions are squeezed away, so a single
        string yields a 1-D vector.
    """
    # Tokenize input text; 512 is the maximum sequence length BERT accepts,
    # kept at the maximum because some reviews are very long.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Move input tensors to the same device as the model (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only: without no_grad every call records an autograd graph,
    # which wastes time and memory when embedding tens of thousands of reviews.
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool over real tokens only. For a single text the mask is all ones,
    # so this equals a plain mean; for a padded batch it stops padding
    # positions from diluting the average.
    hidden_states = outputs.last_hidden_state
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    pooled = (hidden_states * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

    # Move the result back to the CPU and hand it over as a NumPy array
    return pooled.squeeze().detach().cpu().numpy()

In [59]:
# Confirm whether PyTorch can see a CUDA GPU before starting the embedding run.
print(torch.cuda.is_available())

True


### Creating embeddings with BERT - run for the selected data 

* Generating embeddings for the user's review with BERT,
* From each cell with review text data, a vector of 768 dimensions will be generated and stored in a cell, in a column called 'embeddings',
* The generated embeddings will then be expanded from per-row lists of dimensions into separate columns (one column per dimension), so 768 columns will be created.

In [60]:
# Embed each cleaned review one at a time; every cell of the new column holds that review's vector.
review_texts = train_subset_data_01['cleaned_text']
train_subset_data_01['embeddings'] = review_texts.map(get_bert_embeddings)

# test_data['embeddings'] = test_data['cleaned_text'].apply(get_bert_embeddings)

In [61]:
# Verify that the 'embeddings' column was created by previewing the first two rows:
train_subset_data_01.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,text_length,embeddings
308320,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,2022,videoidcfdecdbdcfade the look of this is very ...,751,"[-0.33231324, 0.0543172, 0.4494611, 0.07581194..."
41091,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,2015,this stuff is pretty interesting im enjoying t...,109,"[0.049274728, 0.12549251, 0.12806682, 0.248745..."


In [62]:
# Stack the per-review vectors row by row into a single 2-D array:
embedding_vectors = train_subset_data_01['embeddings'].tolist()
train_subset_data_stack = np.stack(embedding_vectors, axis=0)

In [63]:
# Inspect the stacked embedding array (NumPy abbreviates the printout).
train_subset_data_stack

array([[-0.33231324,  0.0543172 ,  0.4494611 , ..., -0.21085577,
         0.05723506,  0.13091712],
       [ 0.04927473,  0.12549251,  0.12806682, ..., -0.23682863,
        -0.08841495, -0.03180593],
       [-0.100202  ,  0.00652486,  0.3701543 , ..., -0.37329656,
        -0.02400355,  0.27764842],
       ...,
       [ 0.10768944, -0.30886057,  0.5483214 , ..., -0.4442686 ,
         0.0786995 , -0.23316805],
       [-0.05036072, -0.18728136,  0.4251778 , ..., -0.37866172,
        -0.20116793,  0.00801033],
       [ 0.23054314,  0.20489387,  0.37537226, ..., -0.38489497,
        -0.10621028, -0.05856288]], dtype=float32)

In [64]:
# Keep only the first 50,000 training rows, as an independent copy:
train_data_50000 = train_data.head(50000).copy()

In [65]:
# One column label per embedding component: dim_0, dim_1, ...
num_dimensions = train_subset_data_stack.shape[-1]
embedding_columns_names = list(f'dim_{i}' for i in range(num_dimensions))

In [66]:
# Wrap the embedding array in a DataFrame so that each column holds one vector dimension:
embedding_df = pd.DataFrame(data=train_subset_data_stack, columns=embedding_columns_names)

In [67]:
# Attach the embedding columns to the review rows. The review index is reset first so that
# both frames share the same 0..n-1 index and line up row by row:
review_rows = train_data_50000.reset_index(drop=True)
df_user_embedding_01 = pd.concat([review_rows, embedding_df], axis='columns')

In [68]:
# Check the (rows, columns) size of the combined frame.
df_user_embedding_01.shape

(50000, 781)

In [69]:
# Preview the first two rows of the combined frame.
df_user_embedding_01.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,...,-0.067687,-0.20555,0.072889,-0.180823,0.028919,0.010474,-0.138984,-0.210856,0.057235,0.130917
1,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,...,-0.303478,0.09138,0.122427,-0.18641,-0.027961,-0.25528,0.025104,-0.236829,-0.088415,-0.031806


In [70]:
# Save the original data frame with generated embeddings.
# Forward slashes keep the relative path valid on Windows and POSIX systems alike.
df_user_embedding_01.to_csv('../data/embeddings_output/df_user_embedding_fullsize_01.csv', index=False)

## Next steps - dimensionality reduction with PCA

* The steps are similar to the ones above, but here the output is a final user dataframe whose vectors have 300 dimensions. (In the beginning I chose only 10 components, because that is what was done in the article, but the explained variance dropped to 45%, so for now I have set it to 300, which gives ~95%.) We can always adjust this number.

In [23]:
# load the data - if not run just now:

# train_subset_data_ = pd.read_csv(r'.\..\data\embeddings_output\train_subset_embeddings_02.csv') #converters={"embeddings": str})

In [72]:
# Preview the first two rows of the subset that carries the 'embeddings' column.
train_subset_data_01.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,text_length,embeddings
308320,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,2022,videoidcfdecdbdcfade the look of this is very ...,751,"[-0.33231324, 0.0543172, 0.4494611, 0.07581194..."
41091,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,2015,this stuff is pretty interesting im enjoying t...,109,"[0.049274728, 0.12549251, 0.12806682, 0.248745..."


In [73]:
# pca = PCA(n_components=10) # I tried 10 components as specified in the article, but the explained variance dropped significantly, to 45%.
# random_state pins the result when scikit-learn uses a randomized SVD solver,
# so re-running the notebook reproduces the same components.
pca = PCA(n_components=300, random_state=42)
# train_subset_data = np.stack(train_subset_data_01['embeddings'].values)

In [74]:
# Inspect the array that will be passed to PCA.
train_subset_data_stack

array([[-0.33231324,  0.0543172 ,  0.4494611 , ..., -0.21085577,
         0.05723506,  0.13091712],
       [ 0.04927473,  0.12549251,  0.12806682, ..., -0.23682863,
        -0.08841495, -0.03180593],
       [-0.100202  ,  0.00652486,  0.3701543 , ..., -0.37329656,
        -0.02400355,  0.27764842],
       ...,
       [ 0.10768944, -0.30886057,  0.5483214 , ..., -0.4442686 ,
         0.0786995 , -0.23316805],
       [-0.05036072, -0.18728136,  0.4251778 , ..., -0.37866172,
        -0.20116793,  0.00801033],
       [ 0.23054314,  0.20489387,  0.37537226, ..., -0.38489497,
        -0.10621028, -0.05856288]], dtype=float32)

In [75]:
# Fit the PCA on the stacked training embeddings and project them in one step.
train_pca_subset = pca.fit_transform(train_subset_data_stack)
# Placeholder for the test split: it would reuse the fitted PCA via transform() (not run yet).
# test_pca = pca.transform(test_embeddings)

In [76]:
# Check the (rows, components) size of the reduced array.
print(train_pca_subset.shape)

(50000, 300)


In [77]:
# Inspect the PCA-reduced embeddings (NumPy abbreviates the printout).
train_pca_subset

array([[-2.2205727 ,  0.8206018 , -1.1507324 , ..., -0.01921606,
        -0.05422936,  0.02481445],
       [-0.85250354,  0.78876585,  0.3090128 , ..., -0.08624018,
         0.07009531, -0.09616351],
       [-0.02784091,  0.02324333, -2.0407574 , ...,  0.02132656,
        -0.06447431, -0.0215942 ],
       ...,
       [-1.5288411 , -0.0258608 , -0.570478  , ..., -0.07366744,
        -0.14109671,  0.07452551],
       [-1.1219059 , -0.31601486, -1.5046704 , ...,  0.0679103 ,
         0.05569949,  0.0265916 ],
       [ 0.43417406, -0.77137023,  0.14073612, ..., -0.15397538,
         0.01573911, -0.05586669]], dtype=float32)

Create a dataframe with UserID and embedding dimensions columns:

In [78]:
# One column label per PCA component: dim_0, dim_1, ...
num_dimensions = train_pca_subset.shape[-1]
embedding_columns_PCA = list(f'dim_{i}' for i in range(num_dimensions))

In [79]:
# Wrap the reduced array in a DataFrame, one column per PCA component.
embedding_df_PCA = pd.DataFrame(data=train_pca_subset, columns=embedding_columns_PCA)

In [80]:
# Attach the PCA columns to the review rows; the review index is reset so both frames align row by row.
review_rows_for_pca = train_data_50000.reset_index(drop=True)
final_df_user_PCA = pd.concat([review_rows_for_pca, embedding_df_PCA], axis='columns')

In [81]:
# Save the reviews together with their PCA-reduced embeddings.
# Forward slashes keep the relative path valid on Windows and POSIX systems alike.
final_df_user_PCA.to_csv('../data/embeddings_dim_reduction/df_final_user_PCA.csv', index=False)

In [96]:
# Preview the first five rows of the frame with PCA columns.
final_df_user_PCA.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_290,dim_291,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299
0,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,...,0.023845,0.057696,0.002842,0.000553,-0.082094,0.000306,-0.032792,-0.019216,-0.054229,0.024814
1,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,...,-0.094351,0.018936,-0.10471,-0.104361,0.007777,0.098594,-0.002512,-0.08624,0.070095,-0.096164
2,1,Every negative review here is true,Too big and strangely shaped. Can’t use for po...,[],B08YJY6QXX,B08YJY6QXX,AGC3FCMCABBZEG5LLFG2AZSFGLQA,2022-02-21 18:05:21.190,2,1,...,0.050436,0.040444,-0.043215,0.171432,-0.053496,0.150505,-0.002388,0.021327,-0.064474,-0.021594
3,4,Simple,These are okay they give your lips a little ti...,[],B09F74KLTZ,B09F74KLTZ,AFIV7RJS375WQLJNNF7OPBLGIHOQ,2021-10-04 17:17:35.117,0,0,...,0.022718,-0.02746,0.217338,-0.05678,0.092378,-0.008888,0.106759,-0.007923,0.0065,0.012411
4,4,Good Clip Ins,I used these clip ins on a recent 5-day trip a...,[],B07FNPN49W,B07FNPN49W,AHNQJWY7VTX2GIHBKNGD7MS2XMUA,2018-09-06 17:50:02.665,0,1,...,0.080945,-0.065782,0.11769,0.131351,-0.188997,0.006255,0.102548,0.056575,-0.051365,0.026357


In [83]:
# Compare only a leading slice of reviews. A pairwise matrix over all 50,000 rows
# would hold 50,000 x 50,000 float32 values (about 10 GB), while the notebook only
# inspects individual entries. The similarity of any two rows does not depend on
# the other rows, so entries inside the slice are unchanged.
similarity_preview_rows = 1000
similarity_matrix = cosine_similarity(train_pca_subset[:similarity_preview_rows])

In [84]:
# Cosine similarity between the first and the second review.
similarity_matrix[0, 1]

0.10673274

In [87]:
# Compare two individual reviews (rows 12 and 10); each is kept as a 1-row matrix for cosine_similarity.
review_a = train_pca_subset[12:13]
review_b = train_pca_subset[10:11]
similarity = cosine_similarity(review_a, review_b)
print("Similarity:", similarity[0, 0])

Similarity: 0.30784366


## Variance analysis

Original embeddings:

In [89]:
# Reload the full-size embeddings saved earlier in the notebook.
# Forward slashes keep the relative path valid on Windows and POSIX systems alike.
user_embeddings_original = pd.read_csv('../data/embeddings_output/df_user_embedding_fullsize_01.csv')
user_embeddings_original.head(2)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,3,Looks great but not practical for me,[[VIDEOID:c9f78d09ec648d2bdc92f5ad9698e698]] T...,[{'small_image_url': 'https://m.media-amazon.c...,B09JNRPRKT,B09JNRPRKT,AFOKWZIJI7K6Z6R4T7SSXRTWZNOQ,2022-02-22 19:21:19.256,0,0,...,-0.067687,-0.20555,0.072889,-0.180823,0.028919,0.010474,-0.138984,-0.210856,0.057235,0.130917
1,4,This stuff is pretty interesting. I'm enjoying...,This stuff is pretty interesting. I'm enjoying...,[],B0006O028U,B0006O028U,AHQXHCQIGUBPQNJLQZNLBV7ESFPA,2015-01-04 00:21:08.000,1,1,...,-0.303478,0.09138,0.122427,-0.18641,-0.027961,-0.25528,0.025104,-0.236829,-0.088415,-0.031806


In [90]:
# Keep only the 768 embedding columns (dim_0 ... dim_767), dropping the review metadata.
original_dim_columns = [f"dim_{i}" for i in range(768)]
original_embeddings = user_embeddings_original.loc[:, original_dim_columns]

In [91]:
# Fit PCA to the original embeddings.
# NOTE: this rebinds the name `pca`, replacing the model fitted earlier in the notebook.
# random_state pins the result when scikit-learn uses a randomized SVD solver,
# so the reported variance is reproducible.
pca = PCA(n_components=300, random_state=42)
pca.fit(original_embeddings)

# Share of the total variance explained by each component, and their sum.
explained_variance = pca.explained_variance_ratio_
total_variance = explained_variance.sum()

print("Explained Variance Ratio for 300 Dimensions:", explained_variance)
print("Total Explained Variance Captured by 300 Dimensions:", total_variance)

Explained Variance Ratio for 300 Dimensions: [0.20868374 0.04104146 0.03257889 0.03212807 0.03028176 0.0271303
 0.02321218 0.02139147 0.02046016 0.01756858 0.01645912 0.01460024
 0.01337152 0.01251322 0.01159026 0.0104906  0.00974576 0.00903349
 0.00875786 0.00849473 0.00808074 0.00759961 0.00706855 0.00701897
 0.00645697 0.00632001 0.00614428 0.00599658 0.00574176 0.00559984
 0.00538578 0.00521532 0.00506382 0.00491469 0.00462164 0.0044721
 0.00437525 0.00436016 0.00424496 0.00402911 0.00401478 0.00389828
 0.00383993 0.00377082 0.00362658 0.00352763 0.0034726  0.00337115
 0.0033391  0.00328946 0.00313695 0.00303278 0.00297965 0.00290999
 0.00289475 0.00275689 0.00274672 0.00259472 0.00256635 0.00254136
 0.00251571 0.00248007 0.00245774 0.00240559 0.00235431 0.00232246
 0.00226369 0.00219719 0.00215519 0.00213213 0.00211301 0.0020895
 0.0020615  0.00203986 0.00200546 0.00195323 0.00191271 0.00190367
 0.00187765 0.0018429  0.00183172 0.00178094 0.00175558 0.00171754
 0.00170059 0.001695

# Once all embeddings are calculated we can go to the next step (so far calculations are made only for the sample of 50000 rows of the training dataset)

## Create User and Product Vector

* Aggregating the embeddings to compute a single user vector per user_ID and single product vector per product_ID.
* To create a vector for each user, you can average all the review embeddings corresponding to that user. This works because averaging retains a general sense of the user's overall preferences.
* Similarly, you can aggregate the embeddings for each product. For example, average all the embeddings corresponding to each product ID.

### 1. Aggregate User Vectors

In [93]:
# Select relevant columns: user_ID and embedding dimensions
user_vector_cols = [f'dim_{i}' for i in range(300)]
df_user_vectors = final_df_user_PCA.groupby('user_ID')[user_vector_cols].mean().reset_index()

In [94]:
# Inspect the per-user vectors (pandas abbreviates the display).
df_user_vectors

Unnamed: 0,user_ID,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_290,dim_291,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299
0,AE225IKAL7QABBT2BSTLIGFKHAQQ,0.208377,1.522931,-1.117696,-0.636341,-1.111270,-1.021139,1.514467,1.011144,0.312553,...,0.034778,-0.160210,0.018794,-0.042957,-0.259030,0.032602,0.047290,0.216677,-0.052517,0.081157
1,AE22C2GQK3BBQNQJHVCU7KOEX5TQ,-0.025180,-1.702101,0.091881,-1.591924,0.755298,0.020142,0.234630,-0.608135,-1.216132,...,-0.201371,-0.000446,0.027550,-0.061505,0.062629,0.008393,0.025916,-0.052586,-0.057025,0.023120
2,AE22DOZY7CRT5OJLXRRHJYO6GY2A,-1.784766,-0.477997,-0.435100,-0.002652,0.207864,-0.198191,-0.089657,0.581121,0.008543,...,0.040483,-0.030430,0.101152,0.057111,-0.018026,0.016207,0.108748,-0.063591,-0.066409,-0.098862
3,AE22EURACV6BNCXXYEWZWBKKXVNA,0.008605,1.014055,-1.042147,0.622563,0.648501,-1.803104,-1.200704,-0.730483,-0.309668,...,0.100619,0.074527,0.100501,-0.011970,0.074697,0.122734,-0.011861,-0.009350,0.005459,-0.022839
4,AE22J6SMWQO5HAG4JZODVAEMQXNQ,-1.886114,-0.161519,-0.408325,-0.460785,-0.414641,1.540108,-1.274296,-0.210389,0.562990,...,-0.007666,-0.067625,0.124841,-0.058047,-0.000059,-0.126879,0.012326,0.049777,-0.022917,0.059209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49374,AHZZK3BZYDZYCVG5S5JDJIR7JXLA,-2.582652,0.107823,-0.448086,1.180751,-0.293276,0.735802,0.233716,-0.069590,-0.715500,...,0.003632,-0.110156,0.141721,0.017690,0.069116,0.053896,-0.097167,-0.040307,-0.046386,-0.072058
49375,AHZZOQ677OZSRA2WRNAIYSAKKZQQ,-1.542746,0.199729,0.083376,-0.139327,0.621125,-0.193690,1.497363,-0.686839,0.062886,...,-0.094849,-0.143646,0.007565,-0.043028,-0.056675,-0.057168,-0.098793,0.131166,0.087973,0.031357
49376,AHZZSLZAXLSFXBQ6ZMGF3SDVMXJQ,-2.516476,0.180381,-0.234578,0.544458,-0.418455,0.687271,-0.192078,-0.153829,-0.453198,...,0.036466,-0.014416,-0.054526,-0.081890,0.143874,-0.099452,-0.191830,0.005010,-0.107094,-0.014023
49377,AHZZTNXIDGRSANUOKJCLRL3XZ7MQ,-2.058221,0.100081,-0.988637,-0.509999,-0.448426,0.129182,-0.083300,0.370595,-0.080013,...,-0.030503,0.098914,-0.068510,-0.021767,0.045006,-0.075067,0.026532,0.033063,0.056145,0.045279


In [97]:
# Rename the columns to indicate these are user vectors:
user_vec_names = [f'user_vec_{i}' for i in range(300)]
df_user_vectors.columns = ['user_ID', *user_vec_names]

In [98]:
# Check the (rows, columns) size of the user-vector frame.
df_user_vectors.shape

(49379, 301)

In [99]:
# Preview the renamed user-vector columns.
df_user_vectors.head()

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE225IKAL7QABBT2BSTLIGFKHAQQ,0.208377,1.522931,-1.117696,-0.636341,-1.11127,-1.021139,1.514467,1.011144,0.312553,...,0.034778,-0.16021,0.018794,-0.042957,-0.25903,0.032602,0.04729,0.216677,-0.052517,0.081157
1,AE22C2GQK3BBQNQJHVCU7KOEX5TQ,-0.02518,-1.702101,0.091881,-1.591924,0.755298,0.020142,0.23463,-0.608135,-1.216132,...,-0.201371,-0.000446,0.02755,-0.061505,0.062629,0.008393,0.025916,-0.052586,-0.057025,0.02312
2,AE22DOZY7CRT5OJLXRRHJYO6GY2A,-1.784766,-0.477997,-0.4351,-0.002652,0.207864,-0.198191,-0.089657,0.581121,0.008543,...,0.040483,-0.03043,0.101152,0.057111,-0.018026,0.016207,0.108748,-0.063591,-0.066409,-0.098862
3,AE22EURACV6BNCXXYEWZWBKKXVNA,0.008605,1.014055,-1.042147,0.622563,0.648501,-1.803104,-1.200704,-0.730483,-0.309668,...,0.100619,0.074527,0.100501,-0.01197,0.074697,0.122734,-0.011861,-0.00935,0.005459,-0.022839
4,AE22J6SMWQO5HAG4JZODVAEMQXNQ,-1.886114,-0.161519,-0.408325,-0.460785,-0.414641,1.540108,-1.274296,-0.210389,0.56299,...,-0.007666,-0.067625,0.124841,-0.058047,-5.9e-05,-0.126879,0.012326,0.049777,-0.022917,0.059209


In [22]:
# Save the user vectors.
# Forward slashes keep the relative path valid on Windows and POSIX systems alike.
df_user_vectors.to_csv('../data/text_analysis/user_vectors/user_vectors.csv', index=False)

### 2. Aggregate Product Vectors

In [100]:
# Average the 300 PCA dimensions over all reviews of the same product,
# then turn product_ID back into an ordinary column.
product_vector_cols = [f'dim_{i}' for i in range(300)]
reviews_by_product = final_df_user_PCA.groupby('product_ID')
product_dimension_means = reviews_by_product[product_vector_cols].mean()
df_product_vectors = product_dimension_means.reset_index()

In [101]:
# Rename the columns to indicate these are product vectors:
product_vec_names = [f'product_vec_{i}' for i in range(300)]
df_product_vectors.columns = ['product_ID', *product_vec_names]

In [105]:
# Check the (rows, columns) size of the product-vector frame.
df_product_vectors.shape

(26168, 301)

In [103]:
# Save the product vectors.
# Forward slashes keep the relative path valid on Windows and POSIX systems alike.
df_product_vectors.to_csv('../data/text_analysis/product_vectors/product_vectors.csv', index=False)

# Semantic Analysis

## Compare Reviews - Similarity Research with COS Similarity - ???

In [None]:
# Compute cosine similarity between reviews.
# The original draft referenced an undefined name `embeddings`; the PCA-reduced
# review embeddings built earlier in the notebook are used instead. Only a leading
# slice is compared, because a pairwise matrix over all 50,000 reviews would need
# about 10 GB of memory.
similarity_matrix = cosine_similarity(train_pca_subset[:1000])

# Print the similarity matrix
print(similarity_matrix)

## Group Reviews - Clustering

In [None]:
from sklearn.cluster import KMeans

# The original draft referenced the undefined names `embeddings` and `reviews`.
# The PCA-reduced embeddings and the matching cleaned review texts are used instead;
# both come from the same first 50,000 training rows, in the same order.
reviews = train_data_50000['cleaned_text'].tolist()

# Choose the number of clusters (e.g., 3)
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(train_pca_subset)

# Assign each review to a cluster
clusters = kmeans.labels_

# Show a few example reviews per cluster; printing every member would flood
# the output with tens of thousands of lines.
max_reviews_per_cluster = 5
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    members = [review for review, label in zip(reviews, clusters) if label == cluster_id]
    for review in members[:max_reviews_per_cluster]:
        print(f" - {review}")

Possible Use Cases:
Identify common pain points (e.g., "late delivery," "poor quality").
Group reviews into topics for better business insights.

# Sentiment Analysis

* To classify customer reviews into sentiment categories, such as positive, neutral, or negative.
* Fine-tune BERT (Optional):

Train a BERT-based model on your labeled sentiment data (e.g., reviews with labels like "positive," "negative").
This improves sentiment prediction accuracy for domain-specific language.

* Direct Embedding-Based Classification:

Alternatively, use pre-trained embeddings and train a simple classifier (e.g., logistic regression) on top.

* Output:

Predict the sentiment for each review and quantify overall customer satisfaction.