In [1]:
import ast
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, pipeline

### Import data

In [2]:
# Forward slashes keep this relative path valid on Windows, Linux and macOS alike
# (the previous backslash-separated form only resolved on Windows).
file_path = '../data/data_clean/user_clean_data_ecommerce.csv'  # Update this with your file path
df_user = pd.read_csv(file_path)

In [3]:
# Count the missing (NaN) entries in every column of the loaded data
missing_values = df_user.isna().sum(axis=0)
missing_values

rating                           0
review_title                     0
text_review                      0
user_images                      0
product_ID                       0
parent_ID                        0
user_ID                          0
timestamp                        0
helpful_review_vote              0
user_purchase_verification       0
year                             0
cleaned_text                  1405
dtype: int64

In [4]:
# Keep only the rows whose cleaned text is missing, reduced to the user ID and the text column
missing_values_df = df_user.loc[df_user['cleaned_text'].isna(), ['user_ID', 'cleaned_text']]

Unnamed: 0,user_ID,cleaned_text
1018,AEL44KNEOUWZC4V6JJDVVTTLJ4KQ,
3535,AHTNEEIQCRWSYXPYOKOASB7G5LNQ,
6110,AGZ4TGVPDLTEHWAZYACZVUVM3PHA,
7711,AFL3CX6PBM6KNH3DAEEBX7HSRODA,
8185,AHSOYEMHO2QD4VBBZLTWSM4TXE2A,
...,...,...
692815,AFRQR3NPUFAIADAGINAAKIJ4BNQQ,
693035,AFNND72LU26MI253ZENFEVOWLKFA,
693180,AHB4JAWKZXYWN6HARHEHOWBG3XKQ,
693231,AG5DOSMAAN27WB33FSR2GFV3IEIA,


In [5]:
# Drop, in place, every row that has a missing value in any column. The check above
# found missing values only in 'cleaned_text', so these are the reviews without cleaned text.
df_user.dropna(inplace=True)
# Display the remaining (rows, columns) shape
df_user.shape

(692536, 12)

In [6]:
# Re-count the missing entries per column to confirm that none remain after the drop
missing_values = df_user.isna().sum(axis=0)
missing_values

rating                        0
review_title                  0
text_review                   0
user_images                   0
product_ID                    0
parent_ID                     0
user_ID                       0
timestamp                     0
helpful_review_vote           0
user_purchase_verification    0
year                          0
cleaned_text                  0
dtype: int64

In [7]:
# Character count of every cleaned review, stored as a new column
review_lengths = df_user['cleaned_text'].map(len)
df_user['text_length'] = review_lengths

# Longest review and mean review length, both measured in characters
max_length = review_lengths.max()
average_length = review_lengths.mean()

print(f'Max text length (characters): {max_length}')
print(f'Average text length (characters): {average_length}')

Max text length (characters): 14692
Average text length (characters): 170.48680357411024


### Create BERT embeddings for the whole dataset:

* BertTokenizer:
This is a tokenizer class from the transformers library provided by Hugging Face. It converts raw text into token IDs that a BERT model can process.
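* BERT base uncased:
"Base" refers to the smaller version of BERT, with 12 layers, 768 hidden units, and 110 million parameters. "Uncased" means the model was trained on lower-cased text, so the tokenizer lower-cases its input.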

* Generating embeddings for the users' reviews with BERT.
* From each cell with review text data, a vector of 768 dimensions will be generated and stored in a cell of a column called 'embeddings'.
* The generated embeddings will then be transformed from lists of dimensions into separate columns (one column per dimension), so 768 columns will be created.

In [8]:
# Pick the compute device: the first CUDA GPU ("cuda:0") when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the pretrained tokenizer and model from the same checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)



In [9]:
# Move the model to the selected device (the GPU when one is available, otherwise the CPU).
# As the last expression in the cell, the returned module is also displayed.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

Function to get BERT embeddings with the implementation of chunking

In [10]:
def get_bert_embeddings(text, max_length=512, overlap=50):
    """
    Generates a BERT embedding for a given text, chunking texts that do not fit in one model input.

    The text is tokenized into subwords once and cut into overlapping windows. Each window is
    wrapped in [CLS] ... [SEP], passed through the model, and mean-pooled over its token
    positions; the per-window vectors are then averaged into a single embedding. A text that
    fits in one window is simply embedded directly.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device`` objects.

    Parameters
    ----------
    text : str
        The text to embed.
    max_length : int, default 512
        Maximum model input length in tokens, *including* the [CLS] and [SEP] tokens.
    overlap : int, default 50
        Number of subword tokens shared by consecutive windows, for context continuity.

    Returns
    -------
    numpy.ndarray
        1-D embedding vector (one value per hidden unit of the model).

    Raises
    ------
    ValueError
        If ``overlap`` is negative or leaves no room for new tokens in a window.
    """
    # [CLS] and [SEP] occupy two positions of every model input, so only
    # max_length - 2 positions are left for actual text tokens.
    num_special_tokens = 2
    window = max_length - num_special_tokens
    if not 0 <= overlap < window:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_length - {num_special_tokens}; "
            f"got overlap={overlap}, max_length={max_length}"
        )
    stride = window - overlap

    # Tokenize the input text into subwords
    tokens = tokenizer.tokenize(text)

    # Window start offsets. Stop as soon as a window reaches the end of the text, so a tail
    # that is already fully covered by the previous window is not embedded a second time.
    # An empty text still yields one (empty) window, i.e. just [CLS] [SEP].
    starts = [0]
    while starts[-1] + window < len(tokens):
        starts.append(starts[-1] + stride)

    chunk_embeddings = []
    # Inference only: without no_grad() every forward pass would build an autograd graph.
    with torch.no_grad():
        for start in starts:
            # Convert the subword tokens straight to IDs instead of decoding them to a string
            # and tokenizing again, which is not guaranteed to reproduce the same tokens.
            token_ids = tokenizer.convert_tokens_to_ids(tokens[start:start + window])
            input_ids = torch.tensor(
                [[tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]],
                dtype=torch.long,
                device=device,
            )
            # Single unpadded sequence, so every position is attended to
            attention_mask = torch.ones_like(input_ids)

            # Forward pass through the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Mean of the last hidden state over the token positions of this window
            chunk_embeddings.append(
                outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
            )

    # Aggregate the window embeddings (mean pooling across all windows)
    return np.mean(chunk_embeddings, axis=0)

In [11]:
# Show whether PyTorch detects a CUDA-capable GPU in this environment
print(torch.cuda.is_available())

True


### Generating a DataFrame containing columns of embeddings dimensions

In [16]:
# Preview the first rows of the cleaned user data before the embeddings are generated
df_user.head()

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,text_length
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice. it smells really go...,299
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just...",233
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,"smells good, feels great!",25
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic,14
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
693936,4,Four Stars,Conditioner is great shampoo not as I expected,[],B006YUIWKA,B006YUIWKA,AFIXGFVEGLMOTMBTJL7H3VSIETDQ,2016-11-04 02:37:01.000,0,1,2016,conditioner is great shampoo not as i expected,46
693937,1,Pretty,Did not work! Used the whole bottle and my hai...,[],B006YUIWKA,B006YUIWKA,AFV7YZFOJF564EZGET5LG45K4QEA,2016-12-05 03:32:10.000,0,0,2016,did not work! used the whole bottle and my hai...,82
693938,5,Great sunless tanner,Product as expected. Shipping was on time.,[],B06ZZV9MZT,B06ZZV9MZT,AHYDCWDMMVMLBX7FY7M7JKADKRDQ,2020-05-27 02:52:54.067,0,1,2020,product as expected. shipping was on time.,42
693939,5,The Crown on top is a Ring!!!,"Not only is it a delicious fragrance, but also...",[],B000HB6VLE,B000HB6VLE,AF6ZIAEN7TQ2WY5ZL77F6JDPV7XQ,2007-07-18 22:36:49.000,4,0,2007,"not only is it a delicious fragrance, but also...",169


In [17]:
# Embed every review one at a time: apply() calls get_bert_embeddings once per row and
# stores the returned vector in the new 'embeddings' column.
df_user['embeddings'] = df_user['cleaned_text'].apply(get_bert_embeddings)

In [27]:
# Preview the first rows again to inspect the newly added 'embeddings' column
df_user.head()

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,text_length,embeddings
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice. it smells really go...,299,"[0.1840019, -0.028732283, 0.44851863, 0.079564..."
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just...",233,"[0.08434828, 0.3667166, 0.31517023, 0.04347265..."
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,"smells good, feels great!",25,"[0.4264943, 0.15444493, 0.34537777, 0.06797459..."
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic,14,"[0.1945716, -0.017593045, -0.17701837, 0.21122..."
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it,7,"[0.30804315, 0.044757985, -0.08509967, 0.28211..."


In [19]:
# Each cell of the 'embeddings' column holds one vector per review. Stack these vectors
# row-wise into a single 2-D array so that every dimension can later become its own column.

subset_data_stack = np.stack(df_user['embeddings'].to_numpy())

In [20]:
# Build one column label per embedding dimension: dim_0, dim_1, ...
num_dimensions = np.shape(subset_data_stack)[1]
embedding_columns_names = list(f'dim_{i}' for i in range(num_dimensions))

In [21]:
# Wrap the stacked array in a DataFrame so that every embedding dimension gets its own named column:
embedding_df = pd.DataFrame(data=subset_data_stack, columns=embedding_columns_names)

In [22]:
# Append the embedding columns to the user data. The user frame's index is rebuilt as 0..n-1
# first so that it lines up row-for-row with the freshly created embeddings frame.
df_user_embeddings_final = pd.concat(
    objs=[df_user.reset_index(drop=True), embedding_df],
    axis='columns',
)

In [23]:
# Display the combined frame: the user columns followed by the dim_* embedding columns
df_user_embeddings_final

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,-0.094609,-0.411200,-0.062443,-0.214518,0.074292,0.104339,-0.135383,-0.334618,-0.169996,0.315318
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,-0.100695,-0.222924,0.112311,-0.042177,0.077755,-0.127271,-0.111048,-0.031468,-0.161275,0.247013
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,...,-0.093840,-0.259505,0.129799,-0.107022,0.138637,-0.081212,-0.017655,-0.731368,-0.361127,-0.105736
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,...,0.259995,-0.170665,0.285434,-0.391459,-0.135314,-0.281631,0.095484,-0.111428,0.037819,-0.030503
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,...,0.421028,-0.134179,0.317967,-0.471531,-0.134610,-0.314950,-0.286948,-0.066816,-0.102167,-0.157501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692531,4,Four Stars,Conditioner is great shampoo not as I expected,[],B006YUIWKA,B006YUIWKA,AFIXGFVEGLMOTMBTJL7H3VSIETDQ,2016-11-04 02:37:01.000,0,1,...,0.260648,-0.008889,-0.091507,-0.231533,0.081567,-0.260971,0.141322,-0.430366,-0.349280,0.127441
692532,1,Pretty,Did not work! Used the whole bottle and my hai...,[],B006YUIWKA,B006YUIWKA,AFV7YZFOJF564EZGET5LG45K4QEA,2016-12-05 03:32:10.000,0,0,...,0.201984,-0.066559,-0.020144,-0.228285,0.365405,0.037775,-0.286134,-0.398248,-0.016306,0.173659
692533,5,Great sunless tanner,Product as expected. Shipping was on time.,[],B06ZZV9MZT,B06ZZV9MZT,AHYDCWDMMVMLBX7FY7M7JKADKRDQ,2020-05-27 02:52:54.067,0,1,...,0.299091,-0.057872,0.236871,-0.455242,-0.074439,-0.193222,-0.351080,0.116173,0.079833,-0.447324
692534,5,The Crown on top is a Ring!!!,"Not only is it a delicious fragrance, but also...",[],B000HB6VLE,B000HB6VLE,AF6ZIAEN7TQ2WY5ZL77F6JDPV7XQ,2007-07-18 22:36:49.000,4,0,...,-0.144703,-0.158117,0.057744,-0.158837,-0.244748,-0.111022,0.051020,-0.243764,0.115653,-0.268967


Save the output

In [None]:
# Save the original DataFrame with generated embeddings under a timestamped file name,
# so successive runs do not overwrite each other.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# A pathlib path with forward slashes resolves on Windows, Linux and macOS alike. The output
# folder is created when missing so the write cannot fail after the long embedding run.
file_path = Path('../data/embeddings_output') / f'df_user_embeddings_BERT_{timestamp}.csv'
file_path.parent.mkdir(parents=True, exist_ok=True)
df_user_embeddings_final.to_csv(file_path, index=False)

In [25]:
# Save the compressed (gzip) copy. The path is built with pathlib so it resolves on any
# operating system, and the output folder is created when it does not exist yet.
compressed_path = Path('../data/embeddings_output') / 'df_user_embeddings_BERT_.csv.gz'
compressed_path.parent.mkdir(parents=True, exist_ok=True)
df_user_embeddings_final.to_csv(compressed_path, compression="gzip", index=False)