In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
import re
from datetime import datetime

### Import data

In [8]:
# Location of the cleaned, merged user/meta CSV -- change this to match your own setup
file_path = r'.\..\data\data_clean\merged_user_meta_df.csv'
df_user = pd.read_csv(filepath_or_buffer=file_path)
df_user.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,parent_asin,cleaned_title
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice. it smells really go...,B00YQ6X8EO,herbivore natural sea mist texturizing salt sp...
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just...",B081TJ8YS3,all natural vegan dry shampoo powder eco frien...
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,"smells good, feels great!",B097R46CSY,new road beauty creamsicle variety 3 pack para...
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic,B09JS339BZ,muaowig ombre body wave bundles 1b grey human ...
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it,B08BZ63GMJ,yinhua electric nail drill kit portable profes...


In [9]:
# Count the missing entries in every column
missing_values = df_user.isna().sum(axis=0)
missing_values

rating                           0
review_title                     0
text_review                      0
user_images                      0
product_ID                       0
parent_ID                        0
user_ID                          0
timestamp                        0
helpful_review_vote              0
user_purchase_verification       0
year                             0
cleaned_text                  1405
parent_asin                      0
cleaned_title                  205
dtype: int64

In [11]:
# Rows whose cleaned review text is missing, reduced to the user ID and the text column
missing_values_df = df_user.loc[df_user['cleaned_text'].isna(), ['user_ID', 'cleaned_text']]
missing_values_df

Unnamed: 0,user_ID,cleaned_text
1018,AEL44KNEOUWZC4V6JJDVVTTLJ4KQ,
3535,AHTNEEIQCRWSYXPYOKOASB7G5LNQ,
6110,AGZ4TGVPDLTEHWAZYACZVUVM3PHA,
7711,AFL3CX6PBM6KNH3DAEEBX7HSRODA,
8185,AHSOYEMHO2QD4VBBZLTWSM4TXE2A,
...,...,...
692815,AFRQR3NPUFAIADAGINAAKIJ4BNQQ,
693035,AFNND72LU26MI253ZENFEVOWLKFA,
693180,AHB4JAWKZXYWN6HARHEHOWBG3XKQ,
693231,AG5DOSMAAN27WB33FSR2GFV3IEIA,


In [12]:
# Drop, in place, every row that has a missing value in any column,
# then show the remaining (rows, columns) count.
df_user.dropna(inplace=True)
df_user.shape

(692331, 14)

In [13]:
# Recount missing entries per column to confirm that none are left after the drop
missing_values = df_user.isna().sum()
missing_values

rating                        0
review_title                  0
text_review                   0
user_images                   0
product_ID                    0
parent_ID                     0
user_ID                       0
timestamp                     0
helpful_review_vote           0
user_purchase_verification    0
year                          0
cleaned_text                  0
parent_asin                   0
cleaned_title                 0
dtype: int64

In [14]:
def merge_text_columns(df, col1, col2, new_col_name):
    """
    Merges two textual columns in a DataFrame into a new column with values combined by ', '.

    Missing values are treated as empty strings and surrounding whitespace of each
    value is removed before merging. The ', ' separator is inserted only when BOTH
    values are non-empty, so punctuation that belongs to the text itself (for
    example a trailing comma) is never stripped.

    Parameters:
    - df: pandas DataFrame (modified in place: the new column is added to it)
    - col1: Name of the first column to merge
    - col2: Name of the second column to merge
    - new_col_name: Name of the new column to create

    Returns:
    - The same DataFrame object with the new column added.

    Raises:
    - ValueError: if col1 or col2 is not a column of df.
    """
    if col1 not in df.columns or col2 not in df.columns:
        raise ValueError(f"One or both of the specified columns '{col1}' or '{col2}' do not exist in the DataFrame.")

    # Normalise both sides to clean strings; astype(str) keeps non-string columns from failing on '+'
    left = df[col1].fillna('').astype(str).str.strip()
    right = df[col2].fillna('').astype(str).str.strip()

    # Use the separator only where both parts carry text. The previous
    # `.str.strip(', ')` treated ', ' as a character set and therefore also ate
    # genuine leading/trailing commas and spaces of the text.
    both_present = (left != '') & (right != '')
    separator = pd.Series(', ', index=df.index).where(both_present, '')

    df[new_col_name] = left + separator + right

    return df


# Combine the cleaned review text and the cleaned title into 'merged_review_title'
df_user = merge_text_columns(
    df=df_user,
    col1='cleaned_text',
    col2='cleaned_title',
    new_col_name='merged_review_title',
)

df_user.head()

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,parent_asin,cleaned_title,merged_review_title
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice. it smells really go...,B00YQ6X8EO,herbivore natural sea mist texturizing salt sp...,this spray is really nice. it smells really go...
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just...",B081TJ8YS3,all natural vegan dry shampoo powder eco frien...,"this product does what i need it to do, i just..."
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,"smells good, feels great!",B097R46CSY,new road beauty creamsicle variety 3 pack para...,"smells good, feels great!, new road beauty cre..."
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic,B09JS339BZ,muaowig ombre body wave bundles 1b grey human ...,"felt synthetic, muaowig ombre body wave bundle..."
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it,B08BZ63GMJ,yinhua electric nail drill kit portable profes...,"love it, yinhua electric nail drill kit portab..."


In [17]:
# Character count of every merged text, stored as a new column
df_user['text_length'] = df_user['merged_review_title'].map(len)

# Mean and longest merged-text length
average_length = df_user['text_length'].mean()
max_length = df_user['text_length'].max()

print(f'Max text length (characters): {max_length}')
print(f'Average text length (characters): {average_length}')

Max text length (characters): 14728
Average text length (characters): 283.5129670634422


### Generate BERT embeddings for the whole dataset:

* BertTokenizer:
This is a tokenizer class from the transformers library provided by Hugging Face. It converts raw text into token IDs that a BERT model can process.
* BERT base uncased:
"Base" refers to the smaller version of BERT, with 12 layers, 768 hidden units, and 110 million parameters; "uncased" means the text is lowercased before tokenization.

* Generating embeddings for the user's review with BERT,
* From each cell with review text data, a vector of 768 dimensions will be generated and stored in a cell, in a column called 'embeddings',
* The generated embeddings will be transformed from lists of dimensions into separate columns (one column per dimension) - therefore 768 columns will be created.

In [18]:
# Pick the compute device: the first CUDA GPU ("cuda:0") when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Fetch the pretrained 'bert-base-uncased' tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')



In [19]:
# Move the model to the selected device (the GPU when CUDA is available, otherwise the CPU).
# As the last expression of the cell, the returned model is also displayed.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

Function to get BERT embeddings with the implementation of chunking

In [20]:
def _embed_token_ids(token_ids):
    """
    Runs one sequence of token IDs through the BERT model and mean-pools the output.

    Parameters:
    - token_ids: list of vocabulary IDs WITHOUT special tokens; [CLS] and [SEP] are added here.

    Returns:
    - 1-D numpy array: mean of the last hidden state over all positions (special tokens included).
    """
    input_ids = torch.tensor(
        [[tokenizer.cls_token_id, *token_ids, tokenizer.sep_token_id]],
        dtype=torch.long,
        device=device,
    )
    # A single, unpadded sequence: every position is a real token
    attention_mask = torch.ones_like(input_ids)

    # Inference only: do not record an autograd graph for the forward pass
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    return outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()


def get_bert_embeddings(text, max_length=512, overlap=50):
    """
    Generates a BERT embedding for a given text, chunking texts that do not fit in one model input.

    The text is tokenized once into word pieces. Texts that fit into a single model
    input are embedded directly; longer texts are split into overlapping windows,
    each window is embedded separately and the window embeddings are averaged.

    Parameters:
    - text: the string to embed
    - max_length: maximum model input length INCLUDING the [CLS] and [SEP] tokens (default 512)
    - overlap: number of word-piece tokens shared by consecutive windows (default 50)

    Returns:
    - 1-D numpy array holding the mean-pooled embedding of the text.

    Raises:
    - ValueError: if max_length leaves no room for text tokens, or overlap is
      negative or not smaller than the usable window size.
    """
    # Two positions of every model input are taken by [CLS] and [SEP]. Budgeting the
    # full max_length for text tokens made truncation silently drop the last two
    # tokens of every full chunk.
    window = max_length - 2
    if window <= 0:
        raise ValueError("max_length must be greater than 2 to leave room for [CLS] and [SEP].")
    if not 0 <= overlap < window:
        raise ValueError("overlap must be non-negative and smaller than max_length - 2.")

    # Work on token IDs directly: decoding a chunk back to a string and tokenizing it
    # again is not guaranteed to reproduce the same tokens.
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    stride = window - overlap

    chunk_embeddings = []
    start = 0
    while True:
        chunk_embeddings.append(_embed_token_ids(token_ids[start:start + window]))
        # Stop as soon as a window reaches the end of the text, so no trailing chunk
        # that is already fully covered by this window gets added to the average.
        if start + window >= len(token_ids):
            break
        start += stride

    if len(chunk_embeddings) == 1:
        return chunk_embeddings[0]

    # Aggregate chunk embeddings (unweighted mean across all chunks)
    return np.mean(np.stack(chunk_embeddings), axis=0)

In [21]:
# Sanity check: print whether PyTorch reports an available CUDA device
print(torch.cuda.is_available())

True


### Generating a DataFrame containing columns of embeddings dimensions

In [22]:
# Preview the first rows of df_user before the embeddings are computed
df_user.head()

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,parent_asin,cleaned_title,merged_review_title,text_length
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice. it smells really go...,B00YQ6X8EO,herbivore natural sea mist texturizing salt sp...,this spray is really nice. it smells really go...,363
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just...",B081TJ8YS3,all natural vegan dry shampoo powder eco frien...,"this product does what i need it to do, i just...",402
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,"smells good, feels great!",B097R46CSY,new road beauty creamsicle variety 3 pack para...,"smells good, feels great!, new road beauty cre...",169
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic,B09JS339BZ,muaowig ombre body wave bundles 1b grey human ...,"felt synthetic, muaowig ombre body wave bundle...",190
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it,B08BZ63GMJ,yinhua electric nail drill kit portable profes...,"love it, yinhua electric nail drill kit portab...",199


In [23]:
# Compute one embedding per row by calling get_bert_embeddings on each merged text
# and store the resulting vectors in the 'embeddings' column.
# NOTE(review): the model is invoked once per row, so this is presumably the
# slowest cell of the notebook -- confirm the runtime before re-running it.
df_user['embeddings'] = df_user['merged_review_title'].apply(get_bert_embeddings)

In [24]:
# Preview the first rows again, now including the 'embeddings' column
df_user.head()

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,parent_asin,cleaned_title,merged_review_title,text_length,embeddings
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,this spray is really nice. it smells really go...,B00YQ6X8EO,herbivore natural sea mist texturizing salt sp...,this spray is really nice. it smells really go...,363,"[0.17056495, 0.04877963, 0.4597311, 0.09702098..."
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just...",B081TJ8YS3,all natural vegan dry shampoo powder eco frien...,"this product does what i need it to do, i just...",402,"[0.26909658, 0.2322717, 0.4237466, -0.05495877..."
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,2020,"smells good, feels great!",B097R46CSY,new road beauty creamsicle variety 3 pack para...,"smells good, feels great!, new road beauty cre...",169,"[0.1310498, -0.05716905, 0.7067867, 0.23912415..."
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,2022,felt synthetic,B09JS339BZ,muaowig ombre body wave bundles 1b grey human ...,"felt synthetic, muaowig ombre body wave bundle...",190,"[0.3617491, -0.16617718, 0.67184556, -0.130346..."
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,2020,love it,B08BZ63GMJ,yinhua electric nail drill kit portable profes...,"love it, yinhua electric nail drill kit portab...",199,"[0.2913164, -0.06366797, 0.46250388, 0.0772158..."


In [25]:
# Every cell of the 'embeddings' column holds one vector. Stack those vectors into a
# single array so that each dimension can later become its own DataFrame column.
subset_data_stack = np.stack(df_user['embeddings'].to_numpy())

In [26]:
# Build one column name per embedding dimension: dim_0, dim_1, ...
num_dimensions = np.shape(subset_data_stack)[1]
embedding_columns_names = [f'dim_{dim_index}' for dim_index in range(num_dimensions)]

In [27]:
# Turn the stacked array into a DataFrame with one column per vector dimension
embedding_df = pd.DataFrame(
    data=subset_data_stack,
    columns=embedding_columns_names,
)

In [33]:
# Show the (rows, columns) count of df_user.
# NOTE(review): this cell's execution count (33) is out of sequence with the cells around it.
df_user.shape

(692331, 17)

In [28]:
# Place the per-dimension columns next to the original user columns. The user frame's
# index is reset first so that both frames line up row by row.
df_user_embeddings_final = pd.concat(
    objs=[df_user.reset_index(drop=True), embedding_df],
    axis="columns",
)

In [29]:
# Display the combined frame (original columns followed by the dim_* columns)
df_user_embeddings_final

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,...,-0.002711,-0.460343,0.018323,-0.174986,0.113852,0.096990,-0.149028,-0.278069,-0.180932,0.246443
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,...,-0.179098,-0.336104,0.016370,0.050976,0.144952,-0.161408,-0.334452,-0.221662,-0.220744,0.071765
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,1,...,-0.205647,-0.257025,-0.061465,-0.108916,0.189791,-0.330929,-0.172352,-0.325983,-0.502717,-0.174491
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,1,...,-0.001281,-0.245324,0.073341,-0.141969,-0.022200,0.007424,-0.428374,-0.089052,-0.128445,-0.103974
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,1,...,-0.026411,0.082077,0.251153,-0.188745,0.334880,-0.405307,-0.672850,-0.250945,-0.125713,-0.129676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692326,4,Four Stars,Conditioner is great shampoo not as I expected,[],B006YUIWKA,B006YUIWKA,AFIXGFVEGLMOTMBTJL7H3VSIETDQ,2016-11-04 02:37:01.000,0,1,...,0.194189,-0.095605,0.019948,-0.220489,0.129437,-0.043423,-0.074754,-0.617389,-0.312533,-0.170907
692327,1,Pretty,Did not work! Used the whole bottle and my hai...,[],B006YUIWKA,B006YUIWKA,AFV7YZFOJF564EZGET5LG45K4QEA,2016-12-05 03:32:10.000,0,0,...,0.199543,-0.096083,-0.008719,-0.274877,0.231822,-0.036597,-0.263846,-0.574784,-0.063765,-0.064832
692328,5,Great sunless tanner,Product as expected. Shipping was on time.,[],B06ZZV9MZT,B06ZZV9MZT,AHYDCWDMMVMLBX7FY7M7JKADKRDQ,2020-05-27 02:52:54.067,0,1,...,0.042655,-0.225976,-0.042821,-0.493260,0.021015,-0.141857,-0.203021,-0.284647,-0.169166,-0.127791
692329,5,The Crown on top is a Ring!!!,"Not only is it a delicious fragrance, but also...",[],B000HB6VLE,B000HB6VLE,AF6ZIAEN7TQ2WY5ZL77F6JDPV7XQ,2007-07-18 22:36:49.000,4,0,...,-0.039305,-0.131451,0.051593,-0.222641,-0.282173,-0.137917,0.041218,-0.309228,0.144367,-0.176293


Save the output

In [30]:
# Write the combined frame to a CSV whose file name carries the current date and time.
# Note: `file_path` is rebound here (it held the input path earlier), and the frame
# still contains the 'embeddings' column alongside the dim_* columns.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
file_path = rf'.\..\data\embeddings_output\df_user_embeddings_BERT_merged{timestamp}.csv'

df_user_embeddings_final.to_csv(path_or_buf=file_path, index=False)

In [31]:
# Also store a gzip-compressed copy of the same frame under a fixed file name
df_user_embeddings_final.to_csv(
    path_or_buf=r".\..\data\embeddings_output\df_user_embeddings_BERT_merged.csv.gz",
    index=False,
    compression="gzip",
)