In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import ast
import re
from datetime import datetime

Import data

In [21]:
# load the dataset of PCA normalized embeddings: 
file_path = rf'.\..\data\embeddings_dim_reduction\df_user_normalized_PCA_merged.csv' # adjust the file_path with the proper name of the file
df_user_normalized_PCA = pd.read_csv(file_path) 

In [22]:
df_user_normalized_PCA['cleaned_text']

0         this spray is really nice. it smells really go...
1         this product does what i need it to do, i just...
2                                 smells good, feels great!
3                                            felt synthetic
4                                                   love it
                                ...                        
692326       conditioner is great shampoo not as i expected
692327    did not work! used the whole bottle and my hai...
692328           product as expected. shipping was on time.
692329    not only is it a delicious fragrance, but also...
692330    the conditioner doesnt really make your hair t...
Name: cleaned_text, Length: 692331, dtype: object

In [31]:
df_user_sentiment = df_user_normalized_PCA[['product_ID','cleaned_text']]

In [32]:
df_user_sentiment

Unnamed: 0,product_ID,cleaned_text
0,B00YQ6X8EO,this spray is really nice. it smells really go...
1,B081TJ8YS3,"this product does what i need it to do, i just..."
2,B07PNNCSP9,"smells good, feels great!"
3,B09JS339BZ,felt synthetic
4,B08BZ63GMJ,love it
...,...,...
692326,B006YUIWKA,conditioner is great shampoo not as i expected
692327,B006YUIWKA,did not work! used the whole bottle and my hai...
692328,B06ZZV9MZT,product as expected. shipping was on time.
692329,B000HB6VLE,"not only is it a delicious fragrance, but also..."


## 1. Sentiment analysis of the user's reviews

### 1.1 Calculate sentiment score of each user's review.

In [33]:
# Step 1: Install necessary libraries (if not already installed)
!pip install textblob

from textblob import TextBlob

# Step 2: Define a function for sentiment analysis
def analyze_sentiment(text):
    # Create a TextBlob object
    blob = TextBlob(text)
    
    # Get the polarity of the text
    polarity = blob.sentiment.polarity
    return polarity

# Step 3: Apply sentiment analysis to the 'cleaned_text' column
df_user_sentiment[['sentiment_polarity']] = df_user_sentiment['cleaned_text'].apply(analyze_sentiment).apply(pd.Series)

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------------------------------- 626.3/626.3 kB 7.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sentiment[['sentiment_polarity']] = df_user_sentiment['cleaned_text'].apply(analyze_sentiment).apply(pd.Series)


In [34]:
df_user_sentiment

Unnamed: 0,product_ID,cleaned_text,sentiment_polarity
0,B00YQ6X8EO,this spray is really nice. it smells really go...,0.390104
1,B081TJ8YS3,"this product does what i need it to do, i just...",0.250000
2,B07PNNCSP9,"smells good, feels great!",0.850000
3,B09JS339BZ,felt synthetic,0.000000
4,B08BZ63GMJ,love it,0.500000
...,...,...,...
692326,B006YUIWKA,conditioner is great shampoo not as i expected,0.350000
692327,B006YUIWKA,did not work! used the whole bottle and my hai...,-0.083333
692328,B06ZZV9MZT,product as expected. shipping was on time.,-0.100000
692329,B000HB6VLE,"not only is it a delicious fragrance, but also...",0.531250


In [37]:
df_user_sentiment_agg = df_user_sentiment.groupby('product_ID')['sentiment_polarity'].mean().reset_index()

In [38]:
df_user_sentiment_agg

Unnamed: 0,product_ID,sentiment_polarity
0,0005946468,0.800000
1,0123034892,0.700000
2,0124784577,0.296296
3,0515059560,0.052083
4,0615675026,-0.099841
...,...,...
115551,B0CBMFK1S2,0.316667
115552,B0CBWDTY41,0.469333
115553,B0CBXM7WHY,0.490000
115554,B0CCPDTRK7,0.249306


In [40]:
df_user_positive_sentiment_agg = df_user_sentiment_agg[df_user_sentiment_agg['sentiment_polarity'] > 0]

In [41]:
df_user_positive_sentiment_agg

Unnamed: 0,product_ID,sentiment_polarity
0,0005946468,0.800000
1,0123034892,0.700000
2,0124784577,0.296296
3,0515059560,0.052083
5,0692508988,0.398333
...,...,...
115551,B0CBMFK1S2,0.316667
115552,B0CBWDTY41,0.469333
115553,B0CBXM7WHY,0.490000
115554,B0CCPDTRK7,0.249306


In [63]:
df_user_positive_sentiment_agg[df_user_positive_sentiment_agg['product_ID']=='B085TBXF1Z']

Unnamed: 0,product_ID,sentiment_polarity
79514,B085TBXF1Z,0.20487


In [23]:
# Verify the number of unique users:
unique_values_user = df_user_normalized_PCA.user_ID.nunique()
unique_values_user

630315

In [24]:
df_user_normalized_PCA['user_ID'].value_counts()

user_ID
AG73BVBKUOH22USSFJA5ZWL7AKXA      165
AEZP6Z2C5AVQDZAJECQYZWQRNG3Q      146
AEMP3A7IKW37CMWFXNKXWW6HGJHA_1    115
AGZUJTI7A3JFKB4FP5JOH6NVAJIQ_1     87
AFDYIK3FNPY2JFBQYUWC6GSBMIRQ_2     83
                                 ... 
AEX24LYNE5WPGISAP425SQPUKJXQ        1
AFSMCZTEUW3TI2BSPE25BD5GKXLA        1
AGKNUO4XOIPCSIKDRHO56UQDPXVQ        1
AEJQRDONU2O5LSOD5OC77XO43DFA        1
AGIYQU6RK6TBKBCMWKVPBPBMMJNA        1
Name: count, Length: 630315, dtype: int64

In [25]:
df_user_normalized_PCA.shape

(692331, 314)

In [6]:
unique_combinations = df_user_normalized_PCA[['user_ID', 'product_ID']].value_counts()
unique_combinations

user_ID                       product_ID
AGWOOXMW2IXPKZOWAIWNMCXY7LBQ  B09NS1VG4L    2
AGPGHXWBOK3TRJU7WACF5AAYBBVQ  B0B18CG44R    1
AGPGHNJ32YJHVI2YKOBYQHSJS4WQ  B07L5QVV6W    1
AGPGHPOQZP7XKLM6KJF2E6JT3D5A  B07DLF8HKR    1
AGPGHQIMPLOJD3FR3ODRDJFYSJBQ  B079D87KKM    1
                                           ..
AFENC7VBDKGZQXZO437KEQWP45WA  B07Z2QTWSW    1
AFENCIKTUQKQM2YPBVDGBB4S6OZQ  B01KU867GM    1
AFENCVCHNNX3UATG4KRVIPYJLLGQ  B00YZ0XOWC    1
AFENDQSAV5XVGGXKHRSDE3DUYK4A  B00D4T40D2    1
AHZZZSOTVOVACVK2WWXL4ITEAPIA  B00R1TAN7I    1
Name: count, Length: 692330, dtype: int64

## 2. Create User and Product Vectors

* Aggregating the embeddings to compute a single user vector per user_ID and single product vector per product_ID.
* To create a vector for each user, we can average all the review embeddings corresponding to that user. This works because averaging retains a general sense of the user's overall preferences.
* Similarly, we can aggregate the embeddings for each product. For example, average all the embeddings corresponding to each product ID.

### 2.1 Aggregate User Vectors

In [7]:
# Select relevant columns: user_ID and embedding dimensions
vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_user_vectors = df_user_normalized_PCA.groupby('user_ID')[vector_cols].mean().reset_index()

# Rename the columns of vectors to indicate these are user vectors:
df_user_vectors.columns = ['user_ID'] + [f'user_vec_{i}' for i in range(300)]

df_user_vectors.shape

(630315, 301)

In [8]:
df_user_vectors.head()

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,0.022338,0.100595,-0.206153,0.065659,0.023273,0.01193,0.054218,0.009806,0.07182,...,-0.001122,0.004642,-0.001718,-0.007591,-0.007428,0.001374,-0.01839,-0.007487,0.000145,0.006247
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,0.022153,0.030388,-0.094107,-0.025539,0.093637,0.17047,-0.03177,-0.060638,-0.060701,...,0.001796,-0.002748,0.003149,0.006347,0.020818,-0.002077,0.013311,-0.006835,-0.007763,-0.002501
2,AE222X475JC6ONXMIKZDFGQ7IAUA,-0.012598,0.219775,0.142422,-0.01776,0.129527,0.076851,0.021589,-0.034965,-0.045799,...,-0.022207,-0.002645,0.041338,-0.015161,0.011363,-0.026213,0.002272,0.003468,-0.002008,0.002792
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,-0.231625,-0.016032,-0.102149,0.066109,-0.050187,0.007027,0.020258,0.010995,-0.007235,...,-0.006156,-0.003489,-0.000958,0.016319,-0.011625,-0.000594,0.007983,-0.010001,0.012913,-0.001186
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,0.237254,-0.08806,0.097924,0.076692,0.160438,0.100078,-0.051536,-0.046282,0.132318,...,0.003213,-0.006024,-0.003414,0.001399,0.021043,0.006149,0.000152,-0.002788,0.0106,0.002627


In [9]:
# Save the new DataFrame with user vectors

file_path_user_vec = rf'.\..\data\text_analysis\user_vectors\user_vectors_merged.csv'
df_user_vectors.to_csv(file_path_user_vec, index=False)

### 2.2 Aggregate Product Vectors

In [43]:
#product_vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_product_vectors = df_user_normalized_PCA.groupby('product_ID')[vector_cols].mean().reset_index()

df_product_vectors.columns = ['product_ID'] + [f'product_vec_{i}' for i in range(300)]
df_product_vectors.shape

(115556, 301)

Filter out product vectors with positive sentiment

In [46]:
df_product_vectors_positive_sentiment = df_product_vectors[df_product_vectors['product_ID'].isin(df_user_positive_sentiment_agg['product_ID'])]

In [47]:
df_product_vectors_positive_sentiment

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,0005946468,0.005288,0.247759,0.103427,-0.091621,0.005084,-0.099639,-0.107011,0.122461,0.012713,...,0.002068,0.009174,-0.019991,-0.003565,-0.003649,-0.017079,-0.013606,0.001881,0.015874,-0.002307
1,0123034892,0.109861,0.275601,0.000386,-0.047517,0.004198,-0.083188,-0.079647,-0.121367,0.129342,...,-0.007957,-0.009873,0.003017,0.015212,-0.005792,0.007390,0.004461,0.001337,0.009957,-0.012407
2,0124784577,-0.017208,0.132227,0.100658,-0.126151,0.059223,0.032017,-0.042822,0.087571,0.032354,...,-0.005163,-0.009594,-0.002201,0.013803,0.002335,0.004500,-0.009917,0.001281,-0.000278,-0.003500
3,0515059560,-0.209201,-0.058736,0.202120,0.062009,-0.090060,0.028901,-0.018587,0.080671,-0.056896,...,-0.010150,0.030672,-0.013799,0.020456,-0.005572,-0.030364,0.009011,-0.004363,-0.024073,0.008783
5,0692508988,-0.229014,-0.004910,0.084747,0.074721,-0.113337,0.017797,-0.113408,0.059619,-0.109043,...,0.002954,0.007351,-0.000078,0.020734,-0.009473,-0.010563,0.007831,0.008652,-0.000035,0.000989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115551,B0CBMFK1S2,-0.067334,0.154381,-0.197790,-0.023412,0.010574,-0.022570,0.074094,-0.099601,0.037623,...,0.010685,-0.005477,0.001734,0.002790,-0.004053,0.008070,-0.002784,-0.001871,0.003500,0.010731
115552,B0CBWDTY41,0.069057,0.038905,-0.166532,-0.235593,-0.038300,0.060362,-0.100929,0.040324,-0.045593,...,-0.010125,0.001486,0.003190,0.004283,0.005094,-0.000096,-0.000655,-0.001726,0.001275,-0.004093
115553,B0CBXM7WHY,-0.169268,0.007456,0.089662,-0.067994,0.013050,0.057634,-0.050431,-0.041138,0.025872,...,-0.009981,0.013271,-0.008269,-0.003295,0.014088,-0.008365,0.004828,0.003399,0.010965,-0.004015
115554,B0CCPDTRK7,0.118037,-0.196848,-0.070833,-0.118934,0.010600,0.104431,-0.061131,-0.024269,-0.122102,...,0.002205,0.009895,0.001180,-0.001901,0.007467,-0.010565,0.000210,0.000453,0.006803,-0.002013


In [49]:
# Save the new DataFrame with product vectors

file_path_product_vec = rf'.\..\data\text_analysis\product_vectors\product_vectors_merged_sentiment.csv'
df_product_vectors_positive_sentiment.to_csv(file_path_product_vec, index=False)

## 3. Semantic Analysis

### Compare Reviews - Similarity Research with COS Similarity

* Extract the user and product vectors.
* Compute the cosine similarity between each user and all products.
* Rank products for each user based on similarity scores.

Create a DataFrame that stores all items that user purchased, to remove those from the recommendations

In [13]:
user_item_df = df_user_normalized_PCA.groupby('user_ID')['product_ID'].apply(list).reset_index()
user_item_df

Unnamed: 0,user_ID,product_ID
0,AE222BBOVZIF42YOOPNBXL4UUMYA,[B013HR1A92]
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,[B0BTT658PQ]
2,AE222X475JC6ONXMIKZDFGQ7IAUA,[B00PBDMRES]
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,[B00012FPSO]
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,[B07QNPXBLH]
...,...,...
630310,AHZZYVEU6QFMPFZ2HJUWR22SNK4A,[B07JJ9NFFH]
630311,AHZZZAK24AJ3JNBDUZJGHHWSRVAA,[B00KXFD75M]
630312,AHZZZJP24QUSB5XWW6MAXYBZZZSQ,[B00IG0677G]
630313,AHZZZL7YQJA3RSA6PYK3WMFACYIQ,[B091TKH1JF]


In [50]:
# Extract user vectors
user_ids = df_user_vectors['user_ID'].values
user_vectors = df_user_vectors.iloc[:, 1:].values

# Extract product vectors
product_ids = df_product_vectors_positive_sentiment['product_ID'].values
product_vectors = df_product_vectors_positive_sentiment.iloc[:, 1:].values

In [52]:
print(f"Shape of User Vectors: {user_vectors.shape}")
print(f"Shape of Product Vectors: {product_vectors.shape}")

Shape of User Vectors: (630315, 300)
Shape of Product Vectors: (96526, 300)


In [53]:
# Define the number of closest product vectors to compute
top_n_products = 300 # find the top 300 closest products for each user instead of computing all pairwise similarities.
# Fit the NearestNeighbors model on the product vectors
nbrs = NearestNeighbors(n_neighbors=top_n_products, metric='cosine').fit(product_vectors)

# Retrieve the top-N most similar products for each user
distances, indices = nbrs.kneighbors(user_vectors)

# `distances` contains the cosine distances (1 - similarity)
# Convert distances to similarity scores
similarity_scores = 1 - distances

In [54]:
# Save the indices and similarity scores
np.savez(r'.\..\data\cos_similarity\recommendations_300_merged_sentiment.npz', indices=indices, similarity_scores=similarity_scores)