In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import ast
import re
from datetime import datetime

### 1. Import data

In [24]:
# Load the precomputed recommendation arrays: per-user product indices and
# the matching similarity scores.
# The archive is opened in a `with` block so its file handle is released as
# soon as both arrays have been read, and the path uses forward slashes so it
# resolves on Windows, Linux and macOS alike.
with np.load('./../data/cos_similarity/recommendations_300.npz') as data:
    indices = data["indices"]
    similarity_scores = data["similarity_scores"]

In [4]:
# Load the DataFrame with user vectors.
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (the previous backslash-separated path only resolved on Windows).
file_path_user_vec = './../data/text_analysis/user_vectors/user_vectors_df.csv'
df_user_vectors = pd.read_csv(file_path_user_vec)
df_user_vectors

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,0.157177,-0.047737,-0.016334,-0.216434,-0.053552,-0.098437,-0.042217,0.004862,0.124219,...,0.000074,-0.003814,0.016722,-0.003910,0.001206,0.005939,-0.018628,0.000775,-0.005081,-0.010397
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,0.045970,-0.018998,0.164447,-0.194383,0.170237,0.089281,-0.069243,0.064173,-0.061230,...,0.012602,-0.014370,0.013525,0.015043,0.007685,0.017838,0.014043,0.019642,-0.018238,0.001468
2,AE222X475JC6ONXMIKZDFGQ7IAUA,0.410081,0.025667,-0.121930,0.184316,0.005100,-0.151829,0.046592,0.091186,0.147147,...,0.013997,0.025863,-0.006677,-0.014606,0.005337,0.008045,0.014762,0.001563,0.011351,0.010184
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,-0.204608,-0.027967,-0.074068,0.101387,0.089039,-0.184170,-0.045104,-0.112713,0.019092,...,-0.007948,0.013191,-0.006841,-0.000236,-0.015392,-0.002550,-0.005537,0.006153,0.022717,-0.003635
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,0.306584,-0.059949,0.359815,-0.206749,0.433880,-0.091246,0.383815,0.030456,-0.088499,...,0.000389,-0.002188,0.011391,-0.011571,0.013657,0.018282,-0.001572,0.000488,0.012923,-0.009286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630471,AHZZYVEU6QFMPFZ2HJUWR22SNK4A,0.593454,-0.110751,0.061504,0.057214,0.009090,0.047510,-0.150822,-0.063656,0.111528,...,-0.030163,0.023398,0.014013,-0.002637,-0.001210,0.005657,0.001707,-0.009059,0.008571,-0.018598
630472,AHZZZAK24AJ3JNBDUZJGHHWSRVAA,-0.151946,-0.079085,0.160790,-0.169700,-0.059934,-0.022604,-0.079591,-0.032386,0.089506,...,0.009529,0.003296,-0.014211,0.008373,0.007964,0.011437,-0.014251,-0.011531,-0.000954,-0.005996
630473,AHZZZJP24QUSB5XWW6MAXYBZZZSQ,0.219923,-0.060210,-0.184222,-0.095579,0.057761,0.069270,-0.088772,-0.038855,-0.106477,...,0.002465,0.002635,-0.014682,0.016696,0.000290,-0.008902,0.010821,0.003087,-0.008261,0.008120
630474,AHZZZL7YQJA3RSA6PYK3WMFACYIQ,-0.142785,0.088419,-0.049655,0.022085,0.039177,0.011978,0.020563,-0.029427,0.039243,...,-0.010772,0.020052,-0.009797,0.006591,0.000189,-0.004727,0.011461,0.003287,0.003351,0.008096


In [3]:
# Load the DataFrame with product vectors.
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (the previous backslash-separated path only resolved on Windows).
file_path_product_vec = './../data/text_analysis/product_vectors/product_vectors_df.csv'
df_product_vectors = pd.read_csv(file_path_product_vec)
df_product_vectors

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,0005946468,0.763349,0.091675,0.117928,0.099401,0.132281,0.070900,-0.223043,-0.012843,-0.009337,...,-0.000546,0.003127,0.005684,0.003426,-0.006655,-0.001777,-0.009676,0.000570,-0.001297,0.007379
1,0123034892,0.578087,-0.060032,-0.136238,-0.096158,-0.007847,-0.121033,-0.108942,0.071681,0.014017,...,-0.002423,-0.005551,0.002831,-0.002318,0.003190,0.003064,-0.000896,-0.000299,-0.010328,0.001244
2,0124784577,0.055595,-0.048237,0.045403,0.001712,0.206265,0.038459,0.181421,0.045733,-0.041820,...,-0.006233,-0.002329,0.004678,0.007249,-0.001499,0.003474,0.001119,0.008357,-0.006440,0.007834
3,0515059560,-0.141502,0.079938,0.102094,-0.057478,-0.011861,-0.127184,-0.016523,-0.101494,-0.038372,...,-0.004293,-0.011067,0.000231,0.007716,-0.003150,0.018886,-0.001149,-0.004640,0.000215,0.001940
4,0615675026,-0.187639,0.006820,0.150007,-0.152691,0.051482,-0.066133,-0.137043,-0.074076,0.021345,...,-0.000969,0.008569,-0.003590,-0.015260,0.002899,-0.004042,0.004385,-0.012813,0.003399,-0.007380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115571,B0CBMFK1S2,-0.149024,-0.046643,-0.168177,0.006320,0.041629,-0.095680,0.010731,0.121839,-0.117584,...,0.008490,-0.027375,0.002692,0.002163,0.021168,-0.004761,-0.003061,-0.000419,-0.001015,0.004682
115572,B0CBWDTY41,-0.100371,0.046079,-0.234349,0.136378,0.072183,-0.078564,0.072506,0.096607,0.007511,...,0.009562,0.001192,-0.000367,-0.001705,-0.007611,0.007333,-0.003466,0.004164,-0.000073,-0.000403
115573,B0CBXM7WHY,-0.128249,-0.042449,0.129125,-0.057023,0.020782,0.061780,-0.092159,0.122308,0.098822,...,-0.008240,-0.005579,-0.009333,-0.011390,-0.003523,0.003123,-0.000537,-0.010630,0.007802,-0.012785
115574,B0CCPDTRK7,-0.222002,0.092505,-0.143339,0.069648,0.143655,0.148648,0.053853,-0.133939,0.024819,...,-0.012326,-0.004668,-0.009286,-0.013051,-0.003388,0.012504,0.008019,0.002882,0.018385,-0.000772


In [5]:
# Load the dataset of PCA-normalized embeddings.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
file_path = './../data/embeddings_dim_reduction/df_user_normalized_PCA.csv'  # adjust the filepath with the proper file
df_user_normalized_PCA = pd.read_csv(file_path)

In [6]:
# Load the cleaned user review data used further down to inspect the recommendations.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
file_path = './../data/data_clean/user_clean_data_ecommerce.csv'  # Update this with your file path
df_user = pd.read_csv(file_path)

### 2. Run recommendations for the specific user_id

### Compare Reviews - Similarity Research with Cosine Similarity

* Extract the user and product vectors.
* Compute the cosine similarity between each user and all products (already precalculated: the loaded arrays store the 300 closest product vectors for each user).
* Rank products for each user based on similarity scores.

Create a DataFrame that stores all items each user has purchased, so that those items can be removed from the recommendations.

In [7]:
# One row per user; `product_ID` holds the list of products found for that
# user in the PCA embeddings table.
products_by_user = df_user_normalized_PCA.groupby('user_ID')['product_ID']
user_item_df = products_by_user.apply(list).reset_index()
user_item_df

Unnamed: 0,user_ID,product_ID
0,AE222BBOVZIF42YOOPNBXL4UUMYA,[B013HR1A92]
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,[B0BTT658PQ]
2,AE222X475JC6ONXMIKZDFGQ7IAUA,[B00PBDMRES]
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,[B00012FPSO]
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,[B07QNPXBLH]
...,...,...
630471,AHZZYVEU6QFMPFZ2HJUWR22SNK4A,[B07JJ9NFFH]
630472,AHZZZAK24AJ3JNBDUZJGHHWSRVAA,[B00KXFD75M]
630473,AHZZZJP24QUSB5XWW6MAXYBZZZSQ,[B00IG0677G]
630474,AHZZZL7YQJA3RSA6PYK3WMFACYIQ,[B091TKH1JF]


In [26]:
# Number of rows per user, most frequent users first.
df_user_normalized_PCA.loc[:, 'user_ID'].value_counts()

user_ID
AG73BVBKUOH22USSFJA5ZWL7AKXA      165
AEZP6Z2C5AVQDZAJECQYZWQRNG3Q      146
AEMP3A7IKW37CMWFXNKXWW6HGJHA_1    115
AGZUJTI7A3JFKB4FP5JOH6NVAJIQ_1     87
AFDYIK3FNPY2JFBQYUWC6GSBMIRQ_2     83
                                 ... 
AFSMCZTEUW3TI2BSPE25BD5GKXLA        1
AGKNUO4XOIPCSIKDRHO56UQDPXVQ        1
AEJQRDONU2O5LSOD5OC77XO43DFA        1
AFFFHL7GG5FLD2TSUGU65HTN6FMA        1
AGIYQU6RK6TBKBCMWKVPBPBMMJNA        1
Name: count, Length: 630476, dtype: int64

In [27]:
# Count the rows for every (user_ID, product_ID) pair; a count above 1 means
# the same pair appears on more than one row.
pair_columns = ['user_ID', 'product_ID']
unique_combinations = df_user_normalized_PCA[pair_columns].value_counts()
unique_combinations

user_ID                       product_ID
AGWOOXMW2IXPKZOWAIWNMCXY7LBQ  B09NS1VG4L    2
AE222BBOVZIF42YOOPNBXL4UUMYA  B013HR1A92    1
AGPGHQIMPLOJD3FR3ODRDJFYSJBQ  B079D87KKM    1
AGPGGDJBP4W2D3QJ2WN3NWHSPA7Q  B08791HQXG    1
AGPGGF3KFAOMNATUGFSZEMRJ6PVQ  B07TXYVLPS    1
                                           ..
AFENAWCNZDSJANL43HMAQDOIN5QQ  B07D33K512    1
AFENAYIMKNX6PGBHATFCTZS2SAAQ  B008QSM704    1
AFENB2HA5MVZWNKRICDWRXR5PCDA  B001E76F6G    1
AFENBUWI2IGQ5ZBTH4XE36QRIDLA  B07FGFWKXM    1
AHZZZSOTVOVACVK2WWXL4ITEAPIA  B00R1TAN7I    1
Name: count, Length: 692535, dtype: int64

In [9]:
# Split each vector table into an array of IDs and a matrix built from every
# column after the first one.

# Users
user_ids, user_vectors = (
    df_user_vectors.loc[:, 'user_ID'].values,
    df_user_vectors.iloc[:, 1:].values,
)

# Products
product_ids, product_vectors = (
    df_product_vectors.loc[:, 'product_ID'].values,
    df_product_vectors.iloc[:, 1:].values,
)

In [25]:
def recommend_top_n_products_by_user_id(user_id, user_ids, similarity_scores, indices, product_ids, user_item_df, top_n=5):
    """
    Recommend up to ``top_n`` not-yet-purchased products for a single user.

    The candidates come from precomputed per-user rankings: if ``user_id`` sits
    at position ``k`` in ``user_ids``, then ``indices[k]`` holds positions into
    ``product_ids`` and ``similarity_scores[k]`` holds the matching scores.
    Candidates are visited in their stored order and are NOT re-sorted here, so
    the result is only "best first" if the precomputed rows already are.

    Parameters:
    - user_id: User ID for whom to generate recommendations
    - user_ids: Sequence (list, NumPy array or pandas Series) of user IDs, one
      per row of ``similarity_scores`` / ``indices``
    - similarity_scores: Precomputed similarity scores (users x candidates)
    - indices: Positions into ``product_ids`` of each user's candidates
      (users x candidates), aligned element-wise with ``similarity_scores``
    - product_ids: Sequence of product IDs addressed by ``indices``
    - user_item_df: DataFrame with columns ``user_ID`` and ``product_ID``;
      ``product_ID`` may hold a single ID or a list of IDs per row
    - top_n: Maximum number of recommendations to return

    Returns:
    - List of (product_id, similarity_score) tuples. It can be shorter than
      ``top_n`` when too few unpurchased candidates exist, and it is empty
      when ``top_n`` is zero or negative.

    Raises:
    - ValueError: if ``user_id`` does not occur in ``user_ids``
    """

    # Normalise to an array so that a plain Python list works as well: with a
    # list, `user_ids == user_id` is a single False rather than an element-wise
    # mask, and the position lookup would fail with an IndexError.
    user_ids = np.asarray(user_ids)
    matches = np.flatnonzero(user_ids == user_id)
    if matches.size == 0:
        raise ValueError(f"User ID '{user_id}' not found in the user data.")
    user_index = matches[0]  # first occurrence of the user

    # No recommendations requested: return early instead of emitting one item.
    if top_n <= 0:
        return []

    # Precomputed candidate scores and product positions for this user
    user_similarities = similarity_scores[user_index]
    user_product_indices = indices[user_index]

    # Collect everything the user already purchased. Each matching row is
    # handled on its own, so list-valued and scalar entries can be mixed
    # without a string being split into single characters.
    purchased_products = set()
    user_rows = user_item_df.loc[user_item_df['user_ID'] == user_id, 'product_ID']
    for entry in user_rows:
        if isinstance(entry, (list, tuple, set, np.ndarray)):
            purchased_products.update(entry)
        else:
            purchased_products.add(entry)

    # Walk the candidates in stored order, skipping purchased products, and
    # stop as soon as enough recommendations have been gathered.
    recommendations = []
    for product_index, score in zip(user_product_indices, user_similarities):
        product = product_ids[product_index]
        if product in purchased_products:
            continue
        recommendations.append((product, score))
        if len(recommendations) >= top_n:
            break

    return recommendations

# Example usage: request recommendations for one user and print them.
user_id_input = "AGKHLEW2SOWHNMFQIJGBECAF7INQ"  # Replace with user_ID
top_n = 5

try:
    recommendations = recommend_top_n_products_by_user_id(
        user_id=user_id_input,
        user_ids=user_ids,
        similarity_scores=similarity_scores,
        indices=indices,
        product_ids=product_ids,
        user_item_df=user_item_df,
        top_n=top_n,
    )

    # Header first, then one line per recommended product
    print(f"Top-{top_n} Recommendations for User '{user_id_input}':")
    for product_id, score in recommendations:
        print(f"Product ID: {product_id}, Similarity Score: {score:.4f}")
except ValueError as lookup_error:
    # Raised by the recommender when the user ID is unknown
    print(lookup_error)


Top-5 Recommendations for User 'AGKHLEW2SOWHNMFQIJGBECAF7INQ':
Product ID: B0170FP8CC, Similarity Score: 0.7362
Product ID: B0BQWTXV2Q, Similarity Score: 0.7361
Product ID: B08JQS9FVP, Similarity Score: 0.7267
Product ID: B09TQ2SDKK, Similarity Score: 0.7252
Product ID: B085TBXF1Z, Similarity Score: 0.7218


In [15]:
# Find users that appear on more than one row (i.e. users with more than one
# recorded purchase) so the recommendations can be verified for them.
user_id_column = df_user_normalized_PCA['user_ID']
duplicated_user_ids = user_id_column[user_id_column.duplicated()].unique()
duplicated_user_ids

array(['AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
       'AFSKPY37N3C43SOI5IEXEK5JSIYA', ...,
       'AFV3EYFZLLLBWIXWRZUSRJOHLNBA', 'AEVTGJFLW22HVSWOJLJCBJUN46WA',
       'AHURE3VT2MLCTARMYI7JA7KKDYAA'], dtype=object)

In [22]:
# Do not truncate long cell contents when DataFrames are displayed.
pd.options.display.max_colwidth = None

Analyse the user's purchases:

In [23]:
# Show every review written by the user the recommendations were generated for.
# `user_id_input` is reused instead of repeating the ID literal, so this cell
# always stays in sync with the recommendation cell.
single_user_check = df_user[df_user['user_ID'] == user_id_input]
single_user_check

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text
0,5,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,"this spray is really nice. it smells really good, goes on really fine, and does the trick. i will say it feels like you need a lot of it though to get the texture i want. i have a lot of hair, medium thickness. i am comparing to other brands with yucky chemicals so im gonna stick with this. try it!"
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was described but I was hoping it would be light)",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just wish it was odorless or had a soft coconut smell. having my head smell like an orange coffee is offputting. granted, i did know the smell was described but i was hoping it would be light"


Analyse recommended products:

In [20]:
# Look up the reviews of the recommended products. The IDs are taken from the
# `recommendations` result rather than copied by hand, so the cell follows
# whatever user and `top_n` were chosen in the recommendation cell.
product_ids_to_filter = [product_id for product_id, _ in recommendations]  # List of product IDs you want to filter
filtered_df = df_user[df_user['product_ID'].isin(product_ids_to_filter)]
filtered_df.head(5)

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text
2398,4,Super Gentle,"My skin is really sensitive.<br /><br />I also don't generally like all-in-one products. You should see my bathroom cabinets. Trying to consolidate, but I get different rashes in different places, and no one thing works for everything.<br /><br />This is very gentle and did not make my skin or face or scalp break out or rash up at all. It's not super moisturizing, but it wasn't drying, either. The scent is very light, and the cleanser has a little minty/tingly kick. This would have worked for me in my teens or 20s when I couldn't wait to get out of the shower and on with my life.",[],B0BQWTXV2Q,B0BQWTXV2Q,AENH6LSB6BM7XLPEYUL43WBOD6JA,2023-02-06 18:45:25.633,0,0,2023,"my skin is really sensitive.i also dont generally like allinone products. you should see my bathroom cabinets. trying to consolidate, but i get different rashes in different places, and no one thing works for everything.this is very gentle and did not make my skin or face or scalp break out or rash up at all. its not super moisturizing, but it wasnt drying, either. the scent is very light, and the cleanser has a little mintytingly kick. this would have worked for me in my teens or 20s when i couldnt wait to get out of the shower and on with my life."
3381,4,Strong scent but works great,"Really happy with this concentrate. I did not expect it to work well but am very thankful to be wrong. It thickens to the normal consistency of conditioners after it cools. It’s very moisturizing for my thick curly hair. My only issue was the scent. Very strong herbal scent. It was not really my thing but a little added essential oil was all it needed. However, in the future I’ll just order the unscented and add my own scent",[],B08JQS9FVP,B08JQS9FVP,AFPAGWUQX3ELC4PWOOATIP7EDMOA,2021-02-12 20:57:00.886,3,1,2021,"really happy with this concentrate. i did not expect it to work well but am very thankful to be wrong. it thickens to the normal consistency of conditioners after it cools. its very moisturizing for my thick curly hair. my only issue was the scent. very strong herbal scent. it was not really my thing but a little added essential oil was all it needed. however, in the future ill just order the unscented and add my own scent"
4327,3,Strongly scented,"The smell is really strong. Not at all subtle. It reminds me of scented talcum powders my Grandma used to use, kind of a spicy floral smell. I personally think it is too strong and would not want to go around smelling that way if I can help it. That said, it works well to decrease the appearance of oily roots and can extend your hair wash by a day for sure, if you can stand the smell. I will keep it as a backup for when I run out of the other lighter scented dry shampoos.",[],B0170FP8CC,B0170FP8CC,AE5S2ACTELDBCIGS2M5377BIAB7Q,2017-10-13 22:26:41.602,1,1,2017,"the smell is really strong. not at all subtle. it reminds me of scented talcum powders my grandma used to use, kind of a spicy floral smell. i personally think it is too strong and would not want to go around smelling that way if i can help it. that said, it works well to decrease the appearance of oily roots and can extend your hair wash by a day for sure, if you can stand the smell. i will keep it as a backup for when i run out of the other lighter scented dry shampoos."
8252,5,Great product in a convenient package with ingredients I recognize!,I admit I ordered this product because I loved the &#34;mermaid&#34; part of the name and because the ingredients were ones I could recognize. I will order again because it works. My hair--blonde by Aveda--looks great and does have more volume between complete washings. I am planning several trips this year and this Captain Blankenship product will be traveling with me for the quick lifts and for in-depth pick-me-ups between shampoo/conditionings. The rose geranium scent is subtle and refreshing. A real 5 star product.,[],B0170FP8CC,B0170FP8CC,AE7XIIPNUJEEMT45BAXWXGOFMSKA,2016-04-24 14:12:44.000,3,1,2016,i admit i ordered this product because i loved the 34;mermaid34; part of the name and because the ingredients were ones i could recognize. i will order again because it works. my hairblonde by avedalooks great and does have more volume between complete washings. i am planning several trips this year and this captain blankenship product will be traveling with me for the quick lifts and for indepth pickmeups between shampooconditionings. the rose geranium scent is subtle and refreshing. a real 5 star product.
9338,3,"OK for a dry shampoo, but scent is overpowering","Smells like a grandma's powder room. I really can't get past how strong the fragrance is. Is it really necessary? Other than that, I like that it isn't an aerosol. I will probably just save this for when I go camping. I don't think I can tolerate the intense odor for even occasional use at home.",[],B0170FP8CC,B0170FP8CC,AFT65K7NEYPNCN6HT6TRPIEUUWQQ,2016-02-29 15:46:45.000,6,1,2016,"smells like a grandmas powder room. i really cant get past how strong the fragrance is. is it really necessary? other than that, i like that it isnt an aerosol. i will probably just save this for when i go camping. i dont think i can tolerate the intense odor for even occasional use at home."


In [21]:

# Show only the cleaned review text of the filtered rows.
filtered_df['cleaned_text']

2398           my skin is really sensitive.i also dont generally like allinone products. you should see my bathroom cabinets. trying to consolidate, but i get different rashes in different places, and no one thing works for everything.this is very gentle and did not make my skin or face or scalp break out or rash up at all. its not super moisturizing, but it wasnt drying, either. the scent is very light, and the cleanser has a little mintytingly kick. this would have worked for me in my teens or 20s when i couldnt wait to get out of the shower and on with my life.
3381                                                                                                                                            really happy with this concentrate. i did not expect it to work well but am very thankful to be wrong. it thickens to the normal consistency of conditioners after it cools. its very moisturizing for my thick curly hair. my only issue was the scent. very strong herbal scent. it was not