# Hybrid Filter Creation

**Preparation**

Libraries Import

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import ast
import re
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


Data Import

In [3]:
# load the dataset of PCA normalized embeddings: 
file_path = rf'.\..\data\embeddings_dim_reduction\df_user_normalized_PCA20250106_160122.csv' # adjust the file_path with the proper name of the file
df_user_normalized_PCA = pd.read_csv(file_path) 

In [4]:
df_user_normalized_PCA['cleaned_text']

0         this spray is really nice. it smells really go...
1         this product does what i need it to do, i just...
2                                 smells good, feels great!
3                                            felt synthetic
4                                                   love it
                                ...                        
692531       conditioner is great shampoo not as i expected
692532    did not work! used the whole bottle and my hai...
692533           product as expected. shipping was on time.
692534    not only is it a delicious fragrance, but also...
692535    the conditioner doesnt really make your hair t...
Name: cleaned_text, Length: 692536, dtype: object

In [5]:
df_user_sentiment = df_user_normalized_PCA[['product_ID','cleaned_text']]

In [6]:
df_user_sentiment

Unnamed: 0,product_ID,cleaned_text
0,B00YQ6X8EO,this spray is really nice. it smells really go...
1,B081TJ8YS3,"this product does what i need it to do, i just..."
2,B07PNNCSP9,"smells good, feels great!"
3,B09JS339BZ,felt synthetic
4,B08BZ63GMJ,love it
...,...,...
692531,B006YUIWKA,conditioner is great shampoo not as i expected
692532,B006YUIWKA,did not work! used the whole bottle and my hai...
692533,B06ZZV9MZT,product as expected. shipping was on time.
692534,B000HB6VLE,"not only is it a delicious fragrance, but also..."


Sentiment Analysis

In [7]:
# Step 1: Install necessary libraries (if not already installed)
!pip install textblob

from textblob import TextBlob

# Step 2: Define a function for sentiment analysis
def analyze_sentiment(text):
    # Create a TextBlob object
    blob = TextBlob(text)
    
    # Get the polarity of the text
    polarity = blob.sentiment.polarity
    return polarity

# Step 3: Apply sentiment analysis to the 'cleaned_text' column
df_user_sentiment[['sentiment_polarity']] = df_user_sentiment['cleaned_text'].apply(analyze_sentiment).apply(pd.Series)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_sentiment[['sentiment_polarity']] = df_user_sentiment['cleaned_text'].apply(analyze_sentiment).apply(pd.Series)


In [8]:
df_user_sentiment

Unnamed: 0,product_ID,cleaned_text,sentiment_polarity
0,B00YQ6X8EO,this spray is really nice. it smells really go...,0.390104
1,B081TJ8YS3,"this product does what i need it to do, i just...",0.250000
2,B07PNNCSP9,"smells good, feels great!",0.850000
3,B09JS339BZ,felt synthetic,0.000000
4,B08BZ63GMJ,love it,0.500000
...,...,...,...
692531,B006YUIWKA,conditioner is great shampoo not as i expected,0.350000
692532,B006YUIWKA,did not work! used the whole bottle and my hai...,-0.083333
692533,B06ZZV9MZT,product as expected. shipping was on time.,-0.100000
692534,B000HB6VLE,"not only is it a delicious fragrance, but also...",0.531250


In [9]:
df_user_sentiment_agg = df_user_sentiment.groupby('product_ID')['sentiment_polarity'].mean().reset_index()

In [10]:
df_user_sentiment_agg

Unnamed: 0,product_ID,sentiment_polarity
0,0005946468,0.800000
1,0123034892,0.700000
2,0124784577,0.296296
3,0515059560,0.052083
4,0615675026,-0.099841
...,...,...
115571,B0CBMFK1S2,0.316667
115572,B0CBWDTY41,0.469333
115573,B0CBXM7WHY,0.490000
115574,B0CCPDTRK7,0.249306


In [11]:
df_user_positive_sentiment_agg = df_user_sentiment_agg[df_user_sentiment_agg['sentiment_polarity'] > 0]

In [12]:
df_user_positive_sentiment_agg

Unnamed: 0,product_ID,sentiment_polarity
0,0005946468,0.800000
1,0123034892,0.700000
2,0124784577,0.296296
3,0515059560,0.052083
5,0692508988,0.398333
...,...,...
115571,B0CBMFK1S2,0.316667
115572,B0CBWDTY41,0.469333
115573,B0CBXM7WHY,0.490000
115574,B0CCPDTRK7,0.249306


In [13]:
df_user_positive_sentiment_agg[df_user_positive_sentiment_agg['product_ID']=='B085TBXF1Z']

Unnamed: 0,product_ID,sentiment_polarity
79531,B085TBXF1Z,0.20487


In [14]:
df_user_normalized_PCA['user_ID'].value_counts()

user_ID
AG73BVBKUOH22USSFJA5ZWL7AKXA      165
AEZP6Z2C5AVQDZAJECQYZWQRNG3Q      146
AEMP3A7IKW37CMWFXNKXWW6HGJHA_1    115
AGZUJTI7A3JFKB4FP5JOH6NVAJIQ_1     87
AFDYIK3FNPY2JFBQYUWC6GSBMIRQ_2     83
                                 ... 
AFSMCZTEUW3TI2BSPE25BD5GKXLA        1
AGKNUO4XOIPCSIKDRHO56UQDPXVQ        1
AEJQRDONU2O5LSOD5OC77XO43DFA        1
AFFFHL7GG5FLD2TSUGU65HTN6FMA        1
AGIYQU6RK6TBKBCMWKVPBPBMMJNA        1
Name: count, Length: 630476, dtype: int64

In [15]:
df_user_normalized_PCA.shape

(692536, 312)

In [16]:

# Verify the number of unique users:
unique_values_user = df_user_normalized_PCA.user_ID.nunique()
unique_values_user

630476

In [17]:
unique_combinations = df_user_normalized_PCA[['user_ID', 'product_ID']].value_counts()
unique_combinations

user_ID                       product_ID
AGWOOXMW2IXPKZOWAIWNMCXY7LBQ  B09NS1VG4L    2
AE222BBOVZIF42YOOPNBXL4UUMYA  B013HR1A92    1
AGPGHQIMPLOJD3FR3ODRDJFYSJBQ  B079D87KKM    1
AGPGGDJBP4W2D3QJ2WN3NWHSPA7Q  B08791HQXG    1
AGPGGF3KFAOMNATUGFSZEMRJ6PVQ  B07TXYVLPS    1
                                           ..
AFENAWCNZDSJANL43HMAQDOIN5QQ  B07D33K512    1
AFENAYIMKNX6PGBHATFCTZS2SAAQ  B008QSM704    1
AFENB2HA5MVZWNKRICDWRXR5PCDA  B001E76F6G    1
AFENBUWI2IGQ5ZBTH4XE36QRIDLA  B07FGFWKXM    1
AHZZZSOTVOVACVK2WWXL4ITEAPIA  B00R1TAN7I    1
Name: count, Length: 692535, dtype: int64

Aggregate User Vectors

In [18]:
# Select relevant columns: user_ID and embedding dimensions
vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_user_vectors = df_user_normalized_PCA.groupby('user_ID')[vector_cols].mean().reset_index()

# Rename the columns of vectors to indicate these are user vectors:
df_user_vectors.columns = ['user_ID'] + [f'user_vec_{i}' for i in range(300)]

df_user_vectors.shape

(630476, 301)

In [19]:
df_user_vectors.head()

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,0.157177,-0.047737,-0.016334,-0.216434,-0.053552,-0.098437,-0.042217,0.004862,0.124219,...,-0.008544,-0.009045,-0.012643,-0.013444,-0.008015,-0.011304,0.005638,0.012653,-0.000733,0.017797
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,0.04597,-0.018998,0.164447,-0.194383,0.170237,0.089281,-0.069243,0.064173,-0.06123,...,0.00081,0.007626,0.023576,0.005984,-0.014254,0.000982,0.005801,0.001267,0.011145,-0.001793
2,AE222X475JC6ONXMIKZDFGQ7IAUA,0.410081,0.025667,-0.12193,0.184316,0.0051,-0.151829,0.046592,0.091186,0.147147,...,0.029371,0.001799,-0.001441,-0.010866,-0.001765,0.00542,-0.010488,-0.011075,0.004787,-0.000156
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,-0.204608,-0.027967,-0.074068,0.101387,0.089039,-0.18417,-0.045104,-0.112713,0.019092,...,0.012088,-0.009859,0.001905,-0.002008,0.00282,-0.000749,-0.011317,-0.00327,-0.009697,0.011767
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,0.306584,-0.059949,0.359815,-0.206749,0.43388,-0.091246,0.383815,0.030456,-0.088499,...,0.004287,0.01419,-0.023456,-0.011924,-0.009559,0.007444,0.031338,-0.011538,0.011149,-0.004757


In [20]:
# Save the new DataFrame with user vectors

file_path_user_vec = rf'.\..\data\text_analysis\user_vectors\user_vectors_merged.csv'
df_user_vectors.to_csv(file_path_user_vec, index=False)

KeyboardInterrupt: 

Aggregate Product Vectors

In [21]:
#product_vector_cols = [f'dim_norm_PCA{i}' for i in range(300)]
df_product_vectors = df_user_normalized_PCA.groupby('product_ID')[vector_cols].mean().reset_index()

df_product_vectors.columns = ['product_ID'] + [f'product_vec_{i}' for i in range(300)]
df_product_vectors.shape

(115576, 301)

In [22]:
df_product_vectors_positive_sentiment = df_product_vectors[df_product_vectors['product_ID'].isin(df_user_positive_sentiment_agg['product_ID'])]

In [23]:
df_product_vectors_positive_sentiment

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,0005946468,0.763349,0.091675,0.117928,0.099401,0.132281,0.070900,-0.223043,-0.012843,-0.009337,...,-0.007858,0.004325,-0.002059,-0.002856,0.003310,0.000536,-0.002052,-0.006457,-0.004983,0.003248
1,0123034892,0.578087,-0.060032,-0.136238,-0.096158,-0.007847,-0.121033,-0.108942,0.071681,0.014017,...,-0.013452,0.005248,-0.005749,0.006901,-0.002569,-0.009791,-0.005985,-0.000418,0.004766,0.002285
2,0124784577,0.055595,-0.048237,0.045403,0.001712,0.206265,0.038459,0.181421,0.045733,-0.041820,...,-0.007309,0.007504,0.002950,-0.002596,-0.005377,-0.000655,-0.001878,0.002130,0.005408,0.005242
3,0515059560,-0.141502,0.079938,0.102094,-0.057478,-0.011861,-0.127184,-0.016523,-0.101494,-0.038372,...,-0.006221,-0.008810,0.026192,0.000421,0.000422,-0.016361,0.015942,0.017142,0.004057,-0.015050
5,0692508988,-0.133792,0.152482,0.033673,-0.082416,0.033410,-0.223736,-0.043213,-0.034826,-0.006686,...,0.005252,0.001003,0.000599,-0.006615,-0.009176,-0.006841,-0.010335,-0.002790,0.000446,0.016137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115571,B0CBMFK1S2,-0.149024,-0.046643,-0.168177,0.006320,0.041629,-0.095680,0.010731,0.121839,-0.117584,...,-0.008954,-0.007799,0.007354,-0.009253,0.017589,0.001140,0.005645,0.016663,-0.000791,-0.000650
115572,B0CBWDTY41,-0.100371,0.046079,-0.234349,0.136378,0.072183,-0.078564,0.072506,0.096607,0.007511,...,0.008602,-0.001902,0.006263,0.001563,0.003499,-0.000318,-0.002803,0.003374,0.003358,0.000822
115573,B0CBXM7WHY,-0.128249,-0.042449,0.129125,-0.057023,0.020782,0.061780,-0.092159,0.122308,0.098822,...,0.002010,-0.000163,-0.003222,-0.008313,0.009361,0.004195,0.004799,0.010006,-0.015273,0.001356
115574,B0CCPDTRK7,-0.222002,0.092505,-0.143339,0.069648,0.143655,0.148648,0.053853,-0.133939,0.024819,...,0.008545,0.005117,-0.009365,0.011635,0.018745,0.007743,-0.004170,0.000142,-0.004274,0.000070


In [44]:
# Save the new DataFrame with product vectors

file_path_product_vec = rf'.\..\data\text_analysis\product_vectors\product_vectors_merged_sentiment.csv'
df_product_vectors_positive_sentiment.to_csv(file_path_product_vec, index=False)


Semantic Analysis

In [24]:
user_item_df = df_user_normalized_PCA.groupby('user_ID')['product_ID'].apply(list).reset_index()
user_item_df

Unnamed: 0,user_ID,product_ID
0,AE222BBOVZIF42YOOPNBXL4UUMYA,[B013HR1A92]
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,[B0BTT658PQ]
2,AE222X475JC6ONXMIKZDFGQ7IAUA,[B00PBDMRES]
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,[B00012FPSO]
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,[B07QNPXBLH]
...,...,...
630471,AHZZYVEU6QFMPFZ2HJUWR22SNK4A,[B07JJ9NFFH]
630472,AHZZZAK24AJ3JNBDUZJGHHWSRVAA,[B00KXFD75M]
630473,AHZZZJP24QUSB5XWW6MAXYBZZZSQ,[B00IG0677G]
630474,AHZZZL7YQJA3RSA6PYK3WMFACYIQ,[B091TKH1JF]


In [25]:
# Extract user vectors
user_ids = df_user_vectors['user_ID'].values
user_vectors = df_user_vectors.iloc[:, 1:].values

# Extract product vectors
product_ids = df_product_vectors_positive_sentiment['product_ID'].values
product_vectors = df_product_vectors_positive_sentiment.iloc[:, 1:].values

In [26]:
print(f"Shape of User Vectors: {user_vectors.shape}")
print(f"Shape of Product Vectors: {product_vectors.shape}")

Shape of User Vectors: (630476, 300)
Shape of Product Vectors: (96545, 300)


In [48]:
# Define the number of closest product vectors to compute
top_n_products = 300 # find the top 300 closest products for each user instead of computing all pairwise similarities.
# Fit the NearestNeighbors model on the product vectors
nbrs = NearestNeighbors(n_neighbors=top_n_products, metric='cosine').fit(product_vectors)

# Retrieve the top-N most similar products for each user
distances, indices = nbrs.kneighbors(user_vectors)

# `distances` contains the cosine distances (1 - similarity)
# Convert distances to similarity scores
similarity_scores = 1 - distances

KeyboardInterrupt: 

In [40]:
# Save the indices and similarity scores
np.savez(r'.\..\data\cos_similarity\recommendations_300_merged_sentiment.npz', indices=indices, similarity_scores=similarity_scores)

**1. Content Based Filter**

In [27]:
# Load an array with generated indices and similarity scores:
data = np.load(r'.\..\data\cos_similarity\recommendations_300_merged_sentiment.npz')
indices = data["indices"]
similarity_scores = data["similarity_scores"]

In [28]:
# Load the DataFrame with user vectors:
file_path_user_vec = rf'.\..\data\text_analysis\user_vectors\user_vectors_merged.csv'
df_user_vectors = pd.read_csv(file_path_user_vec)
df_user_vectors

Unnamed: 0,user_ID,user_vec_0,user_vec_1,user_vec_2,user_vec_3,user_vec_4,user_vec_5,user_vec_6,user_vec_7,user_vec_8,...,user_vec_290,user_vec_291,user_vec_292,user_vec_293,user_vec_294,user_vec_295,user_vec_296,user_vec_297,user_vec_298,user_vec_299
0,AE222BBOVZIF42YOOPNBXL4UUMYA,0.157177,-0.047737,-0.016334,-0.216434,-0.053552,-0.098437,-0.042217,0.004862,0.124219,...,-0.008544,-0.009045,-0.012643,-0.013444,-0.008015,-0.011304,0.005638,0.012653,-0.000733,0.017797
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,0.045970,-0.018998,0.164447,-0.194383,0.170237,0.089281,-0.069243,0.064173,-0.061230,...,0.000810,0.007626,0.023576,0.005984,-0.014254,0.000982,0.005801,0.001267,0.011145,-0.001793
2,AE222X475JC6ONXMIKZDFGQ7IAUA,0.410081,0.025667,-0.121930,0.184316,0.005100,-0.151829,0.046592,0.091186,0.147147,...,0.029371,0.001799,-0.001441,-0.010866,-0.001765,0.005420,-0.010488,-0.011075,0.004787,-0.000156
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,-0.204608,-0.027967,-0.074068,0.101387,0.089039,-0.184170,-0.045104,-0.112713,0.019092,...,0.012088,-0.009859,0.001905,-0.002008,0.002820,-0.000749,-0.011317,-0.003270,-0.009697,0.011767
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,0.306584,-0.059949,0.359815,-0.206749,0.433880,-0.091246,0.383815,0.030456,-0.088499,...,0.004287,0.014190,-0.023456,-0.011924,-0.009559,0.007444,0.031338,-0.011538,0.011149,-0.004757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452511,AGVZJBSQ4OT2X3MWLXY5PIXLJ3AA,-0.147532,0.059837,0.039493,-0.279241,-0.122369,-0.005425,0.112958,-0.025145,0.097714,...,0.004796,-0.016647,-0.008902,-0.011176,-0.000242,0.000345,0.003873,0.001957,-0.011616,0.005859
452512,AGVZJOVLW7V2KCG4EYSC5WPFBM7A,0.033293,-0.157630,0.038096,0.186306,-0.118854,-0.077394,0.020711,0.018362,-0.122144,...,0.003417,0.024646,0.000166,-0.000784,0.012555,-0.009504,0.000849,0.015328,-0.006696,0.018807
452513,AGVZJTOO3QVIMJJR55ZRUAS7K7JQ,-0.111310,0.071164,-0.081447,-0.087067,0.006885,0.040921,0.074522,-0.029190,-0.012783,...,0.003594,0.024861,-0.001709,0.004762,0.013981,-0.009622,-0.024978,-0.016635,0.001439,0.003907
452514,AGVZJUDNCUKE5AAKR6S6S5TPDEUQ,-0.201483,-0.070276,-0.073900,0.096959,0.127813,-0.127007,-0.057464,0.211536,-0.022433,...,-0.009823,-0.003091,0.004529,0.002668,-0.012356,0.006953,-0.006037,0.009110,-0.000501,-0.003012


In [29]:
# Load the DataFrame with product vectors:
file_path_product_vec = rf'.\..\data\text_analysis\product_vectors\product_vectors_merged_sentiment.csv'
df_product_vectors = pd.read_csv(file_path_product_vec)
df_product_vectors

Unnamed: 0,product_ID,product_vec_0,product_vec_1,product_vec_2,product_vec_3,product_vec_4,product_vec_5,product_vec_6,product_vec_7,product_vec_8,...,product_vec_290,product_vec_291,product_vec_292,product_vec_293,product_vec_294,product_vec_295,product_vec_296,product_vec_297,product_vec_298,product_vec_299
0,0005946468,0.763349,0.091675,0.117928,0.099401,0.132281,0.070900,-0.223043,-0.012843,-0.009337,...,-0.007858,0.004325,-0.002059,-0.002856,0.003310,0.000536,-0.002052,-0.006457,-0.004983,0.003248
1,0123034892,0.578087,-0.060032,-0.136238,-0.096158,-0.007847,-0.121033,-0.108942,0.071681,0.014017,...,-0.013452,0.005248,-0.005749,0.006901,-0.002569,-0.009791,-0.005985,-0.000418,0.004766,0.002285
2,0124784577,0.055595,-0.048237,0.045403,0.001712,0.206265,0.038459,0.181421,0.045733,-0.041820,...,-0.007309,0.007504,0.002950,-0.002596,-0.005377,-0.000655,-0.001878,0.002130,0.005408,0.005242
3,0515059560,-0.141502,0.079938,0.102094,-0.057478,-0.011861,-0.127184,-0.016523,-0.101494,-0.038372,...,-0.006221,-0.008810,0.026192,0.000421,0.000422,-0.016361,0.015942,0.017142,0.004057,-0.015050
4,0692508988,-0.133792,0.152482,0.033673,-0.082416,0.033410,-0.223736,-0.043213,-0.034826,-0.006686,...,0.005252,0.001003,0.000599,-0.006615,-0.009176,-0.006841,-0.010335,-0.002790,0.000446,0.016137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96540,B0CBMFK1S2,-0.149024,-0.046643,-0.168177,0.006320,0.041629,-0.095680,0.010731,0.121839,-0.117584,...,-0.008954,-0.007799,0.007354,-0.009253,0.017589,0.001140,0.005645,0.016663,-0.000791,-0.000650
96541,B0CBWDTY41,-0.100371,0.046079,-0.234349,0.136378,0.072183,-0.078564,0.072506,0.096607,0.007511,...,0.008602,-0.001902,0.006263,0.001563,0.003499,-0.000318,-0.002803,0.003374,0.003358,0.000822
96542,B0CBXM7WHY,-0.128249,-0.042449,0.129125,-0.057023,0.020782,0.061780,-0.092159,0.122308,0.098822,...,0.002010,-0.000163,-0.003222,-0.008313,0.009361,0.004195,0.004799,0.010006,-0.015273,0.001356
96543,B0CCPDTRK7,-0.222002,0.092505,-0.143339,0.069648,0.143655,0.148648,0.053853,-0.133939,0.024819,...,0.008545,0.005117,-0.009365,0.011635,0.018745,0.007743,-0.004170,0.000142,-0.004274,0.000070


In [30]:
# load the dataset of PCA normalized embeddings: 
file_path = rf'.\..\data\embeddings_dim_reduction\df_user_normalized_PCA20250106_160122.csv' # adjust the filepath with the proper file 
df_user_normalized_PCA = pd.read_csv(file_path) 

In [31]:
# Upload the user DataFrame and analyse the recommendations
file_path = r'./data/merged_user_meta_df.csv'  # Update this with your file path
df_user = pd.read_csv(file_path)

In [32]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693941 entries, 0 to 693940
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   rating                      693941 non-null  int64 
 1   review_title                693941 non-null  object
 2   text_review                 693941 non-null  object
 3   user_images                 693941 non-null  object
 4   product_ID                  693941 non-null  object
 5   parent_ID                   693941 non-null  object
 6   user_ID                     693941 non-null  object
 7   timestamp                   693941 non-null  object
 8   helpful_review_vote         693941 non-null  int64 
 9   user_purchase_verification  693941 non-null  int64 
 10  year                        693941 non-null  int64 
 11  cleaned_text                692536 non-null  object
 12  parent_asin                 693941 non-null  object
 13  cleaned_title               6

Run recommendations for the specific user_id

In [33]:
user_item_df = df_user_normalized_PCA.groupby('user_ID')['product_ID'].apply(list).reset_index()
user_item_df

Unnamed: 0,user_ID,product_ID
0,AE222BBOVZIF42YOOPNBXL4UUMYA,[B013HR1A92]
1,AE222FP7YRNFCEQ2W3ZDIGMSYTLQ,[B0BTT658PQ]
2,AE222X475JC6ONXMIKZDFGQ7IAUA,[B00PBDMRES]
3,AE222Y4WTST6BUZ4J5Y2H6QMBITQ,[B00012FPSO]
4,AE2232TEZOEWQLAFEX2NA6VBGMYQ,[B07QNPXBLH]
...,...,...
630471,AHZZYVEU6QFMPFZ2HJUWR22SNK4A,[B07JJ9NFFH]
630472,AHZZZAK24AJ3JNBDUZJGHHWSRVAA,[B00KXFD75M]
630473,AHZZZJP24QUSB5XWW6MAXYBZZZSQ,[B00IG0677G]
630474,AHZZZL7YQJA3RSA6PYK3WMFACYIQ,[B091TKH1JF]


In [65]:
df_user_normalized_PCA['user_ID'].value_counts()

user_ID
AG73BVBKUOH22USSFJA5ZWL7AKXA      165
AEZP6Z2C5AVQDZAJECQYZWQRNG3Q      146
AEMP3A7IKW37CMWFXNKXWW6HGJHA_1    115
AGZUJTI7A3JFKB4FP5JOH6NVAJIQ_1     87
AFDYIK3FNPY2JFBQYUWC6GSBMIRQ_2     83
                                 ... 
AFSMCZTEUW3TI2BSPE25BD5GKXLA        1
AGKNUO4XOIPCSIKDRHO56UQDPXVQ        1
AEJQRDONU2O5LSOD5OC77XO43DFA        1
AFFFHL7GG5FLD2TSUGU65HTN6FMA        1
AGIYQU6RK6TBKBCMWKVPBPBMMJNA        1
Name: count, Length: 630476, dtype: int64

In [66]:
unique_combinations = df_user_normalized_PCA[['user_ID', 'product_ID']].value_counts()
unique_combinations

user_ID                       product_ID
AGWOOXMW2IXPKZOWAIWNMCXY7LBQ  B09NS1VG4L    2
AE222BBOVZIF42YOOPNBXL4UUMYA  B013HR1A92    1
AGPGHQIMPLOJD3FR3ODRDJFYSJBQ  B079D87KKM    1
AGPGGDJBP4W2D3QJ2WN3NWHSPA7Q  B08791HQXG    1
AGPGGF3KFAOMNATUGFSZEMRJ6PVQ  B07TXYVLPS    1
                                           ..
AFENAWCNZDSJANL43HMAQDOIN5QQ  B07D33K512    1
AFENAYIMKNX6PGBHATFCTZS2SAAQ  B008QSM704    1
AFENB2HA5MVZWNKRICDWRXR5PCDA  B001E76F6G    1
AFENBUWI2IGQ5ZBTH4XE36QRIDLA  B07FGFWKXM    1
AHZZZSOTVOVACVK2WWXL4ITEAPIA  B00R1TAN7I    1
Name: count, Length: 692535, dtype: int64

In [34]:
# Extract user vectors
user_ids = df_user_vectors['user_ID'].values
user_vectors = df_user_vectors.iloc[:, 1:].values

# Extract product vectors
product_ids = df_product_vectors['product_ID'].values
product_vectors = df_product_vectors.iloc[:, 1:].values

In [35]:
def recommend_top_n_products_by_user_id(user_id, user_ids, similarity_scores, indices, product_ids, user_item_df, top_n=10):
    """
    Recommend top N products for a given user based on precomputed top-N cosine similarity.

    Parameters:
    - user_id: User ID for whom to generate recommendations
    - user_ids: List of user IDs corresponding to rows in similarity data
    - similarity_scores: Precomputed top-N similarity scores (users x top-N products)
    - indices: Indices of the top-N products for each user
    - product_ids: List of product IDs corresponding to the product vectors
    - user_item_df: DataFrame containing user-product interactions
    - top_n: Number of top recommendations to return

    Returns:
    - List of (product_id, similarity_score) tuples
    """

    # Find the index of the user_ID
    if user_id not in user_ids:
        raise ValueError(f"User ID '{user_id}' not found in the user data.")
    
    user_index = np.where(user_ids == user_id)[0][0]  # Locate the index of user_id

    # Get top-N similarity scores and product indices for this user
    user_similarities = similarity_scores[user_index]
    user_product_indices = indices[user_index]

    # Retrieve the list of already purchased products for the user
    purchased_products = user_item_df.loc[user_item_df['user_ID'] == user_id, 'product_ID']
    if not purchased_products.empty:
        # Flatten and handle lists or non-hashable elements in purchased_products
        if purchased_products.apply(lambda x: isinstance(x, list)).any():
            purchased_products = set([item for sublist in purchased_products for item in sublist])
        else:
            purchased_products = set(purchased_products)
    else:
        purchased_products = set()

    # Filter and sort recommendations
    recommendations = []
    for i, product_index in enumerate(user_product_indices):
        product = product_ids[product_index]
        if product not in purchased_products:
            recommendations.append((product, user_similarities[i]))
        if len(recommendations) >= top_n:  # Stop when we have enough recommendations
            break
    
    return recommendations

# Example usage
user_id_input = "AGKHLEW2SOWHNMFQIJGBECAF7INQ"  # Replace with user_ID
top_n = 10

try:
    recommendations = recommend_top_n_products_by_user_id(
        user_id_input, 
        user_ids, 
        similarity_scores, 
        indices, 
        product_ids, 
        user_item_df, 
        top_n
    )

    print(f"Top-{top_n} Recommendations for User '{user_id_input}':")
    for product_id, score in recommendations:
        print(f"Product ID: {product_id}, Similarity Score: {score:.4f}")
except ValueError as e:
    print(e)

Top-10 Recommendations for User 'AGKHLEW2SOWHNMFQIJGBECAF7INQ':
Product ID: B0BQWTXV2Q, Similarity Score: 0.7368
Product ID: B0170FP8CC, Similarity Score: 0.7360
Product ID: B08JQS9FVP, Similarity Score: 0.7267
Product ID: B09TQ2SDKK, Similarity Score: 0.7249
Product ID: B085TBXF1Z, Similarity Score: 0.7223
Product ID: B07FQTCLNX, Similarity Score: 0.7199
Product ID: B08JF6G92D, Similarity Score: 0.7182
Product ID: B08KNVV18L, Similarity Score: 0.7164
Product ID: B017Y3S58Y, Similarity Score: 0.7163
Product ID: B01I40S6EE, Similarity Score: 0.7129


In [36]:
recommendations

[('B0BQWTXV2Q', 0.7367523443131535),
 ('B0170FP8CC', 0.7359656044201847),
 ('B08JQS9FVP', 0.7267167236243094),
 ('B09TQ2SDKK', 0.7249396030922018),
 ('B085TBXF1Z', 0.7222932872618272),
 ('B07FQTCLNX', 0.7199410621257987),
 ('B08JF6G92D', 0.7182443550142477),
 ('B08KNVV18L', 0.7164496570781012),
 ('B017Y3S58Y', 0.7163045323270015),
 ('B01I40S6EE', 0.7128544319799418)]

In [37]:
# Extract product IDs:
product_ids = [item[0] for item in recommendations]
product_ids

['B0BQWTXV2Q',
 'B0170FP8CC',
 'B08JQS9FVP',
 'B09TQ2SDKK',
 'B085TBXF1Z',
 'B07FQTCLNX',
 'B08JF6G92D',
 'B08KNVV18L',
 'B017Y3S58Y',
 'B01I40S6EE']

In [38]:
# Find users who ourchased more than 1 product to verify the recommendations for them:
duplicated_user_ids = df_user_normalized_PCA['user_ID'][df_user_normalized_PCA['user_ID'].duplicated()].unique()
duplicated_user_ids

array(['AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
       'AFSKPY37N3C43SOI5IEXEK5JSIYA', ...,
       'AFV3EYFZLLLBWIXWRZUSRJOHLNBA', 'AEVTGJFLW22HVSWOJLJCBJUN46WA',
       'AHURE3VT2MLCTARMYI7JA7KKDYAA'], dtype=object)

In [39]:
pd.set_option('display.max_colwidth', None)

In [40]:
single_user_check = df_user[df_user['user_ID']=='AGKHLEW2SOWHNMFQIJGBECAF7INQ']
single_user_check

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,parent_asin,cleaned_title
0,5,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,1,2020,"this spray is really nice. it smells really good, goes on really fine, and does the trick. i will say it feels like you need a lot of it though to get the texture i want. i have a lot of hair, medium thickness. i am comparing to other brands with yucky chemicals so im gonna stick with this. try it!",B00YQ6X8EO,herbivore natural sea mist texturizing salt spray coconut 8 oz
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was described but I was hoping it would be light)",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,1,2020,"this product does what i need it to do, i just wish it was odorless or had a soft coconut smell. having my head smell like an orange coffee is offputting. granted, i did know the smell was described but i was hoping it would be light",B081TJ8YS3,all natural vegan dry shampoo powder eco friendly root touch up hair powder volumizer for brown hair brunette and dark hair brun application brush two goats apothecary


In [41]:
# Analyse recommended products
filtered_df = df_user[df_user['product_ID'].isin(product_ids)]
filtered_df.head()

Unnamed: 0,rating,review_title,text_review,user_images,product_ID,parent_ID,user_ID,timestamp,helpful_review_vote,user_purchase_verification,year,cleaned_text,parent_asin,cleaned_title
781,5,"Softest hands ever, lightly scented w/ lemongrass oil, lemon oil, orange oil, rosemary","I love Leap bar soaps (LOVE!), and was curious about their liquid hand soap. It is a 12 ounce pump style bottle with visually pleasing label. The soap comes out thick and rich. It is super luxurious feeling- it lathers significantly all the while feeling like hand cream. It rinses well and leaves skin feeling soft. Probably the most soft. Perhaps a little coating of glycerin and aloe are left behind? Yet not a gross coating. I am trying to figure out the magic of my soft, smooth, clean feeling hands on a super cold (18*F), dry day. Super moisturizing.<br /><br />Oddly, I find it has barely any scent. I am used to their super scented bar soaps, and I was expecting a stronger scent to come through. I am not usually keen on perfume (causes headaches, nausea), but when a product uses essential oils for scenting (especially hand soap and house cleaners), I find it delightfully uplifting. This is labeled as lemongrass, but that isn't the only essential oil used. The profile is more complex with the addition of lemon oil, orange oil, and rosemary. I was really excited for this to have a robust citrus smell. I really couldn't tell you just by sniffing. It smells fresh and clean, but it isn't strong (and I was hoping/expecting strong). I see other reviewers mentioning a strong smell. I figured I got a weird bottle or I have covid. I actually went and grabbed my Leap bar soap to make sure I could still smell that. I can. Phew.<br /><br />I'm still going 5 stars because it does have a mild fresh scent, and the soap itself is glorious. Very rich, super moisturizing lather. And it rinses away perfectly. I dont feel like I am working to get the soap off. If it did, I wouldn't like that. So there is some sorcery involved here, lol. I kinda want it as a body wash. My hands feel SO GOOD after using this. I just keep touching my hands.<br /><br />One does not need a lot of soap, so feel free to just add a little to the hand. I would even go so far as to add a little water as it goes down. Or if you like this soap and have a foamer, then just a wee bit of this with a lot of water will do it.<br /><br />Love their designs on the bottles.<br /><br />Water, Organic Coconut Oil, Organic Olive Oil, Vegetable Glycerin, Organic Guar Gum, Lemongrass Oil, Orange Oil, Organic Jojoba Oil, Lemon Oil, Rosemary Extract, Organic Aloe Vera (Potassium Hydroxide is used to saponify the organic oils into soap, but none remains in the final product)",[],B08KNVV18L,B08KNVV18L,AF2BLE54TEMGZ546U763ZHZRXC4A,2020-12-18 17:42:00.919,0,0,2020,"i love leap bar soaps love!, and was curious about their liquid hand soap. it is a 12 ounce pump style bottle with visually pleasing label. the soap comes out thick and rich. it is super luxurious feeling it lathers significantly all the while feeling like hand cream. it rinses well and leaves skin feeling soft. probably the most soft. perhaps a little coating of glycerin and aloe are left behind? yet not a gross coating. i am trying to figure out the magic of my soft, smooth, clean feeling hands on a super cold 18f, dry day. super moisturizing.oddly, i find it has barely any scent. i am used to their super scented bar soaps, and i was expecting a stronger scent to come through. i am not usually keen on perfume causes headaches, nausea, but when a product uses essential oils for scenting especially hand soap and house cleaners, i find it delightfully uplifting. this is labeled as lemongrass, but that isnt the only essential oil used. the profile is more complex with the addition of lemon oil, orange oil, and rosemary. i was really excited for this to have a robust citrus smell. i really couldnt tell you just by sniffing. it smells fresh and clean, but it isnt strong and i was hopingexpecting strong. i see other reviewers mentioning a strong smell. i figured i got a weird bottle or i have covid. i actually went and grabbed my leap bar soap to make sure i could still smell that. i can. phew.im still going 5 stars because it does have a mild fresh scent, and the soap itself is glorious. very rich, super moisturizing lather. and it rinses away perfectly. i dont feel like i am working to get the soap off. if it did, i wouldnt like that. so there is some sorcery involved here, lol. i kinda want it as a body wash. my hands feel so good after using this. i just keep touching my hands.one does not need a lot of soap, so feel free to just add a little to the hand. i would even go so far as to add a little water as it goes down. or if you like this soap and have a foamer, then just a wee bit of this with a lot of water will do it.love their designs on the bottles.water, organic coconut oil, organic olive oil, vegetable glycerin, organic guar gum, lemongrass oil, orange oil, organic jojoba oil, lemon oil, rosemary extract, organic aloe vera potassium hydroxide is used to saponify the organic oils into soap, but none remains in the final product",B08KNVV18L,leap lemongrass hand soap natural moisturizing lather gentle rinses clean made with organic oils cruelty free vegan liquid hand wash superbly designed 12 fl oz
1080,3,Made my face feel greasy,"I really wanted to like this facial serum. It comes in a little dropper bottle that is common for this kind of product. The serum is a prickly pear oil that is supposed to be very good for your face.<br /><br />When you use it you will first notice the fragrance of the oil. I am not sure if maybe my bottle went bad, but mine just smelled very bad, almost like oil that went rancid. I really didn’t like it at all and it makes it difficult using this product especially since that smell lingered for some time on top of it.<br /><br />The oil is on the lighter side, and even so it took a while to absorb and my skin felt greasy for some time. I would not recommend using this during the day. Especially when wearing a mask, using a facial oil that leaves an oily residue makes my face very hot and sweaty, just awful and this one would do exactly that. Sadly this one is just average at best and I will not be re-purchasing when I run out.",[],B07FQTCLNX,B07FQTCLNX,AEZP6Z2C5AVQDZAJECQYZWQRNG3Q,2020-12-13 15:36:44.502,0,0,2020,"i really wanted to like this facial serum. it comes in a little dropper bottle that is common for this kind of product. the serum is a prickly pear oil that is supposed to be very good for your face.when you use it you will first notice the fragrance of the oil. i am not sure if maybe my bottle went bad, but mine just smelled very bad, almost like oil that went rancid. i really didnt like it at all and it makes it difficult using this product especially since that smell lingered for some time on top of it.the oil is on the lighter side, and even so it took a while to absorb and my skin felt greasy for some time. i would not recommend using this during the day. especially when wearing a mask, using a facial oil that leaves an oily residue makes my face very hot and sweaty, just awful and this one would do exactly that. sadly this one is just average at best and i will not be repurchasing when i run out.",B07FQTCLNX,naissance certified organic prickly pear cactus oil barbary fig seed oil 1 fl oz pure natural cold pressed vegan hexane free non gmo
1794,5,This oil is thicker feeling than most face oils,"When I first put it on my face, it felt so thick and I started thinking this was going to be way to thick and heavy feeling But it is absorbing into my skin quite well. The product is unscented but the natural scent of the oil kind of smells like old olive oil. But that smell goes away too.",[],B07FQTCLNX,B07FQTCLNX,AG73BVBKUOH22USSFJA5ZWL7AKXA,2020-12-10 17:30:54.289,0,0,2020,"when i first put it on my face, it felt so thick and i started thinking this was going to be way to thick and heavy feeling but it is absorbing into my skin quite well. the product is unscented but the natural scent of the oil kind of smells like old olive oil. but that smell goes away too.",B07FQTCLNX,naissance certified organic prickly pear cactus oil barbary fig seed oil 1 fl oz pure natural cold pressed vegan hexane free non gmo
2398,4,Super Gentle,"My skin is really sensitive.<br /><br />I also don't generally like all-in-one products. You should see my bathroom cabinets. Trying to consolidate, but I get different rashes in different places, and no one thing works for everything.<br /><br />This is very gentle and did not make my skin or face or scalp break out or rash up at all. It's not super moisturizing, but it wasn't drying, either. The scent is very light, and the cleanser has a little minty/tingly kick. This would have worked for me in my teens or 20s when I couldn't wait to get out of the shower and on with my life.",[],B0BQWTXV2Q,B0BQWTXV2Q,AENH6LSB6BM7XLPEYUL43WBOD6JA,2023-02-06 18:45:25.633,0,0,2023,"my skin is really sensitive.i also dont generally like allinone products. you should see my bathroom cabinets. trying to consolidate, but i get different rashes in different places, and no one thing works for everything.this is very gentle and did not make my skin or face or scalp break out or rash up at all. its not super moisturizing, but it wasnt drying, either. the scent is very light, and the cleanser has a little mintytingly kick. this would have worked for me in my teens or 20s when i couldnt wait to get out of the shower and on with my life.",B0BQWTXV2Q,hanz de fuko everything wash for hair face body allinone soap shampoo for men women kids safe for sensitive skin minty rum sandalwood 8 oz bottle
2909,5,Amazing for your face,"I like to layer this on top of my moisturizer if my face feels particularly dry. This sinks in and goes to work keeping your face hydrated and not tight. There is sort of an odd scent to it, but it isn't overpowering. I have heard great things about prickly pear oils and this definitely lived up to my expectations.",[],B07FQTCLNX,B07FQTCLNX,AHT6AM6BNIZUHFJB5V2M6XM72G7Q,2020-11-28 01:06:31.619,0,0,2020,"i like to layer this on top of my moisturizer if my face feels particularly dry. this sinks in and goes to work keeping your face hydrated and not tight. there is sort of an odd scent to it, but it isnt overpowering. i have heard great things about prickly pear oils and this definitely lived up to my expectations.",B07FQTCLNX,naissance certified organic prickly pear cactus oil barbary fig seed oil 1 fl oz pure natural cold pressed vegan hexane free non gmo


In [42]:
# Display the cleaned_text column
filtered_df.cleaned_text

781       i love leap bar soaps love!, and was curious about their liquid hand soap. it is a 12 ounce pump style bottle with visually pleasing label. the soap comes out thick and rich. it is super luxurious feeling it lathers significantly all the while feeling like hand cream. it rinses well and leaves skin feeling soft. probably the most soft. perhaps a little coating of glycerin and aloe are left behind? yet not a gross coating. i am trying to figure out the magic of my soft, smooth, clean feeling hands on a super cold 18f, dry day. super moisturizing.oddly, i find it has barely any scent. i am used to their super scented bar soaps, and i was expecting a stronger scent to come through. i am not usually keen on perfume causes headaches, nausea, but when a product uses essential oils for scenting especially hand soap and house cleaners, i find it delightfully uplifting. this is labeled as lemongrass, but that isnt the only essential oil used. the profile is more complex with the addit

**2. Collaborative Filter**

In [43]:
# Install any missing libraries
!pip install pandas numpy scikit-learn scipy matplotlib



In [44]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

# Load Dataset
file_path = r'.\..\data\data_clean\user_clean_data_ecommerce.csv'
data = pd.read_csv(file_path)

# Preprocessing: Select rows 0 to 299
data = data[['user_ID', 'product_ID', 'rating']].dropna()
data_sample = data.iloc[:300]  # Select the first 300 rows (indices 0 to 299)

# User-Item Matrix
user_item_matrix = data_sample.pivot_table(index='user_ID', columns='product_ID', values='rating')
user_item_matrix = user_item_matrix.fillna(0)

# Sparse Representation for Efficiency
sparse_matrix = csr_matrix(user_item_matrix.values)

# Get the number of features (columns in the matrix)
n_features = user_item_matrix.shape[1]

# Normalize Ratings by Subtracting Mean (Centered Matrix)
mean_user_rating = np.mean(user_item_matrix, axis=1).values.reshape(-1, 1)
user_item_matrix_normalized = user_item_matrix - mean_user_rating

# SVD with Grid Search for Optimal Components
best_rmse = float('inf')
best_components = 0

# Adjust n_components to not exceed n_features
for n in [50, 100, 150, 200, n_features]:
    if n > n_features:  # Skip invalid configurations
        continue
    svd = TruncatedSVD(n_components=n, random_state=42)
    decomposed_matrix = svd.fit_transform(sparse_matrix)
    reconstructed_matrix = np.dot(decomposed_matrix, svd.components_)
    rmse = np.sqrt(mean_squared_error(sparse_matrix.toarray(), reconstructed_matrix))
    print(f"n_components={n}, RMSE={rmse}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_components = n

print(f"Best n_components={best_components} with RMSE={best_rmse}")

# Final Model with Optimal Components
svd_final = TruncatedSVD(n_components=best_components, random_state=42)
decomposed_matrix_final = svd_final.fit_transform(sparse_matrix)
reconstructed_matrix_final = np.dot(decomposed_matrix_final, svd_final.components_)

# Evaluate RMSE
rmse_final = np.sqrt(mean_squared_error(sparse_matrix.toarray(), reconstructed_matrix_final))
print(f"Final RMSE: {rmse_final}")


n_components=50, RMSE=0.15980843821005355
n_components=100, RMSE=0.00819451937152662
n_components=150, RMSE=1.1218115963314147e-15
n_components=200, RMSE=1.3175092213891877e-15
n_components=292, RMSE=1.2975310172513988e-15
Best n_components=150 with RMSE=1.1218115963314147e-15
Final RMSE: 1.1218115963314147e-15


In [45]:
# Example mappings
user_ids = data_sample['user_ID'].unique()  # Get unique user IDs from the dataset
actual_item_ids = data_sample['product_ID'].unique()  # Get unique product IDs from the dataset

# Map fixed range (0-299) to actual product IDs
item_index_to_actual_id = {index: actual_id for index, actual_id in enumerate(actual_item_ids)}

# Example user index for whom we want recommendations
user_index = 0  # Index in your user_ids list
actual_user_id = user_ids[user_index]  # Map index to actual user ID

# Simulated predicted matrix: Rows = Users, Columns = Items
# Replace this with the actual prediction matrix from your model
predicted_matrix = np.random.rand(len(user_ids), len(actual_item_ids))

# Get the user's prediction scores
user_predictions = predicted_matrix[user_index]

# Sort items by predicted score (lowest first)
recommended_items_indices = np.argsort(user_predictions)[:10]  # Top 10 item indices
recommendations = [
    f"{rank}. User {actual_user_id} - Recommendation Item {item_index_to_actual_id[item_idx]} (Index: {item_idx}, Score: {user_predictions[item_idx]:.4f})"
    for rank, item_idx in enumerate(recommended_items_indices, 1)
]

# Display recommendations
print(f"Top 10 Recommendations for User {actual_user_id} (with Predicted Scores):")
for rec in recommendations:
    print(rec)

Top 10 Recommendations for User AGKHLEW2SOWHNMFQIJGBECAF7INQ (with Predicted Scores):
1. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B07FP2C8N8 (Index: 269, Score: 0.0044)
2. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B08QRSNJW9 (Index: 219, Score: 0.0118)
3. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B095SC4J8T (Index: 56, Score: 0.0118)
4. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B096LJJ4K6 (Index: 276, Score: 0.0135)
5. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B001281404 (Index: 279, Score: 0.0149)
6. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B086QY6T7N (Index: 9, Score: 0.0238)
7. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B00R8LI7ZO (Index: 275, Score: 0.0312)
8. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B00EIL38WO (Index: 124, Score: 0.0332)
9. User AGKHLEW2SOWHNMFQIJGBECAF7INQ - Recommendation Item B0841S3FRB (Index: 201, Score: 0.0335)
10. User AGKHLEW2SOWHNMFQIJGBECAF7I

**3. Hybrid**

Load and Prepare Data for Hybrid

In [46]:
import pandas as pd
import numpy as np

# Collaborative filter data
collaborative_data_path = r'.\..\data\data_clean\user_clean_data_ecommerce.csv'
collaborative_data = pd.read_csv(collaborative_data_path)
collaborative_data = collaborative_data[['user_ID', 'product_ID', 'rating']].dropna()

# Content-based filter data
content_data_path = r'.\..\data\text_analysis\product_vectors\product_vectors_merged_sentiment.csv'
df_product_vectors_positive_sentiment = pd.read_csv(content_data_path)

Collaborative Filter - Matrix Decomposition (SVD)

In [47]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

# Collaborative filter - User-Item matrix
user_item_matrix = collaborative_data.pivot_table(index='user_ID', columns='product_ID', values='rating').fillna(0)
sparse_matrix = csr_matrix(user_item_matrix.values)

# Normalize Ratings
mean_user_rating = np.mean(user_item_matrix, axis=1).values.reshape(-1, 1)
user_item_matrix_normalized = user_item_matrix - mean_user_rating

# SVD for Collaborative Filtering
best_rmse = float('inf')
best_components = 0

for n in [50, 100, 150, 200, user_item_matrix.shape[1]]:
    svd = TruncatedSVD(n_components=n, random_state=42)
    decomposed_matrix = svd.fit_transform(sparse_matrix)
    reconstructed_matrix = np.dot(decomposed_matrix, svd.components_)
    rmse = np.sqrt(mean_squared_error(sparse_matrix.toarray(), reconstructed_matrix))
    if rmse < best_rmse:
        best_rmse = rmse
        best_components = n

svd_final = TruncatedSVD(n_components=best_components, random_state=42)
decomposed_matrix_final = svd_final.fit_transform(sparse_matrix)
reconstructed_matrix_final = np.dot(decomposed_matrix_final, svd_final.components_)
collaborative_scores = decomposed_matrix_final



  user_item_matrix = collaborative_data.pivot_table(index='user_ID', columns='product_ID', values='rating').fillna(0)


: 

Content Based Filter - Cosine Similarity

In [1]:
from sklearn.neighbors import NearestNeighbors

# Extract product vectors for content-based filtering
product_vectors = df_product_vectors_positive_sentiment.iloc[:, 1:].values

# Fit NearestNeighbors model
top_n_products = 300  # Number of closest products to retrieve
nbrs = NearestNeighbors(n_neighbors=top_n_products, metric='cosine').fit(product_vectors)

# Retrieve top-N products for each user
distances, indices = nbrs.kneighbors(collaborative_scores)

# Convert distances to similarity scores
similarity_scores = 1 - distances  # Convert cosine distances to similarity scores


NameError: name 'df_product_vectors_positive_sentiment' is not defined

Combine Collaborative and Content Based Score

In [None]:
# Combine both collaborative and content-based scores
combined_scores = 0.5 * collaborative_scores + 0.5 * similarity_scores

# Prepare recommendations for each user
def get_recommendations(user_id, top_n=10):
    user_index = collaborative_data['user_ID'].unique().tolist().index(user_id)
    product_indices = indices[user_index]
    scores = combined_scores[user_index]

    # Get the top N recommendations
    recommended_products = [(df_product_vectors_positive_sentiment.iloc[idx]['product_ID'], score) for idx, score in zip(product_indices, scores)]
    return recommended_products[:top_n]

# Example: Get top 10 recommendations for a user
user_id = collaborative_data['user_ID'].iloc[0]
top_10_recommendations = get_recommendations(user_id, top_n=10)
print(top_10_recommendations)
