In [149]:
import torch
import torch.nn.functional as F
import pandas as pd

In [150]:
#load unprocessed data
products_unprocessed = pd.read_csv('data/products_sampled.csv')
reviews_unprocessed = pd.read_csv('data/reviews_sampled.csv')

products_unprocessed.shape, reviews_unprocessed.shape


((44111, 20), (45705, 13))

In [151]:
products_unprocessed.head(1)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details,index
0,"['Office Products', 'Office & School Supplies'...","class=""a-keyvalue prodDetTable"" role=""present...",['Protect yourself and your RFID card with a S...,,Black RFID Blocking ID Badge Holder (Holds 2 C...,"['B005CXZTO2', 'B007XV1MSI', 'B000O9K45I', 'B0...",,Specialist ID,"['RFID Blocking 2 Card Holder', 'FIPS 201 Appr...","['>#43,873 in Office Products (See top 100)', ...",[],Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","October 14, 2011",$6.49,B005VSY1VK,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},108713


In [152]:
#load the merged dataset
merged_data = pd.read_csv('data/merged_data_processed.csv')
unreviewed_products = pd.read_csv('data/unreviewed_products_processed.csv')
all_products = pd.read_csv('data/products_sampled_processed.csv')
all_reviews = pd.read_csv('data/reviews_sampled_processed.csv')

In [153]:
all_products.shape, all_reviews.shape, merged_data.shape, unreviewed_products.shape

((35248, 43), (45486, 9), (47251, 51), (22896, 43))

In [154]:
#load the embeddings
rev_embeddings = torch.load('data/review_embeddings.pt')
summ_embeddings = torch.load('data/summary_embeddings.pt')
desc_embeddings = torch.load('data/description_embeddings.pt')
titles_embeddings = torch.load('data/title_embeddings.pt')
features_embeddings = torch.load('data/feature_embeddings.pt')
brand_embeddings = torch.load('data/brand_embeddings.pt')

In [155]:
product_embeddings_average = torch.mean(torch.stack([desc_embeddings, titles_embeddings, features_embeddings, brand_embeddings]), dim=0)
user_embeddings_average = torch.mean(torch.stack([rev_embeddings, summ_embeddings]), dim=0)

In [156]:
product_embeddings_map = {}
for i, row in all_products.iterrows():
    product_id = row['asin']
    product_embeddings_map[product_id] = product_embeddings_average[i]

In [157]:
#select 1 user randomly for merged data
user = merged_data['reviewerID'].sample(1).values[0]

user

#select top 5 reviews for the user from the all_reviews
user_reviews = all_reviews[all_reviews['reviewerID'] == user].sort_values('overall', ascending=False).head(5)

#get the index of the reviews (index is a column)
user_reviews_index = user_reviews["index"].values
user_reviews_index

embeddings_index = user_reviews.index


#get embeddings that correspond to the user reviews
user_reviews_embeddings = user_embeddings_average[embeddings_index]
user_reviews_embeddings


#get the embeddings that correspond to the unreviewed products

unreviewed_product_embeddings = {}
for i, row in unreviewed_products.iterrows():
    product_id = row['asin']
    unreviewed_product_embeddings[product_id] = product_embeddings_map[product_id]




#get the cosine similarity between the user reviews and all the unreviewed products

unreviewed_product_similarity = {}
for i in range(len(unreviewed_product_embeddings)):
    product_id = unreviewed_products.iloc[i]['asin']
    product_embedding = unreviewed_product_embeddings[product_id]
    similarity = F.cosine_similarity(user_reviews_embeddings, product_embedding, dim=1)
    unreviewed_product_similarity[product_id] = similarity.mean().item()


In [158]:
print(unreviewed_product_similarity)

#select the 5 best products
best_products = sorted(unreviewed_product_similarity.items(), key=lambda x: x[1], reverse=True)[:5]

best_products

{'B00KH94VSG': 0.9653437733650208, 'B00CX71JNU': 0.9636460542678833, 'B002CO43BO': 0.9640669822692871, 'B0187ZSO20': 0.9528821706771851, 'B00J80NAKK': 0.9616990089416504, 'B001A3Y5UM': 0.9604053497314453, 'B006DXANZA': 0.9548710584640503, 'B0015ZUKH2': 0.9625096321105957, 'B00M55B2BE': 0.9664911031723022, 'B00422MOBU': 0.9544351696968079, 'B00LPSJL92': 0.952394962310791, 'B000VZ1468': 0.9655637741088867, 'B016IB1K6W': 0.9412863850593567, 'B00O4AFZT8': 0.9590355157852173, 'B006CQU2WW': 0.9609476327896118, 'B001MHT1VE': 0.9612683057785034, 'B006922LK0': 0.9636877179145813, 'B01G7V3E6C': 0.9578800201416016, 'B004BP9PZ6': 0.9508565664291382, 'B000T6783M': 0.9531758427619934, 'B004MAZH66': 0.9574882388114929, 'B002XJR93W': 0.9615433812141418, 'B00WMFTAY8': 0.9474371671676636, 'B000MQAI72': 0.9637007713317871, 'B00006JNQJ': 0.954576849937439, 'B00NO546US': 0.948432445526123, 'B01G5IV3XI': 0.950442910194397, 'B00R1X2354': 0.9553495645523071, 'B0051G44S2': 0.9625407457351685, 'B0071NMV6K': 0.9

[('B00FB46D9M', 0.975752055644989),
 ('B010W97UBU', 0.974515438079834),
 ('B00DK0ZOZS', 0.9738521575927734),
 ('B00006ICYO', 0.9737021327018738),
 ('B001QKOCQG', 0.9736384153366089)]

In [159]:
for i in range(5):
    print(f"User review {i+1}:")
    #get index of the review (index is a column)
    review_index = user_reviews_index[i]
    print("Index:", review_index)
    #get the review where index column matches review index
    review = reviews_unprocessed[reviews_unprocessed['index'] == review_index].iloc[0]
    print("Summary:", review['summary'])
    print("Review text:", review['reviewText'])
    print("Rating:", review['overall'])
    print()

User review 1:
Index: 778092
Summary: Original Brother Cartridge
Review text: OEM Cartridge. Fits and works as expected. Prompt shipping. I love the Prime program as it gets my orders here fast. And the price lower than local suppliers.
Rating: 5.0

User review 2:
Index: 661836
Summary: Great for inexpensive clip boards.
Review text: For inexpensive clip board, we are very satisfied with these. Nothing fancy but very effective.
Rating: 5.0

User review 3:
Index: 494162
Summary: OEM HP Ink
Review text: Hey it's HP OEM ink, works great like expected. Saved a few dollars and got fast delivery. I just love the Prime program
Rating: 5.0

User review 4:
Index: 450385
Summary: Nice and thin give a decent natural feel
Review text: Nice and thin give a decent natural feel. The lubrication works very well I thought.
Rating: 5.0

User review 5:
Index: 434660
Summary: Original HP Ink
Review text: Original HP ink. Works very well and get excellent print results. Less expensive from Amazon than loca

In [160]:
#print the review text and summary for the user and also the top 5 product titles
print('Top 5 Products:')
for product in best_products:
    print(products_unprocessed[products_unprocessed['asin'] == product[0]]['title'].values[0])
    

Top 5 Products:
Uni-ball Signo Ultra Micro Point Gel Pens Refills -0.38mm-red Ink-value Set of 5(with Our Shop Original Description of Goods)
Lots Donuts Doughnuts Desktop Office Silicone Mouse Pad
Goldstar Erasable Markers
Screen Top Monitor Shelf
KNM9J04203 - 9J04203 Toner


In [161]:
print(desc_embeddings.shape)
print(titles_embeddings.shape)
print(features_embeddings.shape)
print(brand_embeddings.shape)
print(rev_embeddings.shape)
print(summ_embeddings.shape)


torch.Size([35248, 768])
torch.Size([35248, 768])
torch.Size([35248, 768])
torch.Size([35248, 768])
torch.Size([45486, 768])
torch.Size([45486, 768])
