In [2]:
from run_pattern_similarity import get_rating_pattern_similarities

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Helper functions

In [2]:
def compute_proportions(ratings, total_ratings):
    """Convert per-rating counts into proportions of a total.

    Args:
        ratings: Iterable of counts, one entry per rating value.
        total_ratings: Denominator applied to every count.

    Returns:
        A list holding ``count / total_ratings`` for each count, in input
        order. When ``total_ratings`` is 0 every proportion is reported as
        0.0 instead of raising ``ZeroDivisionError``.
    """
    if total_ratings == 0:
        # Nothing was rated, so there is no distribution to normalise.
        return [0.0 for _ in ratings]
    return [count / total_ratings for count in ratings]

def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors as a single value.

    Each vector is wrapped in a list so that scikit-learn's pairwise
    ``cosine_similarity`` sees it as a one-row matrix; the only entry of the
    resulting 1x1 matrix is returned.
    """
    first_row = [vector1]
    second_row = [vector2]
    similarity_matrix = cosine_similarity(first_row, second_row)
    return similarity_matrix[0][0]

## load data

In [3]:
# Load one table per domain; both are used below as pandas DataFrames.
beauty_domain, appliance_domain = "All_Beauty", "Appliances"
all_beauty = get_rating_pattern_similarities(beauty_domain)
appliances = get_rating_pattern_similarities(appliance_domain)


Processing domain: All_Beauty
Processing domain: Appliances


In [5]:
# Per-domain summary: review count, number of distinct users/items, and the
# distinct ID arrays themselves (reused later for the overlap computation).
# NOTE(review): entries are keyed by str(domain), i.e. the printed form of the
# whole table. Two tables with the same printed form would collide, and building
# the key is not free. The key is kept because later cells look entries up with
# the same str(...) expression.
domain_statistics = {}
for domain in [all_beauty, appliances]:
    # Compute each distinct-ID array once and reuse it for both the count and
    # the stored IDs, instead of scanning the column twice.
    user_ids = domain["user_id"].unique()
    item_ids = domain["parent_asin"].unique()
    domain_statistics[str(domain)] = {
        "num_reviews": len(domain),
        "num_users": len(user_ids),
        "num_items": len(item_ids),
        "user_ids": user_ids,
        "item_ids": item_ids,
    }

In [6]:
# Number of users and items that appear in both domains.
# Look each domain's statistics up once; str(DataFrame) is the dictionary key
# used when the statistics were built.
beauty_stats = domain_statistics[str(all_beauty)]
appliances_stats = domain_statistics[str(appliances)]

count_overlap_users = len(set(beauty_stats["user_ids"]) & set(appliances_stats["user_ids"]))
count_overlap_items = len(set(beauty_stats["item_ids"]) & set(appliances_stats["item_ids"]))

print(f"All users all_beauty: {beauty_stats['num_users']}")
# This line reports the appliances *user* count, so the label says "users".
print(f"All users appliances: {appliances_stats['num_users']}")
print(f"Number of overlapping users: {count_overlap_users}")
print(f"Number of overlapping items: {count_overlap_items}")

All users all_beauty: 631986
All items appliances: 1755732
Number of overlapping users: 58120
Number of overlapping items: 0


In [6]:
# Inspect the All_Beauty table; the rendered output elides the middle rows.
all_beauty

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True
3,1.0,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1643393630220,0,True
4,5.0,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1609322563534,0,True
...,...,...,...,...,...,...,...,...,...,...
701523,4.0,Four Stars,Conditioner is great shampoo not as I expected,[],B006YUIWKA,B006YUIWKA,AFIXGFVEGLMOTMBTJL7H3VSIETDQ,1478227021000,0,True
701524,1.0,Pretty,Did not work! Used the whole bottle and my hai...,[],B006YUIWKA,B006YUIWKA,AFV7YZFOJF564EZGET5LG45K4QEA,1480908730000,0,False
701525,5.0,Great sunless tanner,Product as expected. Shipping was on time.,[],B06ZZV9MZT,B06ZZV9MZT,AHYDCWDMMVMLBX7FY7M7JKADKRDQ,1590547974067,0,True
701526,5.0,The Crown on top is a Ring!!!,"Not only is it a delicious fragrance, but also...",[],B000HB6VLE,B000HB6VLE,AF6ZIAEN7TQ2WY5ZL77F6JDPV7XQ,1184798209000,4,False


## compute user-item rating matrices for both datasets (shared users)

In [7]:
# Column groups used throughout this cell.
pair_columns = ["user_id", "parent_asin"]
matrix_columns = ["user_id", "parent_asin", "rating"]

# Keep only the reviews written by users who also appear in the other domain.
beauty_user_is_shared = all_beauty["user_id"].isin(domain_statistics[str(appliances)]["user_ids"])
appliances_user_is_shared = appliances["user_id"].isin(domain_statistics[str(all_beauty)]["user_ids"])
all_beauty_filtered = all_beauty.loc[beauty_user_is_shared]
appliances_filtered = appliances.loc[appliances_user_is_shared]

# Flag every review whose (user, item) pair occurs more than once. pivot()
# needs each pair to be unique, so these rows have to go before reshaping.
# NOTE(review): keep=False flags *all* copies of a repeated pair, so none of
# them survive; a user whose only reviews are repeated pairs disappears from
# that domain's matrix. Confirm this is intended rather than keeping one copy.
all_beauty_duplicates = all_beauty_filtered.duplicated(subset=pair_columns, keep=False)
appliances_duplicates = appliances_filtered.duplicated(subset=pair_columns, keep=False)

# Drop the flagged rows and narrow to the three columns the pivot needs.
all_beauty_filtered = all_beauty_filtered.loc[~all_beauty_duplicates, matrix_columns]
appliances_filtered = appliances_filtered.loc[~appliances_duplicates, matrix_columns]

# Users as rows, items as columns, rating as the cell value; pairs without a
# rating are filled with 0.
all_beauty_matrix = all_beauty_filtered.pivot(
    index="user_id", columns="parent_asin", values="rating"
).fillna(0)
appliances_matrix = appliances_filtered.pivot(
    index="user_id", columns="parent_asin", values="rating"
).fillna(0)

In [9]:
from sklearn.decomposition import TruncatedSVD, NMF
import numpy as np

In [8]:
# (number of users, number of items) in the All_Beauty user-item matrix.
all_beauty_matrix.shape

(57723, 30434)

In [None]:
# Reduce each user's rating row to 10 latent factors.
# TruncatedSVD uses a randomized solver by default, so random_state is fixed to
# make the embeddings reproducible across kernel restarts.
svd = TruncatedSVD(n_components=10, random_state=42)  # k = number of latent factors
user_factors_1 = svd.fit_transform(all_beauty_matrix)  # User embeddings for Dataset 1

In [11]:
# User embeddings for the Appliances matrix.
# NOTE(review): this calls fit_transform on the same `svd` object that was
# already fitted on all_beauty_matrix, so the estimator is re-fitted here and
# the two sets of factors come from separate fits. Confirm they are not later
# compared as if they lived in one shared latent space.
user_factors_2 = svd.fit_transform(appliances_matrix)

In [34]:
# Shape check: (rows of appliances_matrix, number of SVD components).
user_factors_2.shape

(57689, 10)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute pairwise cosine similarity for shared users

In [13]:
# Work on the top-left corner of the matrix only: the first 20000 rows (users)
# and the first 20000 columns (items), selected by position.
row_limit, col_limit = 20000, 20000
all_beauty_matrix_reduced = all_beauty_matrix.iloc[:row_limit, :col_limit]

In [None]:
# Pairwise cosine similarity between the rows (users) of the reduced matrix.
# With a single argument scikit-learn compares the rows of X against
# themselves, which is what passing the same matrix twice does.
cos_sim = cosine_similarity(all_beauty_matrix_reduced)

In [None]:
# Display the user-by-user similarity matrix computed above.
cos_sim

## group ratings by item

In [18]:
# Count reviews per (item, rating) pair, then spread the rating values into
# columns so each row is one item; missing combinations count as 0.
reviews_by_item_and_rating = all_beauty.groupby(['parent_asin', 'rating'])
pair_counts = reviews_by_item_and_rating.size()
ratings_counts_all_beauty = pair_counts.unstack(fill_value=0)

# Row-wise sum across the rating columns = total number of ratings per item.
total_ratings_all_beauty = ratings_counts_all_beauty.sum(axis="columns")

# Preview both results side by side as a tuple.
(ratings_counts_all_beauty.head(), total_ratings_all_beauty.head())

(rating       1.0  2.0  3.0  4.0  5.0
 parent_asin                         
 0005946468     0    0    0    0    1
 0123034892     0    0    0    0    1
 0124784577     0    0    1    0    2
 0515059560     0    0    0    1    0
 0692508988     0    0    0    0    1,
 parent_asin
 0005946468    1
 0123034892    1
 0124784577    3
 0515059560    1
 0692508988    1
 dtype: int64)

## convert counts to proportions

In [None]:
# Divide each item's row of rating counts by that item's total, aligning the
# totals on the row index, so every row becomes a rating distribution.
all_beauty_rating_proportions = ratings_counts_all_beauty.div(
    total_ratings_all_beauty,
    axis="index",
)

In [None]:
# Preview the first rows of the per-item rating proportions.
all_beauty_rating_proportions.head()