In [None]:
import pandas as pd

# Location of the tab-separated category-pair similarity scores
file_path = 'data/similarity_results.txt'

# Load the headerless file, naming its three columns explicitly
column_names = ['Category1', 'Category2', 'Similarity']
df = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

# Trim surrounding whitespace from both category-name columns
for category_col in ('Category1', 'Category2'):
    df[category_col] = df[category_col].str.strip()

# Keep only the pairs made of two different categories
is_distinct_pair = df['Category1'].ne(df['Category2'])
df_filtered = df[is_distinct_pair]

# Rank the remaining pairs from most to least similar
df_sorted_filtered = df_filtered.sort_values('Similarity', ascending=False)

# Print the ranked pairs
print(df_sorted_filtered)


                       Category1                 Category2  Similarity
6                     All_Beauty  Beauty_and_Personal_Care    0.277004
290                   Gift_Cards        Subscription_Boxes    0.272520
214  Cell_Phones_and_Accessories               Electronics    0.263107
97        Arts_Crafts_and_Sewing         Handmade_Products    0.257519
105       Arts_Crafts_and_Sewing           Office_Products    0.241480
..                           ...                       ...         ...
221  Cell_Phones_and_Accessories              Kindle_Store    0.017323
361    Industrial_and_Scientific              Kindle_Store    0.015454
127                   Automotive              Kindle_Store    0.006102
262                  Electronics              Kindle_Store    0.005018
74                    Appliances              Kindle_Store   -0.001945

[435 rows x 3 columns]


## Process data for cross-domain recommendation (CDR)

In [3]:
from utils import get_domain_reviews
import numpy as np
import pandas as pd

In [4]:
# Load the reviews of the source domain and preview the first five rows
source_domain_name = "Software"
source_domain = get_domain_reviews(source_domain_name)
source_domain.head(n=5)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,1.0,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1562182632076,0,False
1,5.0,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1424120336000,0,True
2,5.0,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1362399267000,0,True
3,4.0,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,1561061428662,0,True
4,4.0,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,1418257196000,0,True


In [5]:
# Load the reviews of the target domain and preview the first five rows
target_domain_name = "Video_Games"
target_domain = get_domain_reviews(target_domain_name)
target_domain.head(n=5)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4.0,It’s pretty sexual. Not my fav,I’m playing on ps5 and it’s interesting. It’s...,[],B07DJWBYKP,B07DK1H3H5,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1608186804795,0,True
1,5.0,Good. A bit slow,Nostalgic fun. A bit slow. I hope they don’t...,[],B00ZS80PC2,B07SRWRH5D,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1587051114941,1,False
2,5.0,... an order for my kids & they have really en...,This was an order for my kids & they have real...,[],B01FEHJYUU,B07MFMFW34,AGXVBIUFLFGMVLATYXHJYL4A5Q7Q,1490877431000,0,True
3,5.0,Great alt to pro controller,"These work great, They use batteries which is ...",[],B07GXJHRVK,B0BCHWZX95,AFTC6ZR5IKNRDG5JCPVNVMU3XV2Q,1577637634017,0,True
4,5.0,solid product,I would recommend to anyone looking to add jus...,[],B00HUWA45W,B00HUWA45W,AFTC6ZR5IKNRDG5JCPVNVMU3XV2Q,1427591932000,0,True


In [20]:
# Resolve the identifier columns: prefer the renamed 'name:type' columns and fall
# back to the raw review column names, so this cell works whether or not the
# renaming cell has already been executed (it sits above that cell in the notebook).
user_col = 'user_id:token' if 'user_id:token' in target_domain.columns else 'user_id'
item_col = 'item_id:token' if 'item_id:token' in target_domain.columns else 'parent_asin'

num_users = target_domain[user_col].nunique()
num_items = target_domain[item_col].nunique()

# Total possible interactions (size of the full user x item matrix)
total_possible_interactions = num_users * num_items

# Actual interactions (each row of the dataset is one observed rating)
actual_interactions = len(target_domain)

# Sparsity = share of user-item cells without an interaction.
# An empty frame has no cells at all, so report NaN instead of dividing by zero.
sparsity = (
    1 - (actual_interactions / total_possible_interactions)
    if total_possible_interactions
    else float('nan')
)

In [21]:
# Display the sparsity value computed in the previous cell
sparsity

0.999987821011192

In [None]:
from utils import get_domain_reviews
# Raw review columns -> 'name:type' column names written to the .inter files
_INTERACTION_COLUMNS = {
    'user_id': 'user_id:token',
    'parent_asin': 'item_id:token',
    'rating': 'rating:float',
    'timestamp': 'timestamp:float',
}


def _recent_interaction_sample(reviews, n, years, random_state):
    """Return up to ``n`` renamed interactions from the last ``years`` years of ``reviews``."""
    interactions = reviews[list(_INTERACTION_COLUMNS)].rename(columns=_INTERACTION_COLUMNS)
    # The window is measured in milliseconds, back from the newest review in this frame.
    cutoff = interactions['timestamp:float'].max() - years * 365 * 24 * 60 * 60 * 1000
    recent = interactions[interactions['timestamp:float'] >= cutoff]
    # DataFrame.sample raises ValueError when asked for more rows than exist, so cap it.
    sample_size = min(n, len(recent))
    return recent.sample(sample_size, random_state=random_state).reset_index(drop=True)


def make_rec_df(source_domain, target_domain, n=250000, years=5, random_state=42):
    """Build and write the source/target interaction files for a domain pair.

    For each domain the reviews are loaded with ``get_domain_reviews``, reduced to
    the user / item / rating / timestamp columns (renamed to ``name:type`` form),
    restricted to the ``years`` years preceding that domain's latest review, and
    randomly sampled.

    Args:
        source_domain: Domain name passed to ``get_domain_reviews`` for the source side.
        target_domain: Domain name passed to ``get_domain_reviews`` for the target side.
        n: Maximum number of interactions sampled per domain. If a domain has fewer
            recent interactions than ``n``, all of them are kept.
        years: Length of the recency window, in 365-day years.
        random_state: Seed passed to ``DataFrame.sample`` for reproducible sampling.

    Returns:
        None. Two tab-separated files are written under
        ``vendor/recbole-cdr/recbole_cdr/dataset/`` (the directories must already exist).
    """
    source_reviews = get_domain_reviews(source_domain)
    target_reviews = get_domain_reviews(target_domain)

    source_interactions = _recent_interaction_sample(source_reviews, n, years, random_state)
    target_interactions = _recent_interaction_sample(target_reviews, n, years, random_state)

    target_interactions.to_csv('vendor/recbole-cdr/recbole_cdr/dataset/target_domain_reviews/target_domain_reviews.inter', sep='\t', index=False)
    source_interactions.to_csv('vendor/recbole-cdr/recbole_cdr/dataset/source_domain_reviews/source_domain_reviews.inter', sep='\t', index=False)



In [6]:
# Keep only the user, item, rating and timestamp fields, renamed to 'name:type' form.
# Renaming first and then selecting the renamed columns keeps this cell safe to
# re-run: rename() ignores mapping keys that are no longer present.
source_domain = source_domain.rename(columns={
    'user_id': 'user_id:token',
    'parent_asin': 'item_id:token',
    'rating': 'rating:float',
    'timestamp': 'timestamp:float'
})[['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']]

# Export of the full (unsampled) source frame is disabled (commented out)
#source_domain.to_csv('vendor/recbole-cdr/recbole_cdr/datasets/source_domain_reviews/source_domain_reviews.inter', sep='\t', index=False)

In [7]:
# Keep only the user, item, rating and timestamp fields, renamed to 'name:type' form.
# Renaming first and then selecting the renamed columns keeps this cell safe to
# re-run: rename() ignores mapping keys that are no longer present.
target_domain = target_domain.rename(columns={
    'user_id': 'user_id:token',
    'parent_asin': 'item_id:token',
    'rating': 'rating:float',
    'timestamp': 'timestamp:float'
})[['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']]
# Export of the full (unsampled) target frame is disabled (commented out)
#target_domain.to_csv('vendor/recbole-cdr/recbole_cdr/datasets/target_domain_reviews/target_domain_reviews.inter', sep='\t', index=False)

In [8]:
# Count the users that have reviews in both the source and the target domain
len(set(target_domain['user_id:token']) & set(source_domain['user_id:token']))

269068

In [14]:
# Source: keep only reviews from the 5 years preceding the latest source review.
# Timestamps are Unix epoch milliseconds, hence the trailing * 1000.
latest_review = source_domain['timestamp:float'].max()
five_years_ago = latest_review - 5 * 365 * 24 * 60 * 60 * 1000
source_5_years = source_domain[source_domain['timestamp:float'] >= five_years_ago]
# Sample up to n interactions from the source domain. The size is capped at the
# number of available rows because DataFrame.sample raises ValueError otherwise.
n = 250000
source_5_years_sample = source_5_years.sample(min(n, len(source_5_years)), random_state=42).reset_index(drop=True)

len(source_5_years)

1189720

In [15]:
# Filter the target domain to reviews from the 5 years preceding its latest review.
# Timestamps are Unix epoch milliseconds (e.g. 1608186804795), hence the trailing * 1000.
latest_review = target_domain['timestamp:float'].max()
five_years_ago = latest_review - 5 * 365 * 24 * 60 * 60 * 1000
target_5_years = target_domain[target_domain['timestamp:float'] >= five_years_ago]
# Sample up to n interactions from the target domain. The size is capped at the
# number of available rows because DataFrame.sample raises ValueError otherwise.
n = 250000
target_5_years_sample = target_5_years.sample(min(n, len(target_5_years)), random_state=42).reset_index(drop=True)


len(target_5_years_sample)

250000

In [16]:
# Count the users shared by the sampled source and target interaction sets
len(set(source_5_years_sample['user_id:token']) & set(target_5_years_sample['user_id:token']))

3542

In [17]:
# Write the sampled target interactions as a tab-separated .inter file
target_inter_path = 'vendor/recbole-cdr/recbole_cdr/dataset/target_domain_reviews/target_domain_reviews.inter'
target_5_years_sample.to_csv(target_inter_path, sep='\t', index=False)

In [18]:
# Write the sampled source interactions as a tab-separated .inter file
source_inter_path = 'vendor/recbole-cdr/recbole_cdr/dataset/source_domain_reviews/source_domain_reviews.inter'
source_5_years_sample.to_csv(source_inter_path, sep='\t', index=False)

In [32]:
# Number of distinct user ids in the sampled target-domain interactions.
# NOTE(review): the stored output (438915) is larger than the 250000 rows this
# sample held when its length was displayed earlier, so that output likely comes
# from a different kernel state — re-run to confirm.
target_5_years_sample['user_id:token'].nunique()

438915