In [None]:
import pandas as pd

# Location of the tab-separated category-similarity results
file_path = 'data/similarity_results.txt'

# The file has no header row, so the three columns are named explicitly on load
column_names = ['Category1', 'Category2', 'Similarity']
df = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

# Trim surrounding whitespace from both category columns
for category_column in ('Category1', 'Category2'):
    df[category_column] = df[category_column].str.strip()

# Drop self-pairs (a category compared with itself)
is_cross_pair = df['Category1'].ne(df['Category2'])
df_filtered = df.loc[is_cross_pair]

# Order the remaining pairs from most to least similar
df_sorted_filtered = df_filtered.sort_values('Similarity', ascending=False)

# Show the ranked pairs
print(df_sorted_filtered)


                       Category1                 Category2  Similarity
6                     All_Beauty  Beauty_and_Personal_Care    0.277004
290                   Gift_Cards        Subscription_Boxes    0.272520
214  Cell_Phones_and_Accessories               Electronics    0.263107
97        Arts_Crafts_and_Sewing         Handmade_Products    0.257519
105       Arts_Crafts_and_Sewing           Office_Products    0.241480
..                           ...                       ...         ...
221  Cell_Phones_and_Accessories              Kindle_Store    0.017323
361    Industrial_and_Scientific              Kindle_Store    0.015454
127                   Automotive              Kindle_Store    0.006102
262                  Electronics              Kindle_Store    0.005018
74                    Appliances              Kindle_Store   -0.001945

[435 rows x 3 columns]


## Process data for cross-domain recommendation (CDR)

In [1]:
from utils import get_domain_reviews
import numpy as np
import pandas as pd

In [11]:
# Load the "musical_instruments" reviews through the project helper in utils;
# this frame is treated as the source domain in the cells that follow.
source_domain = get_domain_reviews("musical_instruments")
# Preview the first rows to check the available columns
source_domain.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Five Stars,"Great headphones, comfortable and sound is goo...",[],B003LPTAYI,B003LPTAYI,AGKASBHYZPGTEPO6LWZPVJWB2BVA,1452650586000,0,True
1,3.0,nice sound. pedal failed after less than 1 year,I like the piano.. but the sustain pedal faile...,[],B00723436A,B06XP6TDVY,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1558567365290,2,True
2,4.0,okay,pretty good overall. I like it. the controll...,[],B0040FJ27S,B0040FJ27S,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1384912482000,0,True
3,3.0,Easy to return.,Too bad it didn't work. At least the return pr...,[],B00191WVF6,B00WJ3HL5I,AEM663T6XHZFWLODF4US2RCOCUSA,1607055693671,0,True
4,5.0,Good product despite tight bolt.,Good and sturdy but the bolt was hell to get o...,[],B07T9NM5QR,B07T9NM5QR,AFJTRBXMURLHS5EGNXLUHDHIZRFQ,1622595785255,0,False


In [12]:
# NOTE: DataFrame.size is the total element count (rows x columns), not the
# number of reviews; use len(source_domain) or .shape[0] for the row count.
source_domain.size

30174390

In [4]:
# Load the "video_games" reviews through the project helper in utils;
# this frame is treated as the target domain in the cells that follow.
target_domain = get_domain_reviews("video_games")
# .size is the total element count (rows x columns), not the number of reviews
print(target_domain.size)
# Preview the first rows to check the available columns
target_domain.head()

46246150


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4.0,It’s pretty sexual. Not my fav,I’m playing on ps5 and it’s interesting. It’s...,[],B07DJWBYKP,B07DK1H3H5,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1608186804795,0,True
1,5.0,Good. A bit slow,Nostalgic fun. A bit slow. I hope they don’t...,[],B00ZS80PC2,B07SRWRH5D,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1587051114941,1,False
2,5.0,... an order for my kids & they have really en...,This was an order for my kids & they have real...,[],B01FEHJYUU,B07MFMFW34,AGXVBIUFLFGMVLATYXHJYL4A5Q7Q,1490877431000,0,True
3,5.0,Great alt to pro controller,"These work great, They use batteries which is ...",[],B07GXJHRVK,B0BCHWZX95,AFTC6ZR5IKNRDG5JCPVNVMU3XV2Q,1577637634017,0,True
4,5.0,solid product,I would recommend to anyone looking to add jus...,[],B00HUWA45W,B00HUWA45W,AFTC6ZR5IKNRDG5JCPVNVMU3XV2Q,1427591932000,0,True


In [13]:
# Review timestamps appear to be Unix epoch milliseconds (e.g. 1452650586000),
# so the look-back window is expressed in milliseconds as well.
MS_PER_DAY = 24 * 60 * 60 * 1000
LOOKBACK_YEARS = 5
# A year is approximated as 365 days; leap days are ignored.
lookback_ms = LOOKBACK_YEARS * 365 * MS_PER_DAY

# Anchor the window at the most recent review in the source domain
latest_timestamp = source_domain['timestamp'].max()

# Earliest timestamp that still falls inside the look-back window
five_years_ago_timestamp = latest_timestamp - lookback_ms

# Keep only the source-domain reviews from inside the window (cutoff inclusive)
source_domain_filtered = source_domain[source_domain['timestamp'] >= five_years_ago_timestamp]

In [14]:
# (rows, columns) of the source reviews that survived the time filter
source_domain_filtered.shape

(1639342, 10)

In [12]:
# Draw a reproducible random subset of the unfiltered source-domain reviews
sample_size = 500000
source_sample = source_domain.sample(n=sample_size, random_state=42)

# Discard the original row labels so the sample is indexed from 0
sampled_source_domain = source_sample.reset_index(drop=True)

# Confirm the dimensions of the sample
sampled_source_domain.shape

(500000, 10)

In [15]:
# Map the raw review columns onto the "name:type" headers written to the .inter file
# (looks like RecBole's atomic-file header convention -- TODO confirm).
source_inter_columns = {
    'user_id': 'user_id:token',
    'parent_asin': 'item_id:token',
    'rating': 'rating:float',
    'timestamp': 'timestamp:float',
}

# Keep only the mapped columns, then rename them.
# NOTE: this rebinds source_domain_filtered with renamed columns, so re-running
# this cell without first re-running the time-filter cell raises a KeyError.
source_domain_filtered = (
    source_domain_filtered[list(source_inter_columns)]
    .rename(columns=source_inter_columns)
)

# Save the filtered source-domain reviews as a tab-separated file without the index
source_domain_filtered.to_csv('source_domain_reviews.inter', sep='\t', index=False)

In [8]:
# Keep only the target-domain reviews at or after the cutoff.
# NOTE(review): five_years_ago_timestamp is defined in another cell and is derived
# from the SOURCE domain's latest timestamp, so the target window is anchored to the
# source data rather than to target_domain's own latest review -- confirm this is intended.
target_domain_filtered = target_domain[target_domain['timestamp'] >= five_years_ago_timestamp]

In [9]:
# NOTE: DataFrame.size is the total element count (rows x columns), not the
# number of reviews; use len(target_domain_filtered) for the row count.
target_domain_filtered.size

21420070

In [10]:
# Map the raw review columns onto the "name:type" headers written to the .inter file
# (looks like RecBole's atomic-file header convention -- TODO confirm).
target_inter_columns = {
    'user_id': 'user_id:token',
    'parent_asin': 'item_id:token',
    'rating': 'rating:float',
    'timestamp': 'timestamp:float',
}

# Keep only the mapped columns, then rename them.
# NOTE: this rebinds target_domain_filtered with renamed columns, so re-running
# this cell without first re-running the time-filter cell raises a KeyError.
target_domain_filtered = (
    target_domain_filtered[list(target_inter_columns)]
    .rename(columns=target_inter_columns)
)

# Save the filtered target-domain reviews as a tab-separated file without the index
target_domain_filtered.to_csv('target_domain_reviews.inter', sep='\t', index=False)