# NIRS (preprocessing)

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
import gzip
import json
import random
import sklearn
import torch

def seed_everything(seed=42):
    # Seed the random number generator
    random.seed(seed)

    # Seed NumPy
    np.random.seed(seed)

    # Seed scikit-learn
    sklearn.utils.check_random_state(seed)

    # Seed PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    # Set pandas options
    pd.set_option('display.max_columns', None)  # Display all columns in pandas DataFrames
    pd.set_option('display.max_rows', None)  # Display all rows in pandas DataFrames
    pd.set_option('display.width', None)  # Disable column width restriction
    pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping in pandas DataFrames

seed_everything(seed=42)

In [2]:
def parse(path):
  """read the dataset from json"""
  
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l.strip())
    
def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
    
  return pd.DataFrame.from_dict(df, orient='index')

def count_nan_values(df):
    nan_counts = df.isna().sum()
    return nan_counts[nan_counts > 0]

def count_empty_strings(df):
    empty_string_counts = (df == '').sum()
    return empty_string_counts[empty_string_counts > 0]

def print_shapes(reviews_df, products_df):
    print(f"Reviews df shape: {reviews_df.shape}")
    print(f"Products df shape: {products_df.shape}")
    
def save_data(reviews_df, products_df, reviews_file, products_file):
    reviews_df.to_csv(reviews_file, index=False)
    products_df.to_csv(products_file, index=False)

## Reading data (skip if you already have the sampled data you need)

In [3]:
df_reviews = getDF('data/Office_Products_5.json.gz')
df_products = getDF('data/meta_Office_Products.json.gz')

In [4]:
print(f'Number of unique products: {df_reviews["asin"].nunique()}')
print(f'Number of unique users: {df_reviews["reviewerID"].nunique()}')

Number of unique products: 27965
Number of unique users: 101501


In [5]:
print_shapes(df_reviews, df_products)

Reviews df shape: (800357, 12)
Products df shape: (315458, 19)


In [6]:
df_reviews.head(3)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"11 7, 2017",A2NIJTYWADLK57,140503528,{'Format:': ' Board book'},cotton clay,kids like story BUT while i really wanted a bo...,"good story, small size book though",1510012800,,
1,4.0,True,"03 7, 2017",A2827D8EEURMP4,140503528,{'Format:': ' Hardcover'},emankcin,Bought this used and it came in great conditio...,Good,1488844800,,
2,5.0,True,"06 25, 2016",APB6087F4J09J,140503528,{'Format:': ' Board book'},Starbucks Fan,Every story and book about Corduroy is Fantast...,Best Books for All Children,1466812800,,


In [7]:
df_products.head(3)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Office Products, Office & School Supplies, Ed...","class=""a-keyvalue prodDetTable"" role=""present...",[Sequential Spelling is based on the classic O...,,Sequential Spelling Level 1 Bundle with Studen...,[],,STL Distributors,[],"[>#439,654 in Office Products (See top 100), >...","[1935943065, 1935943073, B00IJH9Q4M, 002115021...",Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","August 15, 2014",$32.90,12624861,[],[],
1,"[Office Products, Office &amp; School Supplies...",,"[Unusual book, , ]",,"Mathematics, Applications and Concepts, Course...",[],,bailey,[],"3,839,628 in Books (",[],Books,,,$8.62,78652669,[],[],
2,[],"class=""a-keyvalue prodDetTable"" role=""present...",[Pearson MyHistoryLab Online Access Code for A...,,Pearson MyHistoryLab Online Access Code for Am...,[],,Pearson MyHistoryLab,[Pearson MyHistoryLab Online Access Code for A...,"[>#1,925,354 in Office Products (See top 100)]",[],Office Products,,"June 21, 2012",$0.99,136039847,[],[],


## Sampling data (skip if you already have the sampled data you need)

In [8]:
def sample_data(reviews_df, products_df, min_reviews_count=10, max_users=1000, frac_sampled_products=0.1):
    # Sample a subset of users based on the number of reviews they have 
    user_reviews_count = reviews_df['reviewerID'].value_counts()
    selected_users = user_reviews_count[user_reviews_count >= min_reviews_count].index[:max_users]
    reviews_subset: pd.DataFrame = reviews_df[reviews_df['reviewerID'].isin(selected_users)]
    
    # Get all the products reviewed by the selected users
    reviewed_products = reviews_subset['asin'].unique()
    sampled_products: pd.DataFrame = products_df.sample(frac=frac_sampled_products, random_state=42)
    
    # Add the missing products that are reviewed
    missing_products = set(reviewed_products) - set(sampled_products['asin'])
    missing_products_df = products_df[products_df['asin'].isin(missing_products)]
    sampled_products = pd.concat([sampled_products, missing_products_df])
    
    return reviews_subset, sampled_products

In [9]:
df_reviews_sampled, df_products_sampled = sample_data(df_reviews, df_products, min_reviews_count=10, frac_sampled_products=0.1)

# save sampled data as checkpoint
save_data(df_reviews_sampled, df_products_sampled, 'data/reviews_sampled.csv', 'data/products_sampled.csv')

In [10]:
print_shapes(df_reviews_sampled, df_products_sampled)

Reviews df shape: (45779, 12)
Products df shape: (44109, 19)


## Missing values handling / removing irrelevant features

In [11]:
# optional checkpoint to skip the initial reading and sampling step
df_reviews_sampled = pd.read_csv('data/reviews_sampled.csv')
df_products_sampled = pd.read_csv('data/products_sampled.csv')

df_reviews_cleaned = df_reviews_sampled.copy()
df_products_cleaned = df_products_sampled.copy()

### Process for reviews dataset

In [12]:
print_shapes(df_reviews_cleaned, df_products_cleaned)

Reviews df shape: (45779, 12)
Products df shape: (44109, 19)


In [13]:
# remove irrelevant columns
df_reviews_cleaned.drop(columns=['verified', 'unixReviewTime', 'style', 'image', 'vote'], inplace=True)

In [14]:
# fill evential null values in the reviewTime and adapt the date format
df_reviews_cleaned['reviewTime'] = pd.to_datetime(df_reviews_cleaned['reviewTime'], errors='coerce')
df_reviews_cleaned['reviewTime'] = df_reviews_cleaned['reviewTime'].fillna(pd.Timestamp.min).dt.strftime('%B %d, %Y')

In [15]:
# remove sample with empty reviewer name and reviwer text, since
# it's a very small percentage of the dataset
df_reviews_cleaned.dropna(subset=['reviewerName', 'reviewText'], inplace=True)

In [16]:
print("Total nan values: " , df_reviews_cleaned.isna().sum())

Total nan values:  overall         0
reviewTime      0
reviewerID      0
asin            0
reviewerName    0
reviewText      0
summary         0
dtype: int64


In [17]:
print_shapes(df_reviews_cleaned, df_products_cleaned)

Reviews df shape: (45710, 7)
Products df shape: (44109, 19)


### Process for the product dataset

In [18]:
print('Nan values per feature: \n', count_nan_values(df_products_cleaned))

Nan values per feature: 
 tech1           11287
fit             44104
title               3
tech2           44070
brand             505
main_cat          191
similar_item    22407
date             5068
price           16957
details           840
dtype: int64


In [19]:
# Replace illegal dates with the oldest possible date format
df_products_cleaned['date'] = pd.to_datetime(df_products_cleaned['date'], errors='coerce')
df_products_cleaned['date'] = df_products_cleaned['date'].fillna(pd.Timestamp.min).dt.strftime('%B %d, %Y')

In [20]:
default_main_cat = 'Office Products'

# Fill nan values of main category with 'Office Products', which is the main category in the dataset
df_products_cleaned['main_cat'] = df_products_cleaned['main_cat'].fillna(default_main_cat)

# Remove rows with main category starting with '<', which are the start of an html tag
df_products_cleaned = df_products_cleaned[~df_products_cleaned['main_cat'].str.startswith('<')]

In [21]:
# drop the only samples with nan values in title
df_products_cleaned.dropna(subset=['title'], inplace=True)

In [22]:
# fill the missing vlaues of brands with 'Unknown'
df_products_cleaned['brand'] = df_products_cleaned['brand'].fillna('Unknown')

In [23]:
# remove useless / irrelevant columns / columns without meaningful data
# (details is also irrilevant, as most of the samples has empty json)
# we will remove the "category" feature for now as well (the one with the list of categories; 
# we may add it again later if we find a way to use it)
cols_to_drop = ['similar_item', 'price', 'details', 'also_view', 'also_buy', "imageURL", "imageURLHighRes", 'tech1', 'tech2', 'fit', 'category']
df_products_cleaned.drop(cols_to_drop, axis=1, inplace=True)

In [24]:
df_products_cleaned.head(3)

Unnamed: 0,description,title,brand,feature,rank,main_cat,date,asin
0,['Protect yourself and your RFID card with a S...,Black RFID Blocking ID Badge Holder (Holds 2 C...,Specialist ID,"['RFID Blocking 2 Card Holder', 'FIPS 201 Appr...","['>#43,873 in Office Products (See top 100)', ...",Office Products,"October 14, 2011",B005VSY1VK
1,['The Star Wars Moleskine Saga continues in 20...,Moleskine 2015 Star Wars Limited Edition Daily...,Moleskine,[],[],Office Products,"December 26, 2013",8867323296
2,"['Staples Washable Glue Sticks, Purple, .26 oz...","Staples Washable Glue Sticks, Purple, .26 oz.,...",Staples,[],"['>#161,293 in Office Products (See top 100)',...",Office Products,"June 22, 2015",B011LAU4R6


In [25]:
print_shapes(df_reviews_cleaned, df_products_cleaned)

Reviews df shape: (45710, 7)
Products df shape: (43810, 8)


## Text preprocessing

In [26]:
import text_preprocessing as tp

df_reviews_text_processed = df_reviews_cleaned.copy()
df_products_text_processed = df_products_cleaned.copy()

preprocessor = tp.TextPreprocessor()

In [27]:
df_reviews_text_processed['summary'] = df_reviews_text_processed['summary'].astype(str)
df_reviews_text_processed['summary'] = preprocessor.fit_transform(df_reviews_text_processed['summary'])

df_reviews_text_processed = df_reviews_text_processed[df_reviews_text_processed['summary'] != '']

In [28]:
df_reviews_text_processed['reviewText'] = df_reviews_text_processed['reviewText'].astype(str)
df_reviews_text_processed['reviewText'] = preprocessor.fit_transform(df_reviews_text_processed['reviewText'])

df_reviews_text_processed = df_reviews_text_processed[df_reviews_text_processed['reviewText'] != '']

In [29]:
df_products_text_processed['description'] = df_products_text_processed['description'].astype(str)
df_products_text_processed['description'] = preprocessor.fit_transform(df_products_text_processed['description'])
df_products_text_processed['description'] = df_products_text_processed['description'].str[1:-1]

df_products_text_processed = df_products_text_processed[df_products_text_processed['description'] != '']

In [30]:
# parse the feature column in lists, and then concatenate them together
# the prerprocessing will follow
df_products_text_processed['feature'] = df_products_text_processed['feature'].astype(str)
df_products_text_processed['feature'] = df_products_text_processed['feature'].apply(lambda x: ". ".join(eval(x)).strip())
df_products_text_processed['feature'] = preprocessor.fit_transform(df_products_text_processed['feature'])

df_products_text_processed = df_products_text_processed[df_products_text_processed['feature'] != '']

In [31]:
# preprocess title
df_products_text_processed['title'] = df_products_text_processed['title'].astype(str)
df_products_text_processed['title'] = preprocessor.fit_transform(df_products_text_processed['title'])

In [32]:
# preprocess brand
df_products_text_processed["brand"] = preprocessor.fit_transform(df_products_text_processed["brand"])

## Handling "rank" feature from product dataset

We decided just to keep the rank of the product in the office category (the rank will be -1 if the product doen't have a rank in hte Office category)

In [33]:
import re

df_products_rank_managed = df_products_text_processed.copy()

def extract_office_product_rank(rank):
    match = re.search(r'>#(\d+)', rank)
    if match and 'Office Products' in rank:
        return int(match.group(1))
    else:
        return -1

df_products_rank_managed['rank'] = df_products_rank_managed['rank'].astype(str)
df_products_rank_managed['rank'] = df_products_rank_managed['rank'].apply(extract_office_product_rank)

## Renaming features

In [34]:
df_products_ranaming_cols = df_products_rank_managed.copy()

df_products_ranaming_cols.rename(columns={'date': 'productPublishedDate'}, inplace=True)

## Remove reviews of products missing in the product dataset

In [35]:
df_reviews_product_consistent = df_reviews_text_processed.copy()

# Remove reviews of products missing in the product dataset
df_reviews_product_consistent = df_reviews_product_consistent[
  df_reviews_product_consistent['asin'].isin(df_reviews_product_consistent['asin'])]

## One-hot encoding for "main category" feature of product

In [36]:
df_products_main_cat_onehot = df_products_ranaming_cols.copy()

print(df_products_main_cat_onehot["main_cat"].nunique())

df_products_main_cat_onehot = pd.get_dummies(df_products_main_cat_onehot, columns=['main_cat'], prefix='main_cat')

# Convert the one-hot encoded features to integer
one_hot_columns = [col for col in df_products_main_cat_onehot.columns if col.startswith('main_cat_')]
df_products_main_cat_onehot[one_hot_columns] = df_products_main_cat_onehot[one_hot_columns].astype(int)

df_products_main_cat_onehot.head(2)

33


Unnamed: 0,description,title,brand,feature,rank,productPublishedDate,asin,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Home,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games
0,rotect rfid card skimsafe card holder made rig...,black rfid blocking id badge holder hold 2 car...,specialist id,rfid blocking 2 card holder fips 201 approved ...,43,"October 14, 2011",B005VSY1VK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,xclusive design classi,best abstract fiery floral design mouse pad cu...,luxladymousepad,material made best plastic manufacturing also ...,-1,"September 21, 1677",B00KH94VSG,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Managing dates for products

In [37]:
df_products_dates_managed = df_products_main_cat_onehot.copy()

#get the most recent date - first need to convert the date to datetime, it is in this format: October 14, 2011 or Unknown
def exponential_decay(time_diff, decay_rate=0.0001):
    return np.exp(-decay_rate * time_diff)

dates = df_products_dates_managed['productPublishedDate'].apply(lambda x: pd.to_datetime(x, errors='coerce'))
most_recent_date = dates.max()
most_outdated_date = dates.min()
print("most recent date:", most_recent_date)
print("most oudated date:", most_outdated_date)

#create a new column where if the date is Unknown, it will be the most outdated date, otherwise it will be the difference between the most recent date and the date of the product
df_products_dates_managed['dayDifferenceProduct'] = dates.apply(lambda x: (most_recent_date - x).days if pd.notnull(x) else (most_recent_date - most_outdated_date).days)

#apply a function to the day difference so a bigger day difference gives a 
# low value and a smaller day difference gives a high value
df_products_dates_managed['dayDifferenceProduct'] = df_products_dates_managed['dayDifferenceProduct'].apply(exponential_decay)
df_products_dates_managed.head(2)

most recent date: 2019-01-14 00:00:00
most oudated date: 1941-12-07 00:00:00


Unnamed: 0,description,title,brand,feature,rank,productPublishedDate,asin,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Home,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct
0,rotect rfid card skimsafe card holder made rig...,black rfid blocking id badge holder hold 2 car...,specialist id,rfid blocking 2 card holder fips 201 approved ...,43,"October 14, 2011",B005VSY1VK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.767283
3,xclusive design classi,best abstract fiery floral design mouse pad cu...,luxladymousepad,material made best plastic manufacturing also ...,-1,"September 21, 1677",B00KH94VSG,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.059833


## Managing dates for reviews

In [38]:
df_reviews_dates_managed = df_reviews_product_consistent.copy()

rev_dates = df_reviews_cleaned['reviewTime'].apply(lambda x: pd.to_datetime(x, errors='coerce'))


rev_most_recent_date = rev_dates.max()
rev_most_outdated_date = rev_dates.min()
print("reviews: most recent date:", rev_most_recent_date)
print("reviews: most oudated date:", rev_most_outdated_date)

df_reviews_dates_managed['dayDifferenceReview'] = rev_dates.apply(lambda x: (rev_most_recent_date - x).days if pd.notnull(x) else (rev_most_recent_date - rev_most_outdated_date).days)
df_reviews_dates_managed['dayDifferenceReview'] = df_reviews_dates_managed['dayDifferenceReview'].apply(exponential_decay)
df_reviews_dates_managed.head(2)

reviews: most recent date: 2018-09-22 00:00:00
reviews: most oudated date: 2000-09-18 00:00:00


Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,dayDifferenceReview
0,5.0,"June 02, 2015",A1HBTW5M7ZZ9PT,310818621,FTLOE,absolutely love organizer never one figured wa...,super good deal,0.886211
1,5.0,"March 21, 2014",A2F0F4NB6BLGVX,310823706,Lee,good bible carrier large print bible afraid wo...,leatherlook bible carrier,0.848233


## Merge products and reviews dataset

the following dataset (`merged_df`) will be the actual data used in NCF. This since, other than the reviews themselves, It just contains the related products information with which the users interacted already.

In [39]:
df_reviews_final = df_reviews_dates_managed.copy()
df_products_final = df_products_dates_managed.copy()

merged_df = df_reviews_final.merge(df_products_final, on='asin', how='inner')

## Create a separate dataset for unreviewed products (item with no interaction)

In [40]:
unreviewed_products_df = df_products_final[~df_products_final['asin'].isin(df_reviews_final['asin'])]

## Save the prepared data

In [41]:
merged_df.head(2)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,dayDifferenceReview,description,title,brand,feature,rank,productPublishedDate,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Home,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct
0,5.0,"April 09, 2016",A2GIQGI2UXOZ4M,439893577,Gene Sechrest,job big enough purpose fold flat fold stand gr...,ultimate kid magnetic board,0.914297,agnetic tabletop learning easel one simplestye...,little red tool box magnetic tabletop learning...,scholastic,fold flat easy storage open reveal giant 12 x ...,21,"November 25, 2006",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.641915
1,5.0,"October 27, 2017",A2M13JN7YVG29U,528960911,Stacie Baugh,love,five star,0.967539,yecatching 50 x 32 reference piece home classr...,rand mcnally mseries fullcolor laminated unite...,rand mcnally,eyecatching 50 x 32 reference piece home class...,5,"April 18, 2006",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.627884


In [42]:
unreviewed_products_df.head(2)

Unnamed: 0,description,title,brand,feature,rank,productPublishedDate,asin,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Home,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct
3,xclusive design classi,best abstract fiery floral design mouse pad cu...,luxladymousepad,material made best plastic manufacturing also ...,-1,"September 21, 1677",B00KH94VSG,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.059833
4,itten piano key mouse pad 8 x 8 x 25 made heav...,3drose llc 8 x 8 x 025 inch kitten piano key m...,3drose,dimension inch 8 w x 8 h x 025 matte finish so...,1,"July 14, 2014",B00CX71JNU,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.848318


In [43]:
print(f'Number of unique products: {df_products_final["asin"].nunique()}')
print(f'Number of unique users: {df_reviews_final["reviewerID"].nunique()}')
print(f'Number of unique reviewed products: {merged_df["asin"].nunique()}')
print(f'Number of unique unreviewed products: {unreviewed_products_df["asin"].nunique()}')

Number of unique products: 34211
Number of unique users: 1000
Number of unique reviewed products: 11369
Number of unique unreviewed products: 22842


In [44]:
save_data(merged_df, unreviewed_products_df, 'data/merged_data_processed.csv', 'data/unreviewed_products_processed.csv')

In [45]:
save_data(df_reviews_final, df_products_final, 'data/reviews_sampled_processed.csv', 'data/products_sampled_processed.csv')