# NIRS (preprocessing)

In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import utils

utils.seed_everything(seed=42)

## Reading data (skip if you already have the sampled data you need)

In [2]:
df_reviews = utils.getDF('data/Office_Products_5.json.gz').drop_duplicates(subset=['reviewerID', 'asin', 'reviewText', 'summary'])
df_products = utils.getDF('data/meta_Office_Products.json.gz').drop_duplicates(subset=['asin'])

In [3]:
print(f'Number of unique products: {df_products["asin"].drop_duplicates().shape[0]}')
print(f'Number of unique users: {df_reviews["reviewerID"].drop_duplicates().shape[0]}')
print(f'Number of unique reviews: {df_reviews[["reviewerID", "asin", "reviewText", "summary"]].drop_duplicates().shape[0]}') 

Number of unique products: 306617
Number of unique users: 101501
Number of unique reviews: 750068


In [4]:
utils.print_shapes(df_reviews, df_products)

Reviews df shape: (750068, 12)
Products df shape: (306617, 19)


In [5]:
# drop duplicates
df_reviews = df_reviews.drop_duplicates(subset=['reviewerID', 'asin', 'reviewText', 'summary'])
df_products = df_products.drop_duplicates(subset=['asin'])

utils.print_shapes(df_reviews, df_products)

Reviews df shape: (750068, 12)
Products df shape: (306617, 19)


In [6]:
df_reviews.head(3)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"11 7, 2017",A2NIJTYWADLK57,140503528,{'Format:': ' Board book'},cotton clay,kids like story BUT while i really wanted a bo...,"good story, small size book though",1510012800,,
1,4.0,True,"03 7, 2017",A2827D8EEURMP4,140503528,{'Format:': ' Hardcover'},emankcin,Bought this used and it came in great conditio...,Good,1488844800,,
2,5.0,True,"06 25, 2016",APB6087F4J09J,140503528,{'Format:': ' Board book'},Starbucks Fan,Every story and book about Corduroy is Fantast...,Best Books for All Children,1466812800,,


In [7]:
df_products.head(3)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Office Products, Office & School Supplies, Ed...","class=""a-keyvalue prodDetTable"" role=""present...",[Sequential Spelling is based on the classic O...,,Sequential Spelling Level 1 Bundle with Studen...,[],,STL Distributors,[],"[>#439,654 in Office Products (See top 100), >...","[1935943065, 1935943073, B00IJH9Q4M, 002115021...",Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","August 15, 2014",$32.90,12624861,[],[],
1,"[Office Products, Office &amp; School Supplies...",,"[Unusual book, , ]",,"Mathematics, Applications and Concepts, Course...",[],,bailey,[],"3,839,628 in Books (",[],Books,,,$8.62,78652669,[],[],
2,[],"class=""a-keyvalue prodDetTable"" role=""present...",[Pearson MyHistoryLab Online Access Code for A...,,Pearson MyHistoryLab Online Access Code for Am...,[],,Pearson MyHistoryLab,[Pearson MyHistoryLab Online Access Code for A...,"[>#1,925,354 in Office Products (See top 100)]",[],Office Products,,"June 21, 2012",$0.99,136039847,[],[],


## Sampling data (skip if you already have the sampled data you need)

In [8]:
# min_reviews_count = 10
# max_users = 1000
# frac_products = 0.1

# df_reviews_sampled, df_products_sampled = utils.sample_data(
#   df_reviews, df_products, min_reviews_count=min_reviews_count, max_users=max_users, frac_products=frac_products)

# # save sampled data as checkpoint
# # utils.save_data(df_reviews_sampled, df_products_sampled, 'data/reviews_sampled.csv', 'data/products_sampled.csv')
# df_reviews_sampled['index'] = df_reviews_sampled.index
# df_products_sampled['index'] = df_products_sampled.index
# utils.save_data(df_reviews_sampled, 'data/reviews_sampled.csv')
# utils.save_data(df_products_sampled, 'data/products_sampled.csv')

df_reviews_sampled, df_products_sampled = df_reviews, df_products

In [9]:
utils.print_shapes(df_reviews_sampled, df_products_sampled)

Reviews df shape: (750068, 12)
Products df shape: (306617, 19)


## Data cleaning (missing values handling / removing irrelevant features)

In [10]:
import data_cleaning as cleaning

# optional checkpoint to skip the initial reading and sampling step
# df_reviews_sampled = pd.read_csv('data/reviews_sampled.csv')
# df_products_sampled = pd.read_csv('data/products_sampled.csv')

# df_reviews_cleaned = df_reviews_sampled.copy()
# df_products_cleaned = df_products_sampled.copy()

### Process for reviews dataset

In [11]:
print("Total nan values: " , df_reviews_sampled.isna().sum())
utils.print_shape(df_reviews_sampled, 'reviews')

Total nan values:  overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             281098
reviewerName         133
reviewText           202
summary              122
unixReviewTime         0
vote              667035
image             739274
dtype: int64
reviews shape: (750068, 12)


In [12]:
df_reviews_cleaned = cleaning.clean_reviews_data(df_reviews_sampled)
df_reviews_cleaned.dropna(inplace=True)

In [13]:
print("Total nan values: " , df_reviews_cleaned.isna().sum())
utils.print_shape(df_reviews_cleaned, 'reviews')

Total nan values:  overall         0
reviewTime      0
reviewerID      0
asin            0
reviewerName    0
reviewText      0
summary         0
dtype: int64
reviews shape: (749626, 7)


### Process for the product dataset

In [14]:
print('Nan values per feature: \n', utils.count_nan_values(df_products_sampled))
utils.print_shape(df_products_sampled, 'products')

Nan values per feature: 
 details    7147
dtype: int64
products shape: (306617, 19)


In [15]:
df_products_cleaned = cleaning.clean_products_data(df_products_sampled)

In [16]:
print('Nan values per feature: \n', utils.count_nan_values(df_products_cleaned))
utils.print_shape(df_products_cleaned, 'products')

Nan values per feature: 
 Series([], dtype: int64)
products shape: (303665, 8)


## Text preprocessing

In [17]:
import text_preprocessing as tp

df_reviews_text_processed = df_reviews_cleaned.copy()
df_products_text_processed = df_products_cleaned.copy()

preprocessor = tp.TextPreprocessor()

In [18]:
df_reviews_text_processed['summary'] = df_reviews_text_processed['summary'].astype(str)
df_reviews_text_processed['summary'] = preprocessor.fit_transform(df_reviews_text_processed['summary'])

df_reviews_text_processed = df_reviews_text_processed[df_reviews_text_processed['summary'] != '']

In [19]:
df_reviews_text_processed['reviewText'] = df_reviews_text_processed['reviewText'].astype(str)
df_reviews_text_processed['reviewText'] = preprocessor.fit_transform(df_reviews_text_processed['reviewText'])

df_reviews_text_processed = df_reviews_text_processed[df_reviews_text_processed['reviewText'] != '']

In [20]:
df_products_text_processed['description'] = df_products_text_processed['description'].astype(str)
df_products_text_processed['description'] = preprocessor.fit_transform(df_products_text_processed['description'])
df_products_text_processed['description'] = df_products_text_processed['description'].str[1:-1]

df_products_text_processed = df_products_text_processed[df_products_text_processed['description'] != '']

In [21]:
# parse the feature column in lists, and then concatenate them together
# the prerprocessing will follow
df_products_text_processed['feature'] = df_products_text_processed['feature'].astype(str)
df_products_text_processed['feature'] = df_products_text_processed['feature'].apply(lambda x: ". ".join(eval(x)).strip())
df_products_text_processed['feature'] = preprocessor.fit_transform(df_products_text_processed['feature'])

df_products_text_processed = df_products_text_processed[df_products_text_processed['feature'] != '']

In [22]:
# preprocess title
df_products_text_processed['title'] = df_products_text_processed['title'].astype(str)
df_products_text_processed['title'] = preprocessor.fit_transform(df_products_text_processed['title'])

In [23]:
# preprocess brand
df_products_text_processed['brand'] = df_products_text_processed['brand'].astype(str)
df_products_text_processed["brand"] = preprocessor.fit_transform(df_products_text_processed["brand"])

In [26]:
# save checkpoint data preprocessed
utils.save_data(df_reviews_text_processed, 'data/reviews_processed_checkpoint.csv')
utils.save_data(df_products_text_processed, 'data/products_processed_checkpoint.csv')

## Handling "rank" feature from product dataset

We decided just to keep the rank of the product in the office category (the rank will be -1 if the product doen't have a rank in hte Office category)

In [24]:
import re

df_products_rank_managed = df_products_text_processed.copy()

def extract_office_product_rank(rank):
    match = re.search(r'>#(\d+)', rank)
    if match and 'Office Products' in rank:
        return int(match.group(1))
    else:
        return -1

df_products_rank_managed['rank'] = df_products_rank_managed['rank'].astype(str)
df_products_rank_managed['rank'] = df_products_rank_managed['rank'].apply(extract_office_product_rank)

## Renaming features

In [27]:
df_products_ranaming_cols = df_products_rank_managed.copy()

df_products_ranaming_cols.rename(columns={'date': 'productPublishedDate'}, inplace=True)

## One-hot encoding for "main category" feature of product

In [28]:
df_products_main_cat_onehot = df_products_ranaming_cols.copy()

print(df_products_main_cat_onehot["main_cat"].nunique())

df_products_main_cat_onehot = pd.get_dummies(df_products_main_cat_onehot, columns=['main_cat'], prefix='main_cat')

# Convert the one-hot encoded features to integer
one_hot_columns = [col for col in df_products_main_cat_onehot.columns if col.startswith('main_cat_')]
df_products_main_cat_onehot[one_hot_columns] = df_products_main_cat_onehot[one_hot_columns].astype(int)

df_products_main_cat_onehot.head(2)

39


Unnamed: 0,description,title,brand,feature,rank,productPublishedDate,asin,main_cat_,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Devices,main_cat_Amazon Home,main_cat_Appliances,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_GPS & Navigation,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Health &amp; Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Portable Audio &amp; Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games
2,earson myhistorylab online access code america...,pearson myhistorylab online access code americ...,pearson myhistorylab,pearson myhistorylab online access code americ...,1,"June 21, 2012",136039847,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,orduroy bear go launderette lisa overhears mot...,pocket corduroy,ingram book distributor,9780140503524,422,"September 14, 2006",140503528,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


## Managing dates for products

In [29]:
df_products_dates_managed = df_products_main_cat_onehot.copy()

#get the most recent date - first need to convert the date to datetime, it is in this format: October 14, 2011 or Unknown
def exponential_decay(time_diff, decay_rate=0.0001):
    return np.exp(-decay_rate * time_diff)

dates = df_products_dates_managed['productPublishedDate'].apply(lambda x: pd.to_datetime(x, errors='coerce'))
most_recent_date = dates.max()
most_outdated_date = dates.min()
print("most recent date:", most_recent_date)
print("most oudated date:", most_outdated_date)

#create a new column where if the date is Unknown, it will be the most outdated date, otherwise it will be the difference between the most recent date and the date of the product
df_products_dates_managed['dayDifferenceProduct'] = dates.apply(lambda x: (most_recent_date - x).days if pd.notnull(x) else (most_recent_date - most_outdated_date).days)

#apply a function to the day difference so a bigger day difference gives a 
# low value and a smaller day difference gives a high value
df_products_dates_managed['dayDifferenceProduct'] = df_products_dates_managed['dayDifferenceProduct'].apply(exponential_decay)
df_products_dates_managed.head(2)

most recent date: 2019-01-14 00:00:00
most oudated date: 1941-12-07 00:00:00


Unnamed: 0,description,title,brand,feature,rank,productPublishedDate,asin,main_cat_,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Devices,main_cat_Amazon Home,main_cat_Appliances,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_GPS & Navigation,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Health &amp; Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Portable Audio &amp; Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct
2,earson myhistorylab online access code america...,pearson myhistorylab online access code americ...,pearson myhistorylab,pearson myhistorylab online access code americ...,1,"June 21, 2012",136039847,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.786785
3,orduroy bear go launderette lisa overhears mot...,pocket corduroy,ingram book distributor,9780140503524,422,"September 14, 2006",140503528,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.637309


## Managing dates for reviews

In [30]:
df_reviews_dates_managed = df_reviews_text_processed.copy()

rev_dates = df_reviews_cleaned['reviewTime'].apply(lambda x: pd.to_datetime(x, errors='coerce'))


rev_most_recent_date = rev_dates.max()
rev_most_outdated_date = rev_dates.min()
print("reviews: most recent date:", rev_most_recent_date)
print("reviews: most oudated date:", rev_most_outdated_date)

df_reviews_dates_managed['dayDifferenceReview'] = rev_dates.apply(lambda x: (rev_most_recent_date - x).days if pd.notnull(x) else (rev_most_recent_date - rev_most_outdated_date).days)
df_reviews_dates_managed['dayDifferenceReview'] = df_reviews_dates_managed['dayDifferenceReview'].apply(exponential_decay)
df_reviews_dates_managed.head(2)

reviews: most recent date: 2018-10-02 00:00:00
reviews: most oudated date: 1999-10-11 00:00:00


Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,dayDifferenceReview
0,4.0,"November 07, 2017",A2NIJTYWADLK57,140503528,cotton clay,kid like story really wanted board book one sm...,good story small size book though,0.967635
1,4.0,"March 07, 2017",A2827D8EEURMP4,140503528,emankcin,bought used came great condition almost like n...,good,0.944216


## Consistency checks of the samples of products and reviews data after the preprocessing

In [31]:
utils.print_shapes(df_reviews_dates_managed, df_products_dates_managed)

Reviews df shape: (746577, 8)
Products df shape: (233469, 47)


In [32]:
# remove duplicates
df_products_final = df_products_dates_managed.copy().drop_duplicates(subset=['asin'])
df_reviews_final = df_reviews_dates_managed.copy().drop_duplicates(subset=['reviewerID', 'asin', 'reviewText', 'summary'])

# remove reviews of products that are not in the final products
df_reviews_final = df_reviews_final[df_reviews_final['asin'].isin(df_products_final['asin'])]


# # Last check to make sure users has at least "max_reviews_count" (+ drop duplicates)
# user_reviews_count = df_reviews_final['reviewerID'].value_counts()
# selected_users = user_reviews_count[user_reviews_count >= min_reviews_count]
# df_reviews_final = df_reviews_final[df_reviews_final['reviewerID'].isin(selected_users.keys())]

assert (df_reviews_final['asin'].isin(df_products_final['asin'])).all()
utils.print_shape(df_reviews_final, 'reviews')
utils.print_shape(df_products_final, 'products')

reviews shape: (661471, 8)
products shape: (233469, 47)


## Merge products and reviews dataset

the following dataset (`merged_df`) will be the actual data used in NCF. This since, other than the reviews themselves, It just contains the related products information with which the users interacted already.

In [33]:
df_reviews_final: pd.DataFrame = df_reviews_final.copy().drop_duplicates(subset=['reviewerID', 'asin', 'reviewText', 'summary'])
df_products_final: pd.DataFrame = df_products_dates_managed.copy().drop_duplicates(subset=['asin'])

merged_df = pd.merge(df_reviews_final, df_products_final, on='asin', how='inner')

## Create a separate dataset for unreviewed products (items with no interaction)

In [34]:
unreviewed_products_df = df_products_final[~df_products_final['asin'].isin(df_reviews_final['asin'])]

## Save the prepared data

In [35]:
merged_df.head(2)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,dayDifferenceReview,description,title,brand,feature,rank,productPublishedDate,main_cat_,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Devices,main_cat_Amazon Home,main_cat_Appliances,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_GPS & Navigation,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Health &amp; Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Portable Audio &amp; Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct
0,4.0,"November 07, 2017",A2NIJTYWADLK57,140503528,cotton clay,kid like story really wanted board book one sm...,good story small size book though,0.967635,orduroy bear go launderette lisa overhears mot...,pocket corduroy,ingram book distributor,9780140503524,422,"September 14, 2006",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.637309
1,4.0,"March 07, 2017",A2827D8EEURMP4,140503528,emankcin,bought used came great condition almost like n...,good,0.944216,orduroy bear go launderette lisa overhears mot...,pocket corduroy,ingram book distributor,9780140503524,422,"September 14, 2006",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.637309


In [36]:
unreviewed_products_df.head(2)

Unnamed: 0,description,title,brand,feature,rank,productPublishedDate,asin,main_cat_,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Devices,main_cat_Amazon Home,main_cat_Appliances,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_GPS & Navigation,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Health &amp; Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Portable Audio &amp; Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct
2,earson myhistorylab online access code america...,pearson myhistorylab online access code americ...,pearson myhistorylab,pearson myhistorylab online access code americ...,1,"June 21, 2012",136039847,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.786785
5,ood helper help record reading progress br mak...,ekloen mixed design antiqued bronze colour ele...,ekloen,bookmark also art inspirational gift occasion ...,43,"November 08, 2015",245109919,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.890208


In [37]:
print(f'Number of unique products: {df_products_final["asin"].nunique()}')
print(f'Number of unique users: {df_reviews_final["reviewerID"].nunique()}')
print(f'Number of unique reviewed products: {merged_df["asin"].nunique()}')
print(f'Number of unique unreviewed products: {unreviewed_products_df["asin"].nunique()}')

Number of unique products: 233469
Number of unique users: 101359
Number of unique reviewed products: 23884
Number of unique unreviewed products: 209585


In [38]:
utils.print_shapes(df_reviews_final, df_products_final)

Reviews df shape: (661471, 8)
Products df shape: (233469, 47)


In [39]:
utils.print_shapes(df_reviews_final.drop_duplicates(subset=['reviewerID', 'asin', 'reviewText', 'summary']), df_products_final.drop_duplicates(subset=['asin']))

Reviews df shape: (661471, 8)
Products df shape: (233469, 47)


In [40]:
merged_df.shape

(661471, 54)

In [41]:
utils.save_data(merged_df, 'data/merged_data_processed.csv')
utils.save_data(unreviewed_products_df, 'data/unreviewed_products_processed.csv')

In [42]:
utils.save_data(df_reviews_final, 'data/reviews_sampled_processed.csv')
utils.save_data(df_products_final, 'data/products_sampled_processed.csv')