# NIRS (preprocessing)

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
import gzip
import json
import random
import sklearn
import torch

def seed_everything(seed=42):
    # Seed the random number generator
    random.seed(seed)

    # Seed NumPy
    np.random.seed(seed)

    # Seed scikit-learn
    sklearn.utils.check_random_state(seed)

    # Seed PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    # Set pandas options
    pd.set_option('display.max_columns', None)  # Display all columns in pandas DataFrames
    pd.set_option('display.max_rows', None)  # Display all rows in pandas DataFrames
    pd.set_option('display.width', None)  # Disable column width restriction
    pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping in pandas DataFrames

seed_everything(seed=42)

In [2]:
def parse(path):
  """read the dataset from json"""
  
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l.strip())
    
def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
    
  return pd.DataFrame.from_dict(df, orient='index')

def count_nan_values(df):
    nan_counts = df.isna().sum()
    return nan_counts[nan_counts > 0]

def count_empty_strings(df):
    empty_string_counts = (df == '').sum()
    return empty_string_counts[empty_string_counts > 0]

def print_shapes(reviews_df, products_df):
    print(f"Reviews df shape: {reviews_df.shape}")
    print(f"Products df shape: {products_df.shape}")
    
def save_data(reviews_df, products_df, reviews_file, products_file):
    reviews_df.to_csv(reviews_file, index=False)
    products_df.to_csv(products_file, index=False)

## Reading data (skip if you already have the sampled data you need)

In [3]:
df_reviews = getDF('data/Office_Products_5.json.gz')
df_products = getDF('data/meta_Office_Products.json.gz')

In [4]:
print_shapes(df_reviews, df_products)

Reviews df shape: (800357, 12)
Products df shape: (315458, 19)


In [5]:
df_reviews.head(3)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"11 7, 2017",A2NIJTYWADLK57,140503528,{'Format:': ' Board book'},cotton clay,kids like story BUT while i really wanted a bo...,"good story, small size book though",1510012800,,
1,4.0,True,"03 7, 2017",A2827D8EEURMP4,140503528,{'Format:': ' Hardcover'},emankcin,Bought this used and it came in great conditio...,Good,1488844800,,
2,5.0,True,"06 25, 2016",APB6087F4J09J,140503528,{'Format:': ' Board book'},Starbucks Fan,Every story and book about Corduroy is Fantast...,Best Books for All Children,1466812800,,


In [6]:
df_products.head(3)

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Office Products, Office & School Supplies, Ed...","class=""a-keyvalue prodDetTable"" role=""present...",[Sequential Spelling is based on the classic O...,,Sequential Spelling Level 1 Bundle with Studen...,[],,STL Distributors,[],"[>#439,654 in Office Products (See top 100), >...","[1935943065, 1935943073, B00IJH9Q4M, 002115021...",Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","August 15, 2014",$32.90,12624861,[],[],
1,"[Office Products, Office &amp; School Supplies...",,"[Unusual book, , ]",,"Mathematics, Applications and Concepts, Course...",[],,bailey,[],"3,839,628 in Books (",[],Books,,,$8.62,78652669,[],[],
2,[],"class=""a-keyvalue prodDetTable"" role=""present...",[Pearson MyHistoryLab Online Access Code for A...,,Pearson MyHistoryLab Online Access Code for Am...,[],,Pearson MyHistoryLab,[Pearson MyHistoryLab Online Access Code for A...,"[>#1,925,354 in Office Products (See top 100)]",[],Office Products,,"June 21, 2012",$0.99,136039847,[],[],


## Sampling data (skip if you already have the sampled data you need)

In [7]:
def sample_data(reviews_df, products_df, min_reviews_count=10, max_users=1000, frac_sampled_products=0.1):
    # Sample a subset of users based on the number of reviews they have 
    user_reviews_count = reviews_df['reviewerID'].value_counts()
    selected_users = user_reviews_count[user_reviews_count >= min_reviews_count].index[:max_users]
    reviews_subset: pd.DataFrame = reviews_df[reviews_df['reviewerID'].isin(selected_users)]

    # Sample a subset of products based on popularity or ratings
    # You can use salesRank or overall ratings for this purpose
    sampled_products: pd.DataFrame = products_df.sample(frac=frac_sampled_products, random_state=42)

    return reviews_subset, sampled_products

In [8]:
df_reviews_sampled, df_products_sampled = sample_data(df_reviews, df_products, min_reviews_count=10, frac_sampled_products=0.1)

# save sampled data as checkpoint
save_data(df_reviews_sampled, df_products_sampled, 'data/reviews_sampled.csv', 'data/products_sampled.csv')

In [9]:
print_shapes(df_reviews_sampled, df_products_sampled)

Reviews df shape: (45779, 12)
Products df shape: (31546, 19)


## Missing values handling / removing irrelevant features

In [10]:
# optional checkpoint to skip the initial reading and sampling step
# df_reviews_sampled = pd.read_csv('data/reviews_sampled.csv')
# df_products_sampled = pd.read_csv('data/products_sampled.csv')

df_reviews_prepared = df_reviews_sampled.copy()
df_products_prepared = df_products_sampled.copy()

### Process for reviews dataset

In [11]:
print_shapes(df_reviews_prepared, df_products_prepared)

Reviews df shape: (45779, 12)
Products df shape: (31546, 19)


In [12]:
# remove irrelevant columns
df_reviews_prepared.drop(columns=['verified', 'reviewTime', 'style', 'image', 'vote'], inplace=True)

In [13]:
# remove sample with empty reviewer name and reviwer text, since
# it's a very small percentage of the dataset
df_reviews_prepared.dropna(subset=['reviewerName', 'reviewText'], inplace=True)

In [14]:
print("Total nan values: " , df_reviews_prepared.isna().sum())

Total nan values:  overall           0
reviewerID        0
asin              0
reviewerName      0
reviewText        0
summary           0
unixReviewTime    0
dtype: int64


In [15]:
print_shapes(df_reviews_prepared, df_products_prepared)

Reviews df shape: (45734, 7)
Products df shape: (31546, 19)


### Process for the product dataset

In [16]:
print('Nan values per feature: \n', count_nan_values(df_products_prepared))

Nan values per feature: 
 details    737
dtype: int64


In [17]:
# Replace illegal dates with the oldest possible date format
df_products_prepared['date'] = pd.to_datetime(df_products_prepared['date'], errors='coerce')
df_products_prepared['date'] = df_products_prepared['date'].fillna(pd.Timestamp.min).dt.strftime('%B %d, %Y')

In [18]:
# fill nan values of main category with 'Office Products', which the main in the dataset
df_products_prepared['main_cat'] = df_products_prepared['main_cat'].fillna('Office Products')

In [19]:
# drop the only samples with nan values in title
df_products_prepared.dropna(subset=['title'], inplace=True)

In [20]:
# fill the missing vlaues of brands with 'Unknown'
df_products_prepared['brand'] = df_products_prepared['brand'].fillna('Unknown')

In [21]:
# remove useless / irrelevant columns / columns without meaningful data
# (details is also irrilevant, as most of the samples has empty json)
# we will remove the "category" feature for now as well (the one with the list of categories; 
# we may add it again later if we find a way to use it)
cols_to_drop = ['similar_item', 'price', 'details', 'also_view', 'also_buy', "imageURL", "imageURLHighRes", 'tech1', 'tech2', 'fit', 'category']
df_products_prepared.drop(cols_to_drop, axis=1, inplace=True)

In [22]:
df_products_prepared.head(3)

Unnamed: 0,description,title,brand,feature,rank,main_cat,date,asin
108713,[Protect yourself and your RFID card with a Sk...,Black RFID Blocking ID Badge Holder (Holds 2 C...,Specialist ID,"[RFID Blocking 2 Card Holder, FIPS 201 Approve...","[>#43,873 in Office Products (See top 100), >#...",Office Products,"October 14, 2011",B005VSY1VK
7634,[The Star Wars Moleskine Saga continues in 201...,Moleskine 2015 Star Wars Limited Edition Daily...,Moleskine,[],[],Office Products,"December 26, 2013",8867323296
261709,"[Staples Washable Glue Sticks, Purple, .26 oz....","Staples Washable Glue Sticks, Purple, .26 oz.,...",Staples,[],"[>#161,293 in Office Products (See top 100), >...",Office Products,"June 22, 2015",B011LAU4R6


In [23]:
print_shapes(df_reviews_prepared, df_products_prepared)

Reviews df shape: (45734, 7)
Products df shape: (31546, 8)


## Filter products by main category

In [24]:
df_products_prepared = df_products_prepared[df_products_prepared['main_cat'] == 'Office Products']

df_products_prepared.shape

(22287, 8)

## Text preprocessing

In [25]:
import re
import string
from bs4 import BeautifulSoup
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/lucamodica/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/lucamodica/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
# class TextPreprocessor(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self.stop_words = set(stopwords.words('english'))
#         self.lemmatizer = WordNetLemmatizer()

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return [self._preprocess(text) for text in X]

#     def _preprocess(self, text):
#         # Lowercasing
#         text = text.lower()
#         # Remove accented characters
#         text = unidecode(text)
        
#         # Remove numbers
#         text = re.sub(r'\d+', '', text)
        
#         # remove punctuation
#         text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        
#         # remove double spaces
#         text = re.sub(' +', ' ', text)
        
#         # Tokenize text
#         words = word_tokenize(text)
#         # Remove stopwords and lemmatize
#         words = [self.lemmatizer.lemmatize(
#             word) for word in words if word not in self.stop_words]
        
#         return ' '.join(words)
import text_preprocessing as tp

# function parsing an array of stirng in stirng form into a real list of strings
def parse_string_array(arr):
    if arr == '[]':
        return []
    return arr[1:-1].split(', ')

preprocessor = tp.TextPreprocessor()

In [27]:
df_reviews_prepared['summary'] = df_reviews_prepared['summary'].astype(str)
df_reviews_prepared['summary'] = preprocessor.fit_transform(df_reviews_prepared['summary'])

df_reviews_prepared = df_reviews_prepared[df_reviews_prepared['summary'] != '']

In [28]:
df_reviews_prepared['reviewText'] = df_reviews_prepared['reviewText'].astype(str)
df_reviews_prepared['reviewText'] = preprocessor.fit_transform(df_reviews_prepared['reviewText'])

df_reviews_prepared = df_reviews_prepared[df_reviews_prepared['reviewText'] != '']

In [29]:
df_products_prepared['description'] = df_products_prepared['description'].astype(str)
df_products_prepared['description'] = preprocessor.fit_transform(df_products_prepared['description'])
df_products_prepared['description'] = df_products_prepared['description'].str[1:-1]

df_products_prepared = df_products_prepared[df_products_prepared['description'] != '']

In [30]:
# parse the feature column in lists, and then concatenate them together
# the prerprocessing will follow
df_products_prepared['feature'] = df_products_prepared['feature'].astype(str)
df_products_prepared['feature'] = df_products_prepared['feature'].apply(lambda x: ". ".join(eval(x)).strip())
df_products_prepared['feature'] = preprocessor.fit_transform(df_products_prepared['feature'])

df_products_prepared = df_products_prepared[df_products_prepared['feature'] != '']

## Handling "rank" feature from product dataset

We decided just to keep the rank of the product in the office category (the rank will be -1 if the product doen't have a rank in hte Office category)

In [31]:
def extract_office_product_rank(rank):
    match = re.search(r'>#(\d+)', rank)
    if match and 'Office Products' in rank:
        return int(match.group(1))
    else:
        return -1

df_products_prepared['rank'] = df_products_prepared['rank'].astype(str)
df_products_prepared['rank'] = df_products_prepared['rank'].apply(extract_office_product_rank)

## Save the prepared data

In [32]:
print_shapes(df_reviews_prepared, df_products_prepared)

Reviews df shape: (45562, 7)
Products df shape: (17813, 8)


In [33]:
df_reviews_prepared.head(3)

Unnamed: 0,overall,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime
85,5.0,A1HBTW5M7ZZ9PT,310818621,FTLOE,absolutely love organizer never one figured wa...,super good deal,1433203200
105,5.0,A2F0F4NB6BLGVX,310823706,Lee,good bible carrier large print bible afraid wo...,leatherlook bible carrier,1395360000
164,5.0,A23BRQWL8LNB37,439499887,David,kid love peppa reading say,five star,1496361600


In [34]:
df_products_prepared.head(3)

Unnamed: 0,description,title,brand,feature,rank,main_cat,date,asin
108713,rotect rfid card skimsafe card holder made rig...,Black RFID Blocking ID Badge Holder (Holds 2 C...,Specialist ID,rfid blocking 2 card holder fips 201 approved ...,43,Office Products,"October 14, 2011",B005VSY1VK
156159,itten piano key mouse pad 8 x 8 x 25 made heav...,3dRose LLC 8 x 8 x 0.25 Inches Kitten on Piano...,3dRose,dimension inch 8 w x 8 h x 025 matte finish so...,1,Office Products,"July 14, 2014",B00CX71JNU
68230,ivo next favorite pen ultra gel stick vibrant ...,"Vivo Ultra Gel Stick Pens, 0.7mm Fine Tip, Bla...",VIVO,ultra smooth gel ink vivid black amp color ful...,1,Office Products,"April 30, 2009",B002CO43BO


In [35]:
save_data(df_reviews_prepared, df_products_prepared, 'data/reviews_sampled_processed.csv', 'data/products_sampled_processed.csv')