# NIRS (preprocessing)

In [72]:
# test reading from the csv
import pandas as pd
df_reviews_sampled = pd.read_csv('data/reviews_sampled.csv')
df_products_sampled = pd.read_csv('data/products_sampled.csv')

In [73]:
def save_sampled_data(reviews_df, products_df, reviews_file, products_file):
    reviews_df.to_csv(reviews_file, index=False)
    products_df.to_csv(products_file, index=False)


## Missing values handling

In [74]:
def count_nan_values(df):
    nan_counts = df.isna().sum()
    return nan_counts[nan_counts > 0]

def count_empty_strings(df):
    empty_string_counts = (df == '').sum()
    empty_array_counts = (df == '[]').sum()
    counting = empty_string_counts[empty_string_counts > 0]
    
    return counting[empty_array_counts > 0]


In [75]:
print('Nan values per feature: \n', count_nan_values(df_reviews_sampled))
print('\n Empty values per feature: \n', count_empty_strings(df_reviews_sampled))

Nan values per feature: 
 reviewerName    44
reviewText      25
dtype: int64

 Empty values per feature: 
 Series([], dtype: int64)


In [76]:
df_reviews_sampled = df_reviews_sampled.dropna(subset=['reviewerName'])

We decided to keep the review name nan values, since those will be irrelevant for the neural network training. Moreover, the reviews of users without a name can be meaningful.

In [77]:
print("Total nan values: " , df_reviews_sampled.isna().sum())

Total nan values:  overall            0
reviewerID         0
asin               0
reviewerName       0
reviewText        25
summary            0
unixReviewTime     0
dtype: int64


In [78]:
df_products_sampled.shape

(31546, 14)

In [79]:
print('Nan values per feature: \n', count_nan_values(df_products_sampled))
print('\n Empty values per feature:\n', count_empty_strings(df_products_sampled))

Nan values per feature: 
 title               2
brand             491
main_cat          179
similar_item    19491
date             3937
price           14298
details           737
dtype: int64

 Empty values per feature:
 Series([], dtype: int64)


In [80]:
test = df_products_sampled.copy()

# Replace illegal dates with the oldest possible date format
df_products_sampled['date'] = pd.to_datetime(test['date'], errors='coerce')
df_products_sampled['date'] = df_products_sampled['date'].fillna(pd.Timestamp.min).dt.strftime('%B %d, %Y')


In [81]:
# fill nan values of main category with 'Office Products', which the main in the dataset
df_products_sampled['main_cat'] = df_products_sampled['main_cat'].fillna('Office Products')

In [82]:
df_products_sampled.dropna(subset=['title'])

Unnamed: 0,category,description,title,also_buy,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,details
0,"['Office Products', 'Office & School Supplies'...",['Protect yourself and your RFID card with a S...,Black RFID Blocking ID Badge Holder (Holds 2 C...,"['B005CXZTO2', 'B007XV1MSI', 'B000O9K45I', 'B0...",Specialist ID,"['RFID Blocking 2 Card Holder', 'FIPS 201 Appr...","['>#43,873 in Office Products (See top 100)', ...",[],Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","October 14, 2011",$6.49,B005VSY1VK,{}
1,[],['The Star Wars Moleskine Saga continues in 20...,Moleskine 2015 Star Wars Limited Edition Daily...,[],Moleskine,[],[],[],Office Products,,"December 26, 2013",,8867323296,{}
2,"['Office Products', 'Office & School Supplies'...","['Staples Washable Glue Sticks, Purple, .26 oz...","Staples Washable Glue Sticks, Purple, .26 oz.,...",[],Staples,[],"['>#161,293 in Office Products (See top 100)',...",[],Office Products,,"June 22, 2015",$4.19,B011LAU4R6,{}
3,"['Office Products', 'Office & School Supplies'...","['Exclusive design, classic']",Best Abstract Fiery Floral Design Mouse Pads C...,[],Luxlady?Mousepad,['Material is made of the best plastic manufac...,"['>#143,156 in Computers & Accessories > Compu...",[],Cell Phones & Accessories,,"September 21, 1677",,B00KH94VSG,{}
4,"['Office Products', 'Office & School Supplies'...","['Kitten On Piano Keys Mouse Pad is 8"" x 8"" x ...",3dRose LLC 8 x 8 x 0.25 Inches Kitten on Piano...,[],3dRose,"['Dimensions (in inches): 8 W x 8 H x 0.25 D',...","['>#1,396,217 in Office Products (See top 100)...",[],Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","July 14, 2014",$20.83,B00CX71JNU,{}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31541,"['Office Products', 'Office & School Supplies'...","[""These Desk Drawer Organizers are a modular s...",mDesign Office Supplies Desk Organizer for Sci...,"['B00WRMKZ9A', 'B06X9TFSGL', 'B0118B1T9U', 'B0...",mDesign,"['Perfect for notepads, clips, staples, tape, ...","['>#10,666 in Office Products (See top 100)', ...",[],Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","April 27, 2015",$9.00,B00WRML5SK,{}
31542,"['Office Products', 'Office & School Supplies'...","[""This high-quality case has a double set of c...",Alice's Adventures in Wonderland 1898 Book Cov...,[],Coastal Colors,"['Double clips to hold all your stuff', ""Holds...","['>#485,879 in Office Products (See top 100)',...",['B00P331KSK'],Office Products,,"September 17, 2014",,B00NN6T41Y,{}
31543,"['Office Products', 'Office Electronics', 'Vid...",['DT00471 Lamp with Housing for Hitachi CP-HX2...,CTLAMP DT00471 Replacement Lamp Premium DT0047...,[],CTLAMP,['Type: High Quality Compatible lamp with Hous...,"['>#2,184 in Computers & Accessories > Compute...",['B003BYQO3Y'],Home Audio & Theater,"class=""a-bordered a-horizontal-stripes a-spa...","October 21, 2015",$2.13,B016XZQMMA,{}
31544,"['Office Products', 'Office &amp; School Suppl...","['', '']",Women of Marvel 2010 Wall Calendar,[],DayDream,[],"7,611,877 in Books (",[],Books,,"September 21, 1677",,1423800001,


In [83]:
# from bs4 import BeautifulSoup
# import re

# def extract_similar_items(html):

#   soup = BeautifulSoup(html, 'html.parser')

#   items = []
#   for i in range(5):
#     item_id = soup.select_one(f'#comparison_image{i}')['alt']
#     item_name = soup.select_one(f'#comparison_title{i}').text

#     # Remove special characters
#     item_name = re.sub(r'\W+', ' ', item_name)

#     items.append({'id': item_id, 'name': item_name})

#   return items

# # Sample usage  
# html = df_products_sampled['similar_item'][8]

# items = extract_similar_items(html)
# print(items)

df_products_sampled.dropna(subset=['similar_item'], inplace=True)

## Filter products by main category

In [2]:
df_products_sampled = df_products_sampled[df_products_sampled['main_cat'] == 'Office Products']

df_products_sampled.shape

(22287, 14)

## Text preprocessing (reviewText and summary)

In [3]:
import re
import string
from bs4 import BeautifulSoup
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/lucamodica/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/lucamodica/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self._preprocess(text) for text in X]

    def _preprocess(self, text):
        # Lowercasing
        text = text.lower()
        # Remove accented characters
        text = unidecode(text)
        
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        
        # remove punctuation
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        
        # remove double spaces
        text = re.sub(' +', ' ', text)
        
        # Tokenize text
        words = word_tokenize(text)
        # Remove stopwords and lemmatize
        words = [self.lemmatizer.lemmatize(
            word) for word in words if word not in self.stop_words]
        
        return ' '.join(words)
    
preprocessor = TextPreprocessor()

In [5]:
df_reviews_sampled['summary'] = df_reviews_sampled['summary'].astype(str)
df_reviews_sampled['summary'] = preprocessor.fit_transform(df_reviews_sampled['summary'])
df_reviews_sampled = df_reviews_sampled[df_reviews_sampled['summary'] != '']

df_reviews_sampled['reviewText'] = df_reviews_sampled['reviewText'].dropna().astype(str)
df_reviews_sampled['reviewText'] = df_reviews_sampled['reviewText'].astype(str)
df_reviews_sampled['reviewText'] = preprocessor.fit_transform(df_reviews_sampled['reviewText'])
df_reviews_sampled = df_reviews_sampled[df_reviews_sampled['reviewText'] != '']

In [6]:
df_products_sampled['description'] = df_products_sampled['description'].astype(str)
df_products_sampled['description'] = preprocessor.fit_transform(df_products_sampled['description'])
df_products_sampled = df_products_sampled[df_products_sampled['description'] != '']

## Save the prepared data

In [8]:
df_reviews_sampled.head()

Unnamed: 0,overall,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime
0,5.0,A1HBTW5M7ZZ9PT,310818621,FTLOE,absolutely love organizer ive never one figure...,super good deal,1433203200
1,5.0,A2F0F4NB6BLGVX,310823706,Lee,good bible carrier large print bible afraid wo...,leatherlook bible carrier,1395360000
2,5.0,A23BRQWL8LNB37,439499887,David,kid love peppa reading say,five star,1496361600
3,5.0,A3LGV5JXFSBFTL,439499887,Ryan H,great kid easy use,good,1474243200
4,5.0,A3LGV5JXFSBFTL,439499887,Ryan H,great kid,good,1474243200


In [9]:
df_products_sampled.head()

Unnamed: 0,category,description,title,also_buy,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,details
0,"['Office Products', 'Office & School Supplies'...",protect rfid card skimsafe card holder made ri...,Black RFID Blocking ID Badge Holder (Holds 2 C...,"['B005CXZTO2', 'B007XV1MSI', 'B000O9K45I', 'B0...",Specialist ID,"['RFID Blocking 2 Card Holder', 'FIPS 201 Appr...","['>#43,873 in Office Products (See top 100)', ...",[],Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","October 14, 2011",$6.49,B005VSY1VK,{}
1,[],star war moleskine saga continues daily planne...,Moleskine 2015 Star Wars Limited Edition Daily...,[],Moleskine,[],[],[],Office Products,,"December 26, 2013",,8867323296,{}
2,"['Office Products', 'Office & School Supplies'...",staple washable glue stick purple oz pack,"Staples Washable Glue Sticks, Purple, .26 oz.,...",[],Staples,[],"['>#161,293 in Office Products (See top 100)',...",[],Office Products,,"June 22, 2015",$4.19,B011LAU4R6,{}
4,"['Office Products', 'Office & School Supplies'...",kitten piano key mouse pad x x made heavyduty ...,3dRose LLC 8 x 8 x 0.25 Inches Kitten on Piano...,[],3dRose,"['Dimensions (in inches): 8 W x 8 H x 0.25 D',...","['>#1,396,217 in Office Products (See top 100)...",[],Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","July 14, 2014",$20.83,B00CX71JNU,{}
5,"['Office Products', 'Office & School Supplies'...",vivo next favorite pen ultra gel stick vibrant...,"Vivo Ultra Gel Stick Pens, 0.7mm Fine Tip, Bla...",[],VIVO,"['Ultra smooth gel ink', 'Vivid black &amp; co...","['>#1,646,151 in Office Products (See top 100)...",[],Office Products,,"April 30, 2009",,B002CO43BO,{}


In [16]:
save_sampled_data(df_reviews_sampled, df_products_sampled, 'data/reviews_sampled_processed.csv', 'data/products_sampled_processed.csv')