In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
# Amazon review corpus in which every review is labelled as fake or real
# (in the dataset the classes are encoded as label1 = fake, label2 = real).
# Source: https://medium.com/@lievgarcia/deception-on-amazon-c1e30d977cfd

# The file is tab-separated despite its .txt extension.
df = pd.read_csv(
    'amazon_reviews.txt',
    delimiter="\t",
)

# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   DOC_ID             21000 non-null  int64 
 1   LABEL              21000 non-null  object
 2   RATING             21000 non-null  int64 
 3   VERIFIED_PURCHASE  21000 non-null  object
 4   PRODUCT_CATEGORY   21000 non-null  object
 5   PRODUCT_ID         21000 non-null  object
 6   PRODUCT_TITLE      21000 non-null  object
 7   REVIEW_TITLE       21000 non-null  object
 8   REVIEW_TEXT        21000 non-null  object
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


In [3]:
# Map the binary LABEL column to numeric values: 0 (fake review) and 1 (real review).
# sort=True assigns the codes by the sorted order of the label strings rather than
# by the order in which the labels first appear in the file, so the encoding stays
# the same even if the rows are shuffled or the file starts with a real review.
df['target'] = pd.factorize(df['LABEL'], sort=True)[0]

df.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,target
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",0
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,0
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,0
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,0
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,0


In [4]:
# Count the reviews in each class by summing a boolean mask over `target`
# (0 = fake, 1 = real); int() keeps the counts as plain Python integers.
num_fake = int(df['target'].eq(0).sum())
num_real = int(df['target'].eq(1).sum())

# Printed in the order: real count, then fake count.
print(num_real, num_fake)

10500 10500


As seen above, the dataset is evenly balanced across both classes.

## Review Text Preprocessing

In [5]:
# Tokenizer that keeps runs of word characters and discards everything else
# (punctuation, whitespace).
tokenizer = RegexpTokenizer(r'\w+')

# Lowercase each review first, then split it into tokens: one token list per review.
review_tokens = [
    tokenizer.tokenize(lowered_text)
    for lowered_text in (raw_text.lower() for raw_text in df['REVIEW_TEXT'])
]

# Tokens of the first review, as a quick sanity check.
review_tokens[0]

['when',
 'least',
 'you',
 'think',
 'so',
 'this',
 'product',
 'will',
 'save',
 'the',
 'day',
 'just',
 'keep',
 'it',
 'around',
 'just',
 'in',
 'case',
 'you',
 'need',
 'it',
 'for',
 'something']

In [6]:
# Drop English stop words from every tokenized review.
# A set is used so that each membership check is a hash lookup.
stop_words = set(stopwords.words("english"))
content_review_tokens = [
    list(filter(lambda word: word not in stop_words, tokens))
    for tokens in review_tokens
]

# Show the first review before and after filtering, separated by a blank line.
print("Before stop word removal: ", review_tokens[0], end="\n\n")
print("After stop word removal: ", content_review_tokens[0])

Before stop word removal:  ['when', 'least', 'you', 'think', 'so', 'this', 'product', 'will', 'save', 'the', 'day', 'just', 'keep', 'it', 'around', 'just', 'in', 'case', 'you', 'need', 'it', 'for', 'something']

After stop word removal:  ['least', 'think', 'product', 'save', 'day', 'keep', 'around', 'case', 'need', 'something']


## Stemming and Lemmatization

In [11]:
from nltk.stem import SnowballStemmer     # Snowball English stemmer (Porter 2 algorithm)

snowball = SnowballStemmer(language = "english")

# Reduce every remaining token to its stem, review by review.
# NOTE(review): this cell reads and rebinds `content_review_tokens`, so running it
# a second time stems the already-stemmed tokens again.
content_review_tokens = [
    list(map(snowball.stem, review_words))
    for review_words in content_review_tokens
]

# Spot-check a single review.
print(content_review_tokens[374])

['brush', 'soft', 'soon', 'first', 'usag', 'see', 'bristl', 'come', 'worth', 'purcha', 'fall', 'generic', 'product']


In [12]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize each token (default part of speech) and rebind the token lists.
# NOTE(review): the input is the token list already rebound by the stemming step;
# for the sample review printed below, the result is identical to the stemmed
# output. Confirm that stemming followed by lemmatization is intended.
content_review_tokens = [
    [lemmatizer.lemmatize(word) for word in review_words]
    for review_words in content_review_tokens
]

# Spot-check the same review that was printed after stemming.
print(content_review_tokens[374])

['brush', 'soft', 'soon', 'first', 'usag', 'see', 'bristl', 'come', 'worth', 'purcha', 'fall', 'generic', 'product']


## N-Gram Modelling

In [13]:
from nltk import ngrams

# Build, for every review, the list of its 1-grams, 2-grams and 3-grams.
# The generator yields one list-of-lists per n-gram order, unpacked in that order.
review_text_unigrams, review_text_bigrams, review_text_trigrams = (
    [list(ngrams(review_words, order)) for review_words in content_review_tokens]
    for order in (1, 2, 3)
)

# Show the n-grams of one sample review, one n-gram order per line.
print(
    review_text_unigrams[374],
    review_text_bigrams[374],
    review_text_trigrams[374],
    sep="\n",
)

[('brush',), ('soft',), ('soon',), ('first',), ('usag',), ('see',), ('bristl',), ('come',), ('worth',), ('purcha',), ('fall',), ('generic',), ('product',)]
[('brush', 'soft'), ('soft', 'soon'), ('soon', 'first'), ('first', 'usag'), ('usag', 'see'), ('see', 'bristl'), ('bristl', 'come'), ('come', 'worth'), ('worth', 'purcha'), ('purcha', 'fall'), ('fall', 'generic'), ('generic', 'product')]
[('brush', 'soft', 'soon'), ('soft', 'soon', 'first'), ('soon', 'first', 'usag'), ('first', 'usag', 'see'), ('usag', 'see', 'bristl'), ('see', 'bristl', 'come'), ('bristl', 'come', 'worth'), ('come', 'worth', 'purcha'), ('worth', 'purcha', 'fall'), ('purcha', 'fall', 'generic'), ('fall', 'generic', 'product')]
