# SI 671 - Amazon Review Fraud Detection Project

# Notebook 1: Preliminary Analysis
Notebook Intent: Examination of the dataset's native characteristics, prior to major efforts in feature expansion. This is to help figure out what the state of class and categorical balances look like and, with only native features, see if there are any significant native differentiators between classes.

## Setup

In [1]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# df = pd.read_csv("data/amazon_reviews.txt", sep="\t")
df = pd.read_csv('drive/Shared drives/SI671 Project [Data Mining]/data/amazon_reviews.txt', sep='\t') 
print(f"{df.shape[0]} records")

#### __label1__ means fraudulent while __label2__ means legitimate
if('LABEL' in df.columns):
    df.LABEL = df.LABEL.replace(to_replace='__label1__', value=0).replace('__label2__', value=1)
    df = df.rename(columns={'LABEL':'LEGIT'})

df.head(5)

21000 records


Unnamed: 0,DOC_ID,LEGIT,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,0,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,0,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,0,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,0,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,0,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


## Dataset Structural Observations

In [None]:
print("We have a 50-50 split between legit and fake reviews")
print(f".... {df.LEGIT.value_counts()[0]} Fraudulent and {df.LEGIT.value_counts()[1]} Legit\n")

print("No missing features in any records // dropna() doesn't change shape:")
print(f"{df.shape}, {df.dropna().shape}\n")

print(f"We have {df.PRODUCT_CATEGORY.describe()['unique']} categories, each with 700 items:")
print(df.PRODUCT_CATEGORY.value_counts()[:5],'\n')

print(f"When we subdivide category by legit-illegit counts, we can see each is split 50-50:")
print(df.groupby('PRODUCT_CATEGORY').LEGIT.value_counts()[:6],'\n')

print("We also have a \"Verified Purchase\" field. More records in the dataset are verified than not.")
print("Being verified does not mean non-fraudulent:")

VerifLegitCounts = df.groupby('VERIFIED_PURCHASE').LEGIT.value_counts()
print(f"\t{VerifLegitCounts.Y[0]} Verified and Fraudulent\n\t{VerifLegitCounts.Y[1]} Verified Legitimate")
print(f"\t{VerifLegitCounts.N[0]} Unverified and Fraudulent\n\t{VerifLegitCounts.N[1]} Unverified Legitimate")

# Verified Legitimacy-Fraudulancy Ratio Varies by Product Category
# df.groupby(['LEGIT','PRODUCT_CATEGORY']).VERIFIED_PURCHASE.value_counts()

We have a 50-50 split between legit and fake reviews
.... 10500 Fraudulent and 10500 Legit

No missing features in any records // dropna() doesn't change shape:
(21000, 9), (21000, 9)

We have 30 categories, each with 700 items:
Sports                    700
Toys                      700
Apparel                   700
Health & Personal Care    700
Jewelry                   700
Name: PRODUCT_CATEGORY, dtype: int64 

When we subdivide category by legit-illegit counts, we can see each is split 50-50:
PRODUCT_CATEGORY  LEGIT
Apparel           0        350
                  1        350
Automotive        0        350
                  1        350
Baby              0        350
                  1        350
Name: LEGIT, dtype: int64 

We also have a "Verified Purchase" field. More records in the dataset are verified than not.
Being verified does not mean non-fraudulent:
	2877 Verified and Fraudulent
	8821 Verified Legitimate
	7623 Unverified and Fraudulent
	1679 Unverified Legitimate


In [None]:
print("Mean, STDev, and quartiles for reviews are seemingly relatively non-significantly differentiated")
print(df.groupby('LEGIT').RATING.describe(),'\n')
print(df.groupby('LEGIT').RATING.value_counts().unstack())

Mean, STDev, and quartiles for reviews are seemingly relatively non-significantly differentiated
         count      mean       std  min  25%  50%  75%  max
LEGIT                                                      
0      10500.0  4.115429  1.285663  1.0  4.0  5.0  5.0  5.0
1      10500.0  4.140476  1.270898  1.0  4.0  5.0  5.0  5.0 

RATING    1    2    3     4     5
LEGIT                            
0       889  627  926  1999  6059
1       868  565  942  1974  6151


***
## Comparing Stemmer and Lemmer for POS Preservation

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('punkt')
nltk.download('wordnet') 
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger') #POS Tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
 #### Comparison :: POS Tagging under vanilla, stemmed, and lemmed string variants
sample_terms = ['A','burnt','toasted','enchillada','I','found','under',"nick's",'car',
              'while','i','was','running','with','you','and','him','at','his','place',
              'after','my','dinner']

sample_str = ' '.join(sample_terms)
stemmed_str = PorterStemmer().stem(sample_str)
lemmed_str = WordNetLemmatizer().lemmatize(sample_str)

print(sample_str)
print(stemmed_str)
print(lemmed_str)
print()

#### Takeaway: stemmer seems to cause self-referential "I" to register improperly as NN, noun, instead of preposition
print([i[:3] for i in sample_str.split()])
print([i[1] for i in nltk.pos_tag(sample_str.split())])
print([i[1] for i in nltk.pos_tag(stemmed_str.split())])
print([i[1] for i in nltk.pos_tag(lemmed_str.split())])

A burnt toasted enchillada I found under nick's car while i was running with you and him at his place after my dinner
a burnt toasted enchillada i found under nick's car while i was running with you and him at his place after my dinn
A burnt toasted enchillada I found under nick's car while i was running with you and him at his place after my dinner

['A', 'bur', 'toa', 'enc', 'I', 'fou', 'und', 'nic', 'car', 'whi', 'i', 'was', 'run', 'wit', 'you', 'and', 'him', 'at', 'his', 'pla', 'aft', 'my', 'din']
['DT', 'NN', 'VBD', 'NN', 'PRP', 'VBD', 'IN', 'JJ', 'NN', 'IN', 'NN', 'VBD', 'VBG', 'IN', 'PRP', 'CC', 'PRP', 'IN', 'PRP$', 'NN', 'IN', 'PRP$', 'NN']
['DT', 'NN', 'VBD', 'NN', 'NN', 'VBD', 'IN', 'JJ', 'NN', 'IN', 'NN', 'VBD', 'VBG', 'IN', 'PRP', 'CC', 'PRP', 'IN', 'PRP$', 'NN', 'IN', 'PRP$', 'NN']
['DT', 'NN', 'VBD', 'NN', 'PRP', 'VBD', 'IN', 'JJ', 'NN', 'IN', 'NN', 'VBD', 'VBG', 'IN', 'PRP', 'CC', 'PRP', 'IN', 'PRP$', 'NN', 'IN', 'PRP$', 'NN']
