In [5]:
import pandas as pd

1. Ingesting data

In [6]:
df = pd.read_csv('/Users/mzheng/stat3494w-paper/data/amazon_reviews.txt', delimiter = '\t')

In [7]:
len(df)

21000

In [8]:
df.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [9]:
# re-encode 'LABEL' values
df.loc[df["LABEL"] == "__label1__", "LABEL"] = '1' # these are fake reviews
df.loc[df["LABEL"] == "__label2__", "LABEL"] = '0' # these are real reviews

In [10]:
# get information about data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   DOC_ID             21000 non-null  int64 
 1   LABEL              21000 non-null  object
 2   RATING             21000 non-null  int64 
 3   VERIFIED_PURCHASE  21000 non-null  object
 4   PRODUCT_CATEGORY   21000 non-null  object
 5   PRODUCT_ID         21000 non-null  object
 6   PRODUCT_TITLE      21000 non-null  object
 7   REVIEW_TITLE       21000 non-null  object
 8   REVIEW_TEXT        21000 non-null  object
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


2. Data Exploration

In [11]:
products_by_labels = df.groupby(df["LABEL"]).PRODUCT_CATEGORY.value_counts()
products_by_labels

LABEL  PRODUCT_CATEGORY      
0      Apparel                   350
       Automotive                350
       Baby                      350
       Beauty                    350
       Books                     350
       Camera                    350
       Electronics               350
       Furniture                 350
       Grocery                   350
       Health & Personal Care    350
       Home                      350
       Home Entertainment        350
       Home Improvement          350
       Jewelry                   350
       Kitchen                   350
       Lawn and Garden           350
       Luggage                   350
       Musical Instruments       350
       Office Products           350
       Outdoors                  350
       PC                        350
       Pet Products              350
       Shoes                     350
       Sports                    350
       Tools                     350
       Toys                      350
       V

In [12]:
ratings_by_labels = df.groupby(df["LABEL"]).RATING.value_counts()
ratings_by_labels

LABEL  RATING
0      5         6151
       4         1974
       3          942
       1          868
       2          565
1      5         6059
       4         1999
       3          926
       1          889
       2          627
Name: RATING, dtype: int64

In [13]:
products_by_ratings = df.groupby(df["LABEL"]).VERIFIED_PURCHASE.value_counts()
products_by_ratings

LABEL  VERIFIED_PURCHASE
0      Y                    8821
       N                    1679
1      N                    7623
       Y                    2877
Name: VERIFIED_PURCHASE, dtype: int64

2. Create new features

In [14]:
# create new feature: sentiment classifier

# any rating < 3 is a negative review
df.loc[df["RATING"] < 3, "RATING"] = 0

# a review of 3 is neutral and doesn't fall into either category

# any rating > 3 is a positive review
df.loc[df["RATING"] > 3, "RATING"] = 1

In [15]:
# rating 1 is over-represented compared to rating 0 and rating 3 should be ignored
df.RATING.value_counts()

1    16183
0     2949
3     1868
Name: RATING, dtype: int64

In [16]:
# df with all RATING = 1
df1 = df.loc[df['RATING'] == 1]

# want to make RATING = 1 more proportional to RATING = 0 (select 20% of the data with RATING = 1)
df2 = df1.sample(frac=0.2, replace=True)

# df with all RATING = 0
df3 = df.loc[df['RATING'] == 0]

# combining df2 with df3 to make a resulting df that contains proportional amounts of RATING = 1 and RATING = 0
df4 = pd.concat([df2, df3], ignore_index=True)
df4

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,15622,0,1,Y,Pet Products,B00K0E8YEU,"Eagle Pack Natural Dry Small Breed Dog Food, C...",Looked up reviews that showed this dog had les...,Looked up reviews that showed this dog had les...
1,15678,0,1,Y,Lawn and Garden,B00DNIP0N8,Thunder (TM) Sunmax 8-Inch White Air Coolable ...,great big@$/#! hood,I am very impressed with the hood. High qualit...
2,6479,1,1,Y,Grocery,B00JH9XSZE,"Gourmet Gold Ganoderma Latte Coffee, The Herb ...",Really tasty and relaxing.,I've always loved coffee but hated any side ef...
3,16289,0,1,Y,Wireless,B007M7E6ZE,Aimo Wireless SAMU380PCLP005 Rubber Essentials...,Pink phone cover,I love the case for my Samsung phone. I droppe...
4,20038,0,1,Y,Luggage,B001CZT4L8,"Travelon Set of 2 Inflatable Hangers, White, O...",No Pointy Shoulders!,Took 4 on a long trip and used them every few ...
...,...,...,...,...,...,...,...,...,...
6181,20961,0,0,Y,Shoes,B0069F61NU,MG Collection Lucca Designer Inspired Glamour ...,not same,"the bag is not same as the picture, nothing is..."
6182,20967,0,0,Y,Shoes,B005B9GFUY,Fila Women's Memory Flux Slip Resistant Traini...,"Too man""ish""",These are so manish looking I sent them back. ...
6183,20970,0,0,Y,Shoes,B008MI08ZO,Stride Rite Star Wars Morphing Light-Up Sneake...,JUNK!,We are on our third pair in less than 2 months...
6184,20983,0,0,Y,Shoes,B00IA6US7G,West Blvd Womens LIMA MOCCASIN Boots 3-Layer F...,Good thing they are only for one outfit to hav...,These run I would say two sizes smaller than w...


In [17]:
# processing data to be split into test and training sets by isolating the columns of interest
raw_data = df4[['RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY', 'REVIEW_TEXT', 'LABEL']]
raw_data = [tuple(x) for x in raw_data.values]

3. Data pre-processing

In [18]:
# get a list of common english words ('like', 'and', 'I')
import nltk
nltk.download('stopwords')
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mzheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# process review text for model
import string

# returns a table mapping each punctuation symbol to None; for use with translate() later
table = str.maketrans({key: None for key in string.punctuation})

def processor(text):
    # converts a word to its base form
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # stores bigrams (a pair of consecutive words)
    filtered_tokens = []

    # contains all the base words (converted from their original words in the review text)
    lemmatized_tokens = []

    # set of stop words (commonly used english words)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # returns a string where each character is mapped to its corresponding character in the translation table
    text = text.translate(table)

    # iterate through each word in review text
    for word in text.split(" "):
        if word not in stop_words: 
            # then, the word should be converted to its base form
            lemmatized_tokens.append(lemmatizer.lemmatize(word.lower()))

        # append the bigrams of that base word to filtered_tokens
        filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens

    return filtered_tokens

In [20]:
# create feature vectors (must use 'processsor' on review_text before inputting)
# feature_vector numerically quantifies the contents of features so that ML models can use it to make predictions

# NOTE: each feature to be used in the 'classifier' must be inputted into the feature vector

def feature_vector(rating, verified_purchase, product_category, review_text):
    # dictionary of features
    feature_dict = dict()

    # rating feature
    feature_dict["R"] = rating

    # verified_purchase feature
    if verified_purchase == "Y":
        feature_dict["VP"] = 1
    else:
        feature_dict["VP"] = 0

    # product_category feature
    if product_category not in feature_dict:
        feature_dict[product_category] = 1
    else:
        feature_dict[product_category] =+ 1

    # review_text feature
    for text in review_text:
        if text not in feature_dict:
            feature_dict[text] = 1
        else:
            feature_dict[text] =+ 1
    
    return feature_dict

In [21]:
nltk.download('wordnet')
nltk.download('omw-1.4')

def train_test_split(raw_data, p):
    # stores training set
    training_set = []

    # stores testing set
    testing_set = []

    # number of rows in raw_data
    all_raw_data = len(raw_data)

    # number of rows in half of raw_data
    half_raw_data = int(len(raw_data)/2)

    # extra rows to split by
    randomized_index = int((p * all_raw_data)/2)

    # suppose you have 100 data values and p=0.8
    # this for-loop selects 0-39 and 50-89 as training data
    for (rating, verified_purchase, product_category, review_text, label) in raw_data[:randomized_index] + raw_data[half_raw_data:half_raw_data + randomized_index]:
        training_set.append((feature_vector(rating, verified_purchase, product_category, processor(review_text)), label))

    # this for-loop selects 40-49 and 90-100 as testing data
    for (rating, verified_purchase, product_category, review_text, label) in raw_data[randomized_index:half_raw_data] + raw_data[half_raw_data + randomized_index:]:
        testing_set.append((feature_vector(rating, verified_purchase, product_category, processor(review_text)), label))
    
    return training_set, testing_set

[nltk_data] Downloading package wordnet to /Users/mzheng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mzheng/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
# split raw data (0.8 is training, 0.2 is testing)
training_set, testing_set = train_test_split(raw_data, 0.8)

4. Model building

In [23]:
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline

In [47]:
# the classifier
# training set is a list of (feature vector, label)
def classifier(training_set):
    # pipeline containing the Linear Support Vector Classifier (SVM) from sklearn
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(training_set) # trains the SVM on training_set

In [48]:
# predicts labels on testing_set using a trained classifier
def predict(testing_set, classifier):
    # for each feature vector in testing_set, map the corresponding prediction made by the classifer to a list and return it
    return classifier.classify_many(map(lambda x: x[0], testing_set))

In [49]:
# train the classifier
classifier = classifier(training_set)

# make predictions using the trained classifier
predictions = predict(testing_set, classifier)

# get true labels of test data
# for each 'label' in testing_set, map it to a list
true_labels = list(map(lambda x: x[1], testing_set))



In [50]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# accuracy of classifier on test data
accuracy = accuracy_score(true_labels, predictions)

# precision, recall, and fscore on test data
precision, recall, fscore, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F-score: ", fscore)

Accuracy:  0.7600969305331179
Precision:  0.7101330732251098
Recall:  0.7632348833961737
F-score:  0.720296221915401


In [51]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(true_labels, predictions)

confusion_matrix

array([[704, 226],
       [ 71, 237]])