In [1]:
import os
import re
import datetime
import time
from itertools import islice
from operator import itemgetter

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, ShuffleSplit

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn import svm

from imblearn.over_sampling import SMOTE

import pickle

import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /Users/gotit/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/gotit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def get_run_time(t1, t2):
    diff = t2 - t1
    mins = int(diff / 60)
    secs = round(diff % 60, 3)
    return str(mins) + " mins and " + str(secs) + " seconds"

def clean_str(sentence):
    # Remove HTML
    review_text = BeautifulSoup(sentence, features="html.parser").text
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z\s\s+]", "", review_text).strip()
    return letters_only

def convert_plain_to_csv(text_file, csv_file):
    t0 = time.time()
    with open(text_file, "r") as f1, open(csv_file, "w") as f2:
        i = 0
        f2.write("productId,score,summary,text\n")
        while True:
            next_n_lines = list(islice(f1, 9))  # read 9 line
            if not next_n_lines:
                break

            output_line = ""
            for line in next_n_lines:
                if "product/productId:" in line:
                    output_line += line.split(":")[1].strip() + ","
                elif "review/score:" in line:
                    output_line += line.split(":")[1].strip() + ","
                elif "review/summary:" in line:
                    summary = clean_str(line.split(":")[1].strip()) + ","
                    output_line += summary
                elif "review/text:" in line:
                    text = clean_str(line.split(":")[1].strip()) + "\n"
                    output_line += text

            f2.write(output_line)

            # print status
            i += 1
            if i % 10000 == 0:
                print(i, "reviews converted...")

    print(datetime.datetime.now(), "- Converting completed in", get_run_time(t0, time.time()))

def get_data(file_name):
    if os.path.exists(file_name):
        print("-- " + file_name + " found locally")
        df = pd.read_csv(file_name)
    return df

def review_to_words(review):
    # 1. Convert to lower case, split into individual words
    words = review.lower().split()

    # 2. Get english stop words
    stops = set(stopwords.words("english"))
    
    # 3. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    
    return " ".join(meaningful_words)


def cleaning_data(dataset, file_name):
    t0 = time.time()
    num_reviews = dataset["text"].size
    clean_train_reviews = []

    # Loop over each review
    for i in range(0, num_reviews):
        # If the index is evenly divisible by 1000, print a message
        if (i + 1) % 10000 == 0:
            print("Review", i + 1, "of", num_reviews, "\n")

        productId = str(dataset["productId"][i])
        score = str(dataset["score"][i])
        summary = str(dataset["summary"][i])
        text = review_to_words(str(dataset["text"][i]))

        clean_train_reviews.append(productId + "," + score + "," + summary + "," + text + "\n")

    print("Writing clean train reviews...")
    with open(file_name, "w") as f:
        f.write("productId,score,summary,text\n")
        for review in clean_train_reviews:
            f.write("%s\n" % review)

    
    print(datetime.datetime.now(), "- Write file completed in", get_run_time(t0, time.time()))

In [6]:
"""
Pre-processing
"""
convert_plain_to_csv("finefoods.txt", "foods.csv")

# Reading the Data
train = get_data("foods.csv")
print("Data dimensions:", train.shape)
print("List features:", train.columns.values)

cleaning_data(train, "clean_train_reviews.csv")

10000 reviews converted...
20000 reviews converted...
30000 reviews converted...
40000 reviews converted...
50000 reviews converted...
60000 reviews converted...
70000 reviews converted...
80000 reviews converted...
90000 reviews converted...
100000 reviews converted...
110000 reviews converted...
120000 reviews converted...
130000 reviews converted...
140000 reviews converted...




150000 reviews converted...
160000 reviews converted...
170000 reviews converted...
180000 reviews converted...
190000 reviews converted...
200000 reviews converted...
210000 reviews converted...
220000 reviews converted...
230000 reviews converted...
240000 reviews converted...
250000 reviews converted...
260000 reviews converted...
270000 reviews converted...
280000 reviews converted...
290000 reviews converted...
300000 reviews converted...
310000 reviews converted...
320000 reviews converted...
330000 reviews converted...
340000 reviews converted...
350000 reviews converted...
360000 reviews converted...
370000 reviews converted...
380000 reviews converted...
390000 reviews converted...
400000 reviews converted...
410000 reviews converted...
420000 reviews converted...
430000 reviews converted...
440000 reviews converted...
450000 reviews converted...
460000 reviews converted...
470000 reviews converted...
480000 reviews converted...
490000 reviews converted...
500000 reviews conve

In [8]:
# read data from file
reviews = pd.read_csv("clean_train_reviews.csv", nrows=20000)
# ignore all 3* reviews
reviews = reviews[reviews["score"] != 3]
# positive sentiment = 4* or 5* reviews (sentriment = True)
reviews["sentiment"] = reviews["score"] >= 4

# X = reviews['text'].values.astype('U')
X = reviews['text']
y = reviews['sentiment']

In [9]:
naive = MultinomialNB()
svm_clf = svm.SVC(kernel='linear', C=1)

In [10]:
ss = ShuffleSplit(n_splits=10, test_size=0.2)
sm = SMOTE()
accs = []
f1s = []
cms = []
pres = []
recs = []
vect = CountVectorizer(analyzer="word",
                            preprocessor=None,
                            stop_words=None,
                            max_features=1000)

for train_index, test_index in ss.split(X):

    
    
    X_train, X_test = X.iloc[train_index].values.astype('U'), X.iloc[test_index].values.astype('U')
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    Encoder = LabelEncoder() 
    y_train = Encoder.fit_transform (y_train) 
    y_test = Encoder.fit_transform (y_test)
    
    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    
    # Oversample
    X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)
    
    # Fit Naive Bayes on the vectorized X with y train labels, 
    # then predict new y labels using X test
    naive.fit(X_train_res, y_train_res)
    y_pred = naive.predict(X_test_vect)
    
    # Determine test set accuracy and f1 score on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    pres.append(precision_score(y_test, y_pred))
    recs.append(recall_score(y_test, y_pred))

print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(sum(pres) / len(pres) * 100))
print("\nAverage Recall score across folds: {:.2f}%".format(sum(recs) / len(recs) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))


Average accuracy across folds: 84.73%

Average F1 score across folds: 90.58%

Average Precision score across folds: 93.87%

Average Recall score across folds: 87.52%

Average Confusion Matrix across folds: 
 [[ 413.9  176. ]
 [ 384.5 2696.6]]


In [11]:
ss = ShuffleSplit(n_splits=10, test_size=0.2)
sm = SMOTE()
accs = []
f1s = []
cms = []
pres = []
recs = []
vect = TfidfVectorizer(analyzer="word",
                                preprocessor=None,
                                stop_words=None,
                                max_features=1000)

for train_index, test_index in ss.split(X):
    
    X_train, X_test = X.iloc[train_index].values.astype('U'), X.iloc[test_index].values.astype('U')
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    Encoder = LabelEncoder() 
    y_train = Encoder.fit_transform (y_train) 
    y_test = Encoder.fit_transform (y_test)
    
    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    
    # Oversample
    X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)
    
    # Fit Naive Bayes on the vectorized X with y train labels, 
    # then predict new y labels using X test
    naive.fit(X_train_res, y_train_res)
    y_pred = naive.predict(X_test_vect)
    
    # Determine test set accuracy and f1 score on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    pres.append(precision_score(y_test, y_pred))
    recs.append(recall_score(y_test, y_pred))

print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(sum(pres) / len(pres) * 100))
print("\nAverage Recall score across folds: {:.2f}%".format(sum(recs) / len(recs) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))


Average accuracy across folds: 83.76%

Average F1 score across folds: 89.69%

Average Precision score across folds: 96.00%

Average Recall score across folds: 84.17%

Average Confusion Matrix across folds: 
 [[ 480.   108.1]
 [ 488.1 2594.8]]


In [12]:
ss = ShuffleSplit(n_splits=10, test_size=0.2)
sm = SMOTE()
accs = []
f1s = []
cms = []
pres = []
recs = []
vect = CountVectorizer(analyzer="word",
                            preprocessor=None,
                            stop_words=None,
                            max_features=1000)
                            
for train_index, test_index in ss.split(X):

    
    
    X_train, X_test = X.iloc[train_index].values.astype('U'), X.iloc[test_index].values.astype('U')
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    
    # Oversample
    X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)
    
    # Fit Naive Bayes on the vectorized X with y train labels, 
    # then predict new y labels using X test
    svm_clf.fit(X_train_res, y_train_res)
    y_pred = svm_clf.predict(X_test_vect)
    
    # Determine test set accuracy and f1 score on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    pres.append(precision_score(y_test, y_pred))
    recs.append(recall_score(y_test, y_pred))

print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(sum(pres) / len(pres) * 100))
print("\nAverage Recall score across folds: {:.2f}%".format(sum(recs) / len(recs) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))


Average accuracy across folds: 85.57%

Average F1 score across folds: 91.19%

Average Precision score across folds: 93.44%

Average Recall score across folds: 89.04%

Average Confusion Matrix across folds: 
 [[ 401.1  192.4]
 [ 337.3 2740.2]]


In [13]:
ss = ShuffleSplit(n_splits=10, test_size=0.2)
sm = SMOTE()
accs = []
f1s = []
cms = []
pres = []
recs = []
vect = TfidfVectorizer(analyzer="word",
                                preprocessor=None,
                                stop_words=None,
                                max_features=1000)

for train_index, test_index in ss.split(X):
    
    X_train, X_test = X.iloc[train_index].values.astype('U'), X.iloc[test_index].values.astype('U')
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    Encoder = LabelEncoder() 
    y_train = Encoder.fit_transform (y_train) 
    y_test = Encoder.fit_transform (y_test)
    
    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    
    # Oversample
    X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)
    
    # Fit Naive Bayes on the vectorized X with y train labels, 
    # then predict new y labels using X test
    svm_clf.fit(X_train_res, y_train_res)
    y_pred = svm_clf.predict(X_test_vect)
    
    # Determine test set accuracy and f1 score on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    pres.append(precision_score(y_test, y_pred))
    recs.append(recall_score(y_test, y_pred))

print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(sum(pres) / len(pres) * 100))
print("\nAverage Recall score across folds: {:.2f}%".format(sum(recs) / len(recs) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))


Average accuracy across folds: 86.04%

Average F1 score across folds: 91.40%

Average Precision score across folds: 94.60%

Average Recall score across folds: 88.40%

Average Confusion Matrix across folds: 
 [[ 435.1  155.3]
 [ 357.2 2723.4]]
