# Project Part 4.2: Sentiment Analysis with Amazon Reviews Datasets

In [1]:
import os
os.chdir('C:\\Users\\wmj51\\Desktop\\python')
import pandas as pd
import numpy as np

from textblob import TextBlob
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

## vaderSentiment

Because VADER already handles the sentence-level phenomena listed below, I only clean the text of possible noise elements (e.g., @mentions, URLs and www. links). 

- typical negations (e.g., "not good")
- use of contractions as negations (e.g., "wasn't very good")
- conventional use of punctuation to signal increased sentiment intensity (e.g., "Good!!!")
- conventional use of word-shape to signal emphasis (e.g., using ALL CAPS for words/phrases)
- using degree modifiers to alter sentiment intensity (e.g., intensity boosters such as "very" and intensity dampeners such as "kind of")
- understanding many sentiment-laden slang words (e.g., 'sux')
- understanding many sentiment-laden slang words as modifiers such as 'uber' or 'friggin' or 'kinda'
- understanding many sentiment-laden emoticons such as :) and :D
- translating utf-8 encoded emojis such as 💘 and 💋 and 😁
- understanding sentiment-laden initialisms and acronyms (for example: 'lol')

The possible noise elements that should be removed are as follows:
- URLs/www.: URLs and hyperlinks in text data like comments, reviews, and tweets should be removed
- @mention: as with emoticons, even though it carries some information, for sentiment analysis purposes it can be ignored
- HTML: data contains html entities such as &amp in the text field 

In [3]:
# Load the short-review training set; the first CSV column becomes the index.
df = pd.read_csv('train_amazon_shortreview.csv', index_col = 0, encoding = "ISO-8859-1")
# Do not truncate long text cells when a frame is displayed.
# NOTE(review): -1 is the legacy "no limit" value; pandas >= 1.0 deprecates it
# in favour of None — check the installed pandas version before changing it.
pd.set_option('display.max_colwidth', -1)

import string
import HTMLParser
html_parser = HTMLParser.HTMLParser()
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup

# Contraction -> expanded form lookup used by `tweet_cleaner`.
# NOTE(review): the key "cause" has no leading apostrophe, so the standalone
# word "cause" is also rewritten to "because" — confirm this is intended.
split_dic = {"cause": "because", "could've": "could have", 
             "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
             "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
             "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
             "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
             "I'll've": "I will have","I'm": "I am", "I've": "I have", 
             "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
             "i'll've": "i will have","i'm": "i am", "i've": "i have", 
             "it'd": "it would", "it'd've": "it would have", 
             "it'll": "it will", "it'll've": "it will have","it's": "it is", 
             "let's": "let us", "ma'am": "madam", 
             "might've": "might have",
             "must've": "must have",
             "o'clock": "of the clock", 
             "she'd": "she would", 
             "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
             "she's": "she is", "should've": "should have", 
             "so've": "so have","so's": "so as", 
             "this's": "this is",
             "that'd": "that would", "that'd've": "that would have","that's": "that is", 
             "there'd": "there would", "there'd've": "there would have","there's": "there is", 
             "here's": "here is",
             "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
             "they'll've": "they will have", "they're": "they are", "they've": "they have",
             "to've": "to have", "we'd": "we would", 
             "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
             "we're": "we are", "we've": "we have", 
             "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
             "what's": "what is", "what've": "what have", "when's": "when is", 
             "when've": "when have", "where'd": "where did", "where's": "where is", 
             "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
             "who's": "who is", "who've": "who have", "why's": "why is", 
             "why've": "why have", "will've": "will have", 
             "would've": "would have", 
             "y'all": "you all", "y'all'd": "you all would",
             "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
             "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
             "you'll've": "you will have", "you're": "you are", "you've": "you have" }

pat1 = r'@[\w_]+' # @-mention
# The character class previously contained a literal "&amp;" (an HTML-escape
# artifact); every character it added is already matched by another
# alternative, so plain "&" is equivalent.
pat2 = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+' # URLs
# The dot is escaped so that only a literal "www." prefix is treated as a link;
# unescaped it also swallowed ordinary tokens such as "wwwonderful".
pat5 = r'www\.[^ ]+' # additions to URLs, texts with 'www..'
combined_pat = r'|'.join((pat1, pat2, pat5))

# Regex alternation takes the first alternative that matches, so longer
# contractions must come before their prefixes ("he'd've" before "he'd");
# otherwise the long forms are never expanded. Sorting also makes the pattern
# independent of dict iteration order.
_contractions_longest_first = sorted(split_dic, key=len, reverse=True)
split_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(key) for key in _contractions_longest_first) + r')\b')

def tweet_cleaner(demo):
    """Return `demo` with markup, mentions/links removed and contractions expanded.

    Steps, in order:
      1. parse the text with BeautifulSoup ('lxml' parser) and keep only its text;
      2. delete every match of `combined_pat` (@-mentions, http(s) URLs, www links);
      3. replace each contraction matched by `split_pattern` with its
         expansion from `split_dic`.
    """
    plain_text = BeautifulSoup(demo, 'lxml').get_text()  # HTML
    without_noise = re.sub(combined_pat, '', plain_text)

    def _expand(match):
        # Look the matched contraction up in the expansion table.
        return split_dic[match.group()]

    return split_pattern.sub(_expand, without_noise)

In [4]:
%%time
# Run the cleaner over every review text and keep the result in a new column,
# then write the frame out; a later cell reloads this CSV for model training.
df['clean_text'] = [tweet_cleaner(t) for t in df.text]
df.to_csv('train_amazon_shortreview_clean.csv', encoding = 'utf-8')

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautifu

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that documen

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Wall time: 12min 7s


Setting thresholds for classifying sentences as either positive, neutral, or negative, the results can be found as follows: 

- Compound range from (-0.15, 0.15) | neu >= 0.55 Accuracy Score : 87.09% (shortreview.csv) 89.62% (longreview.csv)
- Compound range from (-0.25, 0.25) | neu >= 0.55 Accuracy Score : 88.34% (shortreview.csv) 90.15% (longreview.csv)
- Compound range from (-0.35, 0.35) | neu >= 0.55 Accuracy Score : 89.58% (shortreview.csv) 90.69% (longreview.csv)

In [43]:
from __future__ import division

# Load the long-review frame that already carries the VADER score columns
# (neg / neu / pos / compound, as shown in the preview printed below).
df = pd.read_csv('train_amazon_longreview_vader.csv', index_col = 0)
# Drop the leftover 'Unnamed: 0.1' column — presumably a stale index written
# by an earlier to_csv/read_csv round-trip; verify against the CSV.
df.drop(columns = ['Unnamed: 0.1'], inplace=True)
df.head()

Unnamed: 0,target,clean_text,neg,neu,pos,compound
0,1,I am reading a lot of reviews saying that this...,0.019,0.851,0.129,0.8481
1,1,This soundtrack is my favorite music of all ti...,0.04,0.697,0.263,0.9847
2,1,I truly like this soundtrack and I enjoy video...,0.092,0.631,0.278,0.9753
3,1,"If you have played the game, you know how divi...",0.0,0.725,0.275,0.9781
4,1,I am quite sure any of you actually taking the...,0.015,0.752,0.233,0.9873


In [44]:
# Keep an untouched copy of the raw compound score; 'compound' is overwritten
# with class labels below.
df['com'] = df['compound']

# Discard rows whose compound score lies inside the band [-0.15, 0.15].
df = df.drop(df[(df['compound'] >= -0.15) & (df['compound'] <= 0.15)].index)
# Relabel in place: above the band -> 1, below the band -> 0.
# The order matters: the label 1 written first is not < -0.15, so the second
# assignment cannot overwrite it.
df.loc[df['compound'] > 0.15, 'compound'] = 1
df.loc[df['compound'] < -0.15, 'compound'] = 0
# Additionally discard rows whose neutral share is 0.55 or higher.
df = df.drop(df[df['neu'] >=0.55].index)
# 'yes' marks rows where the thresholded label disagrees with `target`.
df['cheating0.15'] = np.where(df['target'] != df['compound'], 'yes', 'no')

In [45]:
# Accuracy = 1 - (share of rows flagged 'yes', i.e. label != target).
# Written as a call with a single parenthesised argument so the line is valid
# on both Python 2 and Python 3 (the bare print statement is a syntax error
# on Python 3). The ratio needs true division, which the
# `from __future__ import division` in the earlier cell provides on Python 2.
print("Compound range from (-0.15, 0.15) | neu >= 0.55 Accuracy Score : {0:.2f}%".format((1 - (len(df.loc[df['cheating0.15'] == 'yes']) / len(df)))*100))

Compound range from (-0.15, 0.15) | neu >= 0.55 Accuracy Score : 89.62%


## LIWC 

The Tone variable puts the two dimensions (positive emotion and negative emotion dimensions) into a single summary variable (Cohn, Mehl, & Pennebaker, 2004). The algorithm is built so that the higher the number, the more positive the tone. Numbers below 50 suggest a more negative emotional tone. The accuracy results can be found as follows: 

- Tone range from (50, 50) | Accuracy Score : 68.01% (shortreview.csv) 67.29% (longreview.csv)
- Tone range from (45, 55) | Accuracy Score : 68.01% (shortreview.csv) 68.18% (longreview.csv)
- Tone range from (40, 60) | Accuracy Score : 68.01% (shortreview.csv) 69.06% (longreview.csv)

In [21]:
# Load the LIWC output for the long reviews; the first CSV column is the index.
df = pd.read_csv('train_amazon_longreview_LIWC.csv', index_col = 0)
# Skip the first data row — presumably a header line echoed by the LIWC
# export; verify against the CSV.
df = df.iloc[1:]
# Give the five remaining columns short working names.
df.columns = ['target','text','tone','pos','neg']
# Remove incomplete rows, then renumber so the index is contiguous from 0.
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,target,text,tone,pos,neg
0,1,I am reading a lot of reviews saying that this...,65.65,3.16,1.05
1,1,This soundtrack is my favorite music of all ti...,99.0,9.92,0.76
2,1,I truly like this soundtrack and I enjoy video...,99.0,8.4,0.84
3,1,"If you have played the game, you know how divi...",99.0,7.59,1.27
4,1,I am quite sure any of you actually taking the...,99.0,6.38,0.0


In [22]:
# Discard reviews whose Tone lies inside the undecided band [40, 60].
df = df.drop(df[(df['tone'] >= 40) & (df['tone'] <= 60)].index)
# Assuming 'tone' is numeric (NaNs were dropped earlier), every remaining row
# is either above 60 (positive -> 1) or below 40 (negative -> 0), so the label
# is derived in a single numeric step. The previous version first wrote the
# strings 'pos'/'neg' into the column and then compared that mixed str/number
# column with `< 40`, which only works under Python 2 ordering rules and
# fails on Python 3.
df['tone'] = np.where(df['tone'] > 60, 1, 0)
# 'yes' marks rows where the LIWC-derived label disagrees with `target`.
df['cheating'] = np.where(df['target'] != df['tone'], 'yes', 'no')

## Algorithms Comparison

In [2]:
# Reload the cleaned short reviews and drop rows with missing values
# (e.g. texts that became empty after cleaning and were read back as NaN).
df = pd.read_csv('train_amazon_shortreview_clean.csv', index_col = 0, encoding = "ISO-8859-1")
df.dropna(inplace=True)

x = df.clean_text
y = df.target

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# `model_selection` home of train_test_split and fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
SEED = 2000
# Hold out 2% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.02, random_state=SEED)

# Report the class balance of both partitions (`*1.` forces float division on Python 2).
print ("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

  mask |= (ar1 == a)


Train set has total 3527815 entries with 50.00% negative, 50.00% positive
Test set has total 71997 entries with 50.15% negative, 49.85% positive


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from time import time

def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit `pipeline`, score it on the test split and print a short report.

    The "null accuracy" is the accuracy of always predicting the majority
    class of the test labels, where label 0 is counted against everything else.

    Args:
        pipeline: estimator exposing `fit(x, y)` whose result exposes `predict(x)`.
        x_train, y_train: training inputs and labels.
        x_test, y_test: held-out inputs and labels; `x_test` must support
            boolean-mask indexing with `y_test == 0`.

    Returns:
        Tuple `(accuracy, train_test_time)`: the test accuracy and the
        seconds spent on fitting plus predicting.
    """
    # Share of test rows labelled 0 (`*1.` forces float division on Python 2).
    negative_share = len(x_test[y_test == 0]) / (len(x_test)*1.)
    null_accuracy = max(negative_share, 1. - negative_share)

    started = time()
    fitted = pipeline.fit(x_train, y_train)
    predictions = fitted.predict(x_test)
    train_test_time = time() - started

    accuracy = accuracy_score(y_test, predictions)
    print ("null accuracy: {0:.2f}%".format(null_accuracy*100))
    print ("accuracy score: {0:.2f}%".format(accuracy*100))

    # Describe how the model compares with the majority-class baseline.
    if accuracy == null_accuracy:
        verdict = "model has the same accuracy with the null accuracy"
    elif accuracy > null_accuracy:
        verdict = "model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100)
    else:
        verdict = "model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100)
    print (verdict)

    print ("train and test time: {0:.2f}s".format(train_test_time))
    print ("-"*80)
    return accuracy, train_test_time

# Bag-of-words vectorizer with default settings.
cvec = CountVectorizer()
# Default logistic-regression model.
# NOTE(review): `lr` and `n_features` are not referenced again in the visible
# cells — possibly leftovers from an earlier experiment; confirm before removing.
lr = LogisticRegression()
# Vocabulary sizes 10,000 .. 100,000 in steps of 10,000.
n_features = np.arange(10000,100001,10000)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

# Display names, in the same order as `classifiers` below (10 entries each).
names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron","Passive-Aggresive", "Nearest Centroid"]
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    # Two-stage model: L1-penalised LinearSVC selects features, then an
    # L2-penalised LinearSVC classifies on the selected features.
    Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid()
    ]
# Materialise the pairs: on Python 3 `zip` returns a one-shot iterator, so a
# second pass over it (e.g. a second classifier_comparator run) would see no
# classifiers at all. A list behaves the same on Python 2 and 3.
zipped_clf = list(zip(names,classifiers))

cvec = CountVectorizer()

def classifier_comparator(vectorizer=cvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    """Train and score each classifier behind the same text vectorizer.

    Uses the module-level `x_train`, `y_train`, `x_test`, `y_test` splits and
    prints one report per classifier via `accuracy_summary`.

    Args:
        vectorizer: vectorizer placed first in every pipeline. It is
            reconfigured in place with `set_params`, so the settings persist
            on the passed (or default, shared) object after the call.
        n_features: value passed to the vectorizer as `max_features`.
        stop_words: value passed to the vectorizer as `stop_words`.
        ngram_range: value passed to the vectorizer as `ngram_range`.
        classifier: iterable of `(name, estimator)` pairs. It is iterated once
            per call, so pass a list (not a one-shot iterator) when the
            function is called more than once.

    Returns:
        List of `(name, accuracy, train_test_time)` tuples, one per classifier,
        in iteration order.
    """
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n,c in classifier:
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', c)
        ])
        # print(...) with a single argument is valid on Python 2 and Python 3;
        # the bare print statement used before is a syntax error on Python 3.
        print("Test result for {}".format(n))
        print(c)
        clf_accuracy,tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_test, y_test)
        result.append((n,clf_accuracy,tt_time))
    return result

In [4]:
%%time
# Compare all classifiers on unigram counts with a 100,000-term vocabulary cap.
unigram_result = classifier_comparator(n_features=100000,ngram_range=(1,1))

Test result for Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
null accuracy: 50.15%
accuracy score: 84.99%
model is 34.83% more accurate than null accuracy
train and test time: 463.34s
--------------------------------------------------------------------------------
Test result for Linear SVC
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
null accuracy: 50.15%
accuracy score: 83.77%
model is 33.62% more accurate than null accuracy
train and test time: 909.62s
--------------------------------------------------------------------------------
Test result for LinearSVC with L1-based feature s



null accuracy: 50.15%
accuracy score: 78.10%
model is 27.94% more accurate than null accuracy
train and test time: 32.63s
--------------------------------------------------------------------------------
Test result for Passive-Aggresive
PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=None, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False)




null accuracy: 50.15%
accuracy score: 80.16%
model is 30.01% more accurate than null accuracy
train and test time: 32.99s
--------------------------------------------------------------------------------
Test result for Nearest Centroid
NearestCentroid(metric='euclidean', shrink_threshold=None)
null accuracy: 50.15%
accuracy score: 69.63%
model is 19.48% more accurate than null accuracy
train and test time: 28.90s
--------------------------------------------------------------------------------
Wall time: 1h 2min 52s


In [3]:
# Load the cleaned long reviews and drop rows with missing values.
df = pd.read_csv('train_amazon_longreview_clean.csv', index_col = 0, encoding = "ISO-8859-1")
df.dropna(inplace=True)

x = df.clean_text
y = df.target

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# `model_selection` home of train_test_split and fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
SEED = 2000
# Hold out 2% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.02, random_state=SEED)

# Report the class balance of both partitions (`*1.` forces float division on Python 2).
print ("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print ("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

  mask |= (ar1 == a)


Train set has total 3527995 entries with 50.00% negative, 50.00% positive
Test set has total 72000 entries with 50.02% negative, 49.98% positive


In [4]:
%%time
# Same unigram comparison (100,000-term vocabulary cap), now using the
# long-review train/test split created in the previous cell.
unigram_result = classifier_comparator(n_features=100000,ngram_range=(1,1))

Test result for Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
null accuracy: 50.02%
accuracy score: 88.82%
model is 38.80% more accurate than null accuracy
train and test time: 7783.65s
--------------------------------------------------------------------------------
Test result for Linear SVC
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
null accuracy: 50.02%
accuracy score: 85.29%
model is 35.28% more accurate than null accuracy
train and test time: 2512.68s
--------------------------------------------------------------------------------
Test result for LinearSVC with L1-based feature



null accuracy: 50.02%
accuracy score: 85.58%
model is 35.56% more accurate than null accuracy
train and test time: 394.55s
--------------------------------------------------------------------------------
Test result for Passive-Aggresive
PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=None, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False)




null accuracy: 50.02%
accuracy score: 84.33%
model is 34.32% more accurate than null accuracy
train and test time: 377.86s
--------------------------------------------------------------------------------
Test result for Nearest Centroid
NearestCentroid(metric='euclidean', shrink_threshold=None)
null accuracy: 50.02%
accuracy score: 59.46%
model is 9.44% more accurate than null accuracy
train and test time: 381.28s
--------------------------------------------------------------------------------
Wall time: 6h 7min 36s
