In [1]:
# Data: https://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
# electronics directory
# These are Amazon reviews
!head sentiment-analysis-data/negative.review

<review>
<unique_id>
B00005UKBG:bad:j._brodeur_"disgusted_consumer"
</unique_id>
<asin>
B00005UKBG
</asin>
<product_name>
Atlantic 1316 CD Storage Case (110-Capacity, Wave): Electronics
</product_name>


In [2]:
# We'll use Beautiful soup and only the key review_text
# 2 passes over data: 1. Determine vocab size 
#                         and which index corresponds to which index
#                     2. Create data vectors
# Use logistic regression once the vectors are created

# The key idea here is that dimensions correspond to words
#   and words whose weight has a large magnitude are important
#   discriminators of positive and negative sentiment.

In [3]:
# X : One-hot encoded bag of words
# Y : 1 or 0 (positive / negative)

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [4]:
from nltk.corpus import stopwords
wordnet_lemmatizer = WordNetLemmatizer()
positive_reviews = BeautifulSoup(open('sentiment-analysis-data/positive.review'), 'html5lib').findAll('review_text')
negative_reviews = BeautifulSoup(open('sentiment-analysis-data/negative.review'), 'html5lib').findAll('review_text')

In [6]:
# Since the number of positive and negative reviews are imbalanced...
# (more positive), make the number equal
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [13]:
# Create index for each word for the index of each word in the final vector
# First find out which words and assign indices
word_index_map = {}
current_index = 0

def my_tokenizer(s):
    # Lowercase everything
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    # throw our short words
    tokens = [t for t in tokens if len(t) > 2]
    # Lemmatize to turn words into base forms
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    # remove stop words
    tokens = [t for t in tokens if t not in stopwords.words()]
    return tokens

# Save tokenized arrays for later use

positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
            
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [15]:
# Take each token and create a data array

def token_to_vector(tokens, label):
    # Passing two arguments to make shuffling easy
    x = np.zeros(len(word_index_map) + 1)
    # (+ 1 for the labels)
    
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    # Word proportion instead of counts
    x = x / x.sum()
    x[-1] = label
    return x


N = len(positive_tokenized) + len(negative_tokenized)

data = np.zeros((N, len(word_index_map) + 1))
i = 0 # which sample I'm looiking at

for tokens in positive_tokenized:
    xy = token_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1
    
for tokens in negative_tokenized:
    xy = token_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

np.random.shuffle(data)

X = data[:, :-1]
Y = data[:, -1]

Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification rate", model.score(Xtest, Ytest))

Classification rate 0.72


In [16]:
# We can look at the weights to see which words have positive or negative sentiment
threshold = 0.7

for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print('{} : {:.4f}'.format(word, weight))

best : 0.9817
easy : 1.4088
doe : -1.0727
back : -1.4086
price : 2.2901
well : 0.9267
great : 3.5278
get : -1.0626
used : 0.9944
item : -0.9187
quality : 1.3815
use : 1.5314
thing : -0.8338
memory : 0.7959
n't : -1.9429
little : 0.7215
buy : -0.7904
wa : -1.2653
support : -0.8038
good : 2.0053
sound : 1.0552
month : -0.7037
love : 0.9368
highly : 0.8778
money : -0.7546
fast : 0.7891
perfect : 0.8587
would : -0.7879
speaker : 0.7676
excellent : 1.2280
waste : -0.8519
return : -0.9906
