In [1]:
# creating train, dev, and test sets 
import sentiment_reader
import numpy as np

# creating sets
sentiment_corpus = sentiment_reader.SentimentCorpus()
print('Training set:', len(sentiment_corpus.train_X))
print('Test set:', len(sentiment_corpus.test_X))
print('Dev set:', len(sentiment_corpus.dev_X))
print('Vocabulary size:', len(sentiment_corpus.feat_dict))

Training set: 1600
Test set: 400
Dev set: 0
Vocabulary size: 13989


In [2]:
# training networks using Sigmoid activation
from sklearn.neural_network import MLPClassifier
mlp_sig1 = MLPClassifier(hidden_layer_sizes=(100), 
                         activation='logistic', 
                         solver='sgd', 
                         max_iter=1000)
mlp_sig1.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

mlp_sig2 = MLPClassifier(hidden_layer_sizes=(100, 100), 
                         activation='logistic', 
                         solver='sgd', 
                         max_iter=1000)
mlp_sig2.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

mlp_sig3 = MLPClassifier(hidden_layer_sizes=(100, 100, 100), 
                         activation='logistic', 
                         solver='sgd', 
                         max_iter=1000)
mlp_sig3.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())



MLPClassifier(activation='logistic', hidden_layer_sizes=(100, 100, 100),
              max_iter=1000, solver='sgd')

In [3]:
# training networks using Tanh activation
mlp_tanh1 = MLPClassifier(hidden_layer_sizes=(100), 
                          activation='tanh', 
                          solver='sgd', 
                          max_iter=1000)
mlp_tanh1.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

mlp_tanh2 = MLPClassifier(hidden_layer_sizes=(100, 100), 
                          activation='tanh', 
                          solver='sgd', 
                          max_iter=1000)
mlp_tanh2.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

mlp_tanh3 = MLPClassifier(hidden_layer_sizes=(100, 100, 100), 
                          activation='tanh', 
                          solver='sgd', 
                          max_iter=1000)
mlp_tanh3.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

MLPClassifier(activation='tanh', hidden_layer_sizes=(100, 100, 100),
              max_iter=1000, solver='sgd')

In [4]:
# training networks using RelU activation
mlp_relu1 = MLPClassifier(hidden_layer_sizes=(100), 
                          activation='relu', 
                          solver='sgd', 
                          max_iter=1000)
mlp_relu1.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

mlp_relu2 = MLPClassifier(hidden_layer_sizes=(100, 100), 
                          activation='relu', 
                          solver='sgd', 
                          max_iter=1000)
mlp_relu2.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

mlp_relu3 = MLPClassifier(hidden_layer_sizes=(100, 100, 100), 
                          activation='relu', 
                          solver='sgd', 
                          max_iter=1000)
mlp_relu3.fit(sentiment_corpus.train_X, sentiment_corpus.train_y.ravel())

MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, solver='sgd')

In [5]:
# testing NNs with test set
# calculating metrics for Sigmoid activation
prediction_sig1 = mlp_sig1.predict(sentiment_corpus.test_X)
accuracy_sig1 = mlp_sig1.score(sentiment_corpus.test_X, sentiment_corpus.test_y)
prediction_sig2 = mlp_sig2.predict(sentiment_corpus.test_X)
accuracy_sig2 = mlp_sig2.score(sentiment_corpus.test_X, sentiment_corpus.test_y)
prediction_sig3 = mlp_sig3.predict(sentiment_corpus.test_X)
accuracy_sig3 = mlp_sig3.score(sentiment_corpus.test_X, sentiment_corpus.test_y)

# calculating metrics for Tanh activation
prediction_tanh1 = mlp_tanh1.predict(sentiment_corpus.test_X)
accuracy_tanh1 = mlp_tanh1.score(sentiment_corpus.test_X, sentiment_corpus.test_y)
prediction_tanh2 = mlp_tanh2.predict(sentiment_corpus.test_X)
accuracy_tanh2 = mlp_tanh2.score(sentiment_corpus.test_X, sentiment_corpus.test_y)
prediction_tanh3 = mlp_tanh3.predict(sentiment_corpus.test_X)
accuracy_tanh3 = mlp_tanh3.score(sentiment_corpus.test_X, sentiment_corpus.test_y)

#calculating metrics for Relu activation
prediction_relu1 = mlp_relu1.predict(sentiment_corpus.test_X)
accuracy_relu1 = mlp_relu1.score(sentiment_corpus.test_X, sentiment_corpus.test_y)
prediction_relu2 = mlp_relu2.predict(sentiment_corpus.test_X)
accuracy_relu2 = mlp_relu2.score(sentiment_corpus.test_X, sentiment_corpus.test_y)
prediction_relu3 = mlp_relu3.predict(sentiment_corpus.test_X)
accuracy_relu3 = mlp_relu3.score(sentiment_corpus.test_X, sentiment_corpus.test_y)

In [7]:
# displaying metrics for each trained network
from sklearn.metrics import classification_report
# metrics for Sigmoid activation
print('1 layer model with Signmoid activation function: ')
print('Accuracy:', accuracy_sig1)
print(classification_report(sentiment_corpus.test_y, prediction_sig1))
print('2 layer model with Signmoid activation function: ')
print('Accuracy:', accuracy_sig2)
print(classification_report(sentiment_corpus.test_y, prediction_sig2))
print('3 layer model with Signmoid activation function: ')
print('Accuracy:', accuracy_sig3)
print(classification_report(sentiment_corpus.test_y, prediction_sig3))

# metrics for Tanh activation
print('1 layer model with Tanh activation function: ')
print('Accuracy:', accuracy_tanh1)
print(classification_report(sentiment_corpus.test_y, prediction_tanh1))
print('2 layer model with Tanh activation function: ')
print('Accuracy:', accuracy_tanh2)
print(classification_report(sentiment_corpus.test_y, prediction_tanh2))
print('3 layer model with Tanh activation function: ')
print('Accuracy:', accuracy_tanh3)
print(classification_report(sentiment_corpus.test_y, prediction_tanh3))

# metrics for Relu activation
print('1 layer model with RelU activation function: ')
print('Accuracy:', accuracy_relu1)
print(classification_report(sentiment_corpus.test_y, prediction_relu1))
print('2 layer model with RelU activation function: ')
print('Accuracy:', accuracy_relu2)
print(classification_report(sentiment_corpus.test_y, prediction_relu2))
print('3 layer model with RelU activation function: ')
print('Accuracy:', accuracy_relu3)
print(classification_report(sentiment_corpus.test_y, prediction_relu3))

1 layer model with Signmoid activation function: 
Accuracy: 0.83
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       212
           1       0.81      0.84      0.82       188

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

2 layer model with Signmoid activation function: 
Accuracy: 0.4675
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       212
           1       0.47      0.99      0.64       188

    accuracy                           0.47       400
   macro avg       0.23      0.50      0.32       400
weighted avg       0.22      0.47      0.30       400

3 layer model with Signmoid activation function: 
Accuracy: 0.47
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       212
           1       0.47      1.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
from gensim.models import KeyedVectors
embeddings = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [89]:
# creating a new dictionary, taking the average of bigram embeddings
feat_embeddings = sentiment_corpus.feat_counts
# iterating over vocabulary in corpus
for key in sentiment_corpus.feat_dict.keys():
    # temporary embedding to average bigrams
    embedding = np.zeros(300, dtype='float32')
    # bigrams averaging
    if key.find('_') != -1:
        normalize = 0
        bigrams = key.split('_')
        try:
            embedding += embeddings[bigrams[0]]
            normalize += 1
        except KeyError:
            print(bigrams[0] + ' is not in embeddings matrix')
        try:
            embedding += embeddings[bigrams[1]]
            normalize += 1
        except KeyError:
            print(bigrams[0] + ' is not in embeddings matrix')
            continue
        if normalize > 1:
            for i in embedding:
                normalize = np.array(2.0)
                embedding /= normalize
                
    # adding unigram embeddings
    else:
        try:
            embedding += embeddings[key]
        except KeyError:
            print(key + ' is not in embeddings matrix')
            continue
    feat_embeddings[key] = embedding

a is not in embeddings matrix
<num> is not in embeddings matrix
a is not in embeddings matrix
after is not in embeddings matrix
a is not in embeddings matrix
a is not in embeddings matrix
all is not in embeddings matrix
by is not in embeddings matrix
of is not in embeddings matrix
lot is not in embeddings matrix
one is not in embeddings matrix
back is not in embeddings matrix
to is not in embeddings matrix
crichton is not in embeddings matrix
with is not in embeddings matrix
to is not in embeddings matrix
<num> is not in embeddings matrix
of is not in embeddings matrix
team is not in embeddings matrix
some is not in embeddings matrix
to is not in embeddings matrix
bulk is not in embeddings matrix
a is not in embeddings matrix
want is not in embeddings matrix
overview is not in embeddings matrix
bit is not in embeddings matrix
stories is not in embeddings matrix
a is not in embeddings matrix
read is not in embeddings matrix
field is not in embeddings matrix
was is not in embeddings matr

read is not in embeddings matrix
to is not in embeddings matrix
book's is not in embeddings matrix
us is not in embeddings matrix
dose is not in embeddings matrix
organized is not in embeddings matrix
and is not in embeddings matrix
language is not in embeddings matrix
front is not in embeddings matrix
side is not in embeddings matrix
a is not in embeddings matrix
work is not in embeddings matrix
happened is not in embeddings matrix
continues is not in embeddings matrix
to is not in embeddings matrix
of is not in embeddings matrix
of is not in embeddings matrix
of is not in embeddings matrix
to is not in embeddings matrix
a is not in embeddings matrix
and is not in embeddings matrix
period is not in embeddings matrix
a is not in embeddings matrix
<num> is not in embeddings matrix
<num> is not in embeddings matrix
of is not in embeddings matrix
page is not in embeddings matrix
of is not in embeddings matrix
and is not in embeddings matrix
and is not in embeddings matrix
story is not in 

ready is not in embeddings matrix
i've is not in embeddings matrix
to is not in embeddings matrix
necessary is not in embeddings matrix
invasion is not in embeddings matrix
wishes is not in embeddings matrix
a is not in embeddings matrix
linked is not in embeddings matrix
"one" is not in embeddings matrix
reason is not in embeddings matrix
likely is not in embeddings matrix
closer is not in embeddings matrix
when is not in embeddings matrix
readers is not in embeddings matrix
sought is not in embeddings matrix
means is not in embeddings matrix
tolstoy is not in embeddings matrix
concepts is not in embeddings matrix
notion is not in embeddings matrix
lies is not in embeddings matrix
of is not in embeddings matrix
and is not in embeddings matrix
end is not in embeddings matrix
a is not in embeddings matrix
of is not in embeddings matrix
issue is not in embeddings matrix
evidence is not in embeddings matrix
a is not in embeddings matrix
<year> is not in embeddings matrix
concept is not in

a is not in embeddings matrix
boring is not in embeddings matrix
addict's is not in embeddings matrix
i'm is not in embeddings matrix
around is not in embeddings matrix
hillerman's is not in embeddings matrix
to is not in embeddings matrix
to is not in embeddings matrix
a is not in embeddings matrix
to is not in embeddings matrix
and is not in embeddings matrix
repetitive is not in embeddings matrix
flat is not in embeddings matrix
to is not in embeddings matrix
dubner is not in embeddings matrix
prepared is not in embeddings matrix
hint is not in embeddings matrix
disappointment is not in embeddings matrix
reviews is not in embeddings matrix
flynn is not in embeddings matrix
assumptions is not in embeddings matrix
of is not in embeddings matrix
reichs is not in embeddings matrix
to is not in embeddings matrix
baldacci is not in embeddings matrix
and is not in embeddings matrix
because is not in embeddings matrix
to is not in embeddings matrix
three is not in embeddings matrix
junes is

In [91]:
import codecs
# mapping feature to index
embeddings_dict = {}
i = 0
for key in feat_embeddings.keys():
    embeddings_dict[key] = i
    i += 1

# getting number of positive reviews and breaking input file into a list of reviews
reviews = []
nr_pos = 0
with codecs.open("positive.review", 'r', 'utf8') as pos_file:
    for line in pos_file:
        nr_pos += 1
        sentance = []
        toks = line.split(" ")
        for feat in toks[0:-1]:
            name, counts = feat.split(":")
            if name in feat_embeddings:
                sentance.append(name)
        reviews.append(sentance)
                
# getting number of positive reviews and breaking input file into a list of reviews        
nr_neg = 0
with codecs.open("negative.review", 'r', 'utf8') as neg_file:
    for line in neg_file:
        nr_neg += 1
        sentance = []
        toks = line.split(" ")
        for feat in toks[0:-1]:
            name, counts = feat.split(":")
            if name in feat_embeddings:
                sentance.append(name)
        reviews.append(sentance)

# creating  a single embedding for each review
review_embeddings = []
for review in reviews:
    words = 0
    sentance_embedding = []
    for word in review:
        if word in embeddings_dict:
            if words == 0:
                sentance_embedding = feat_embeddings[word]
            else:
                sentance_embedding = np.add(sentance_embedding, feat_embeddings[word])
            words += 1
    review_embeddings.append(np.asarray(sentance_embedding)/words)

X_embeddings = np.zeros((sentiment_corpus.nr_instances, 300), dtype=object)
y_embeddings = np.vstack((np.zeros([nr_pos,1], dtype=int), np.ones([nr_neg,1], dtype=int)))



In [92]:
with codecs.open("positive.review", 'r', 'utf8') as pos_file:
    nr_pos = 0
    for line in pos_file:
        review = review_embeddings[nr_pos]
        for x in range(300):
            X_embeddings[nr_pos, x] = review[x]
        nr_pos += 1
        
with codecs.open("negative.review", 'r', 'utf8') as neg_file:
    nr_neg = 0
    for line in neg_file:
        review = review_embeddings[nr_pos+nr_neg]
        for x in range(300):
            X_embeddings[nr_pos+nr_neg, x] = review[x]
        nr_neg += 1

In [93]:
# shuffle the order, mix positive and negative examples
new_order = np.arange(sentiment_corpus.nr_instances)
np.random.seed(0) # set seed
np.random.shuffle(new_order)
X_embeddings = X_embeddings[new_order,:]
y_embeddings = y_embeddings[new_order,:]
    
# creating train, test, and dev sets 
train_y, dev_y, test_y, train_X, dev_X, test_X = sentiment_reader.split_train_dev_test(X_embeddings, y_embeddings, 0.8, 0, 0.2)

In [94]:
mlp_embeddings = MLPClassifier(hidden_layer_sizes=(100), 
                          activation='tanh', 
                          solver='sgd', 
                          max_iter=1000)
mlp_embeddings.fit(train_X, train_y.ravel())
prediction_embeddings = mlp_embeddings.predict(test_X)
accuracy_embeddings = mlp_embeddings.score(test_X, test_y)
print('1 layer model with Tanh activation function: ')
print('Accuracy:', accuracy_embeddings)
print(classification_report(test_y, prediction_embeddings))

1 layer model with Tanh activation function: 
Accuracy: 0.595
              precision    recall  f1-score   support

           0       0.74      0.37      0.49       212
           1       0.54      0.85      0.66       188

    accuracy                           0.59       400
   macro avg       0.64      0.61      0.58       400
weighted avg       0.65      0.59      0.57       400

