In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
import pickle

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\miska\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\miska\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\miska\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


### Unsupervised learning part

For unsupervised learning we will be using the same tokens that has been created for supervised learning part. Before they were saved in txt file, so now we have to load them:

In [2]:
with open("tokens_v2.txt", "rb") as fp:   # Unpickling
     X_processed = pickle.load(fp)

In [57]:
y = pd.read_csv('y_v2.csv', sep=';', header=None)
y = y.drop(0, axis=1)
y = y.rename({1: 'sentiment'}, axis=1)
y.head(5)

Unnamed: 0,sentiment
0,1.0
1,1.0
2,0.0
3,1.0
4,0.0


For this part sentiwordnet dictionary will be used. And example of its usage below:

In [3]:
happiness = swn.senti_synset('happy.a.03')
print(happiness.neg_score())
print(happiness.pos_score())
print(happiness.obj_score())

0.0
0.5
0.5


From the example we could see that to work with it thru nltk package for each word we need to pass a POS tag (part-of-the-speech). To identify this - we will be using pos_tag function from nltk:

In [64]:
test = nltk.pos_tag(nltk.word_tokenize('happy'))
print(test)

[('happy', 'JJ')]


Where "JJ"stands for adjective.

This is a small issue, as sentiwordnet works with a bit different format of POS. For this we need to build a fucntion which will convert it to the workable format.

In [14]:
def convert_tag(tagged_token):
    token = tagged_token[0]
    if token[1].startswith('NN'):
        tag = 'n'
    elif token[1].startswith('JJ'):
        tag = 'a'
    elif token[1].startswith('V'):
        tag = 'v'
    elif token[1].startswith('R'):
        tag = 'r'
    else:
        tag = ''    
    return (token[0], tag)

Testing function

In [15]:
token, tag = convert_tag(nltk.pos_tag(nltk.word_tokenize('happy')))

In [16]:
print(token)
print(tag)

happy
a


Using senti_synsets we can get all the synonyms for the token, for instance for "happy":

In [30]:
happy = list(swn.senti_synsets(token, tag))

In [31]:
happy

[SentiSynset('happy.a.01'),
 SentiSynset('felicitous.s.02'),
 SentiSynset('glad.s.02'),
 SentiSynset('happy.s.04')]

Then we can count an average of the scores

In [32]:
pos, neg, obj = 0., 0., 0.
for h in happy:
    pos += h.pos_score()
    neg += h.neg_score()
    obj += h.obj_score()
pos = pos / len(happy)
neg = neg / len(happy)
obj = obj / len(happy)
print(pos, neg, obj)

0.5625 0.0 0.4375


Let's build a function for getting an average score:

In [72]:
def avg_score(token, tag):
    synms = list(swn.senti_synsets(token, tag))
    pos, neg, obj = 0., 0., 0.
    for s in synms:
        pos += s.pos_score()
        neg += s.neg_score()
        obj += s.obj_score()
    if len(synms)>0:
        pos = pos / len(synms)
        neg = neg / len(synms)
        obj = obj / len(synms)
    return pos, neg, obj

In [46]:
token, tag = convert_tag(nltk.pos_tag(nltk.word_tokenize('happy')))
print(token, tag)
pos, neg, obj = avg_score(token, tag)

happy a


In [47]:
pos, neg, obj

(0.5625, 0.0, 0.4375)

Let's try one of the sentences, we will calculate a sum of positives, negatives and objectives scores:

In [60]:
print(X_processed[0])
print('---------------------------')
print(y.loc[0, 'sentiment'])

['this', 'place', 'obviously', 'bank', 'famous', 'founder', 'guys', 'good', 'think', 'service', 'slow', 'care', 'provide', 'good', 'dining', 'experience', 'table', 'dirty', 'wait', 'clean', 'bartender', 'nice', 'pretty', 'quick', 'skip', 'year', 'want', 'shake', 'look', 'pretty', 'small', 'price']
---------------------------
1.0


In [73]:
test = X_processed[0]
pos, neg, obj = 0., 0., 0.
for t in test:
    print(t)
    tagged = nltk.pos_tag(nltk.word_tokenize(t))
    token, tag = convert_tag(tagged)
    print(tag)
    if tag != '':
        pos_t, neg_t, obj_t = avg_score(token, tag)
        print(pos_t, neg_t, obj_t)
        pos += pos_t
        neg += neg_t
        obj += obj_t
    else: 
        pass
    print('--------------')
print(f'positive: {pos}')
print(f'negative: {neg}')
print(f'objective: {obj}')

this

--------------
place
n
0.0078125 0.0078125 0.984375
--------------
obviously
r
0.5 0.0 0.5
--------------
bank
n
0.0 0.0 1.0
--------------
famous
a
0.375 0.0 0.625
--------------
founder
n
0.0 0.041666666666666664 0.9583333333333334
--------------
guys
n
0.0 0.0 1.0
--------------
good
a
0.6190476190476191 0.005952380952380952 0.375
--------------
think
n
0.0 0.0 1.0
--------------
service
n
0.0 0.0 1.0
--------------
slow
v
0.0 0.041666666666666664 0.9583333333333334
--------------
care
n
0.20833333333333334 0.20833333333333334 0.5833333333333334
--------------
provide
n
0.0 0.0 0.0
--------------
good
a
0.6190476190476191 0.005952380952380952 0.375
--------------
dining
v
0.0 0.0 1.0
--------------
experience
n
0.0 0.0 1.0
--------------
table
n
0.0 0.0 1.0
--------------
dirty
n
0.0 0.0 0.0
--------------
wait
n
0.0 0.0 1.0
--------------
clean
n
0.0 0.0 1.0
--------------
bartender
n
0.0 0.0 1.0
--------------
nice
a
0.65 0.075 0.275
--------------
pretty
r
0.125 0.25 0.625


As result this sentence is more positive than negative, in fact it's neutral, so let's check that on positive and negative:

In [74]:
y = np.array(y.loc[:, 'sentiment'].values)

In [75]:
y[:10]

array([1., 1., 0., 1., 0., 0., 1., 2., 0., 0.])

In [119]:
test = X_processed[2]
pos, neg, obj = 0., 0., 0.
for t in test:
    print(t)
    tagged = nltk.pos_tag(nltk.word_tokenize(t))
    token, tag = convert_tag(tagged)
    print(tag)
    if tag != '':
        pos_t, neg_t, obj_t = avg_score(token, tag)
        print(pos_t, neg_t, obj_t)
        pos += pos_t
        neg += neg_t
        obj += obj_t
    else: 
        pass
    print('--------------')
print(f'positive: {pos}')
print(f'negative: {neg}')
print(f'objective: {obj}')

impressed
a
0.5 0.5 0.0
--------------
place
n
0.0078125 0.0078125 0.984375
--------------
today
n
0.0625 0.0 0.9375
--------------
come
v
0.03571428571428571 0.005952380952380952 0.9583333333333334
--------------
lunch
n
0.0 0.0 1.0
--------------
order
n
0.016666666666666666 0.0 0.9833333333333333
--------------
shrimp
n
0.0 0.0 1.0
--------------
combo
n
0.0 0.0 1.0
--------------
disappointed
a
0.0 0.5 0.5
--------------
look
n
0.03125 0.09375 0.875
--------------
like

--------------
frozen
n
0.0 0.0 0.0
--------------
shrimp
n
0.0 0.0 1.0
--------------
grocery
n
0.0 0.0 1.0
--------------
store
n
0.0 0.0 1.0
--------------
tasting
v
0.125 0.0625 0.8125
--------------
grease
n
0.0 0.0625 0.9375
--------------
french
a
0.0 0.0 1.0
--------------
fry
n
0.0 0.0 1.0
--------------
weren
n
0.0 0.0 0.0
--------------
season
n
0.16666666666666666 0.0 0.8333333333333334
--------------
drink
n
0.05 0.0 0.95
--------------
luke
n
0.0 0.0 1.0
--------------
warm
n
0.0 0.0 0.0
--------------

It works on negative 1.17 < 1.363

In [120]:
test = X_processed[7]
pos, neg, obj = 0., 0., 0.
for t in test:
    print(t)
    tagged = nltk.pos_tag(nltk.word_tokenize(t))
    token, tag = convert_tag(tagged)
    print(tag)
    if tag != '':
        pos_t, neg_t, obj_t = avg_score(token, tag)
        print(pos_t, neg_t, obj_t)
        pos += pos_t
        neg += neg_t
        obj += obj_t
    else: 
        pass
    print('--------------')
print(f'positive: {pos}')
print(f'negative: {neg}')
print(f'objective: {obj}')

great
a
0.3125 0.020833333333333332 0.6666666666666666
--------------
customer
n
0.0 0.0 1.0
--------------
service
n
0.0 0.0 1.0
--------------
thoroughly
r
0.3125 0.0 0.6875
--------------
explain
n
0.0 0.0 0.0
--------------
treatment
n
0.09375 0.125 0.78125
--------------
reasonably
r
0.25 0.125 0.625
--------------
price
n
0.08928571428571429 0.0 0.9107142857142857
--------------
positive: 1.0580357142857142
negative: 0.27083333333333337
objective: 5.671130952380952


It works in this example as well: 1.05 > 0.27

This examples also show that this method is very limited - we can't understand whether the comment was neutral. To proceed further we will only leave samples with positive or negative 

In [90]:
mask = y != 1.

In [102]:
#finding all the indexes with True
for i in np.where(mask[:10] == True)[0]: print(X_processed[i])

['impressed', 'place', 'today', 'come', 'lunch', 'order', 'shrimp', 'combo', 'disappointed', 'look', 'like', 'frozen', 'shrimp', 'grocery', 'store', 'tasting', 'grease', 'french', 'fry', 'weren', 'season', 'drink', 'luke', 'warm', 'pay', 'dollar', 'meal', 'waste', 'money', 'time']
['locate', 'excalibur', 'floor', 'place', 'doesn', 'donut', 'fresh', 'super', 'small', 'good', 'disgrace']
['taste', 'like', 'real', 'white', 'castle', 'find', 'jersey', 'know', 'harold', 'kumar', 'travel', 'movie', 'this', 'pretty', 'knock', 'real', 'white', 'castle', '-pron-', 'ketchup', 'burger', 'signature', 'white', 'castle', 'sweet', 'cherish', 'white', 'castle', 'restaurant', 'east', 'coast']
['great', 'customer', 'service', 'thoroughly', 'explain', 'treatment', 'reasonably', 'price']
['place', 'online', 'order', 'hour', 'waiting', 'update', 'phone', 'problem', 'then', 'try', 'call', 'multiple', 'time', 'hang', 'line', 'ring', 'time', 'terrible', 'customer', 'service']
['smoothie', 'taste', 'great', 's

In [108]:
#without neutral
X_ = []
for i in np.where(mask == True)[0]:
    X_.append(X_processed[i])
y_ = y[mask]

Now we can check how this library works.

In [109]:
print(len(X_), len(y_))

100002 100002


As expected we have 100002 samples in our dataset.

In [126]:
scoring = []
scoring_details = []
for num, sentence in enumerate(X_):
    if num % 1000 == 0.:
        print(num)
    pos, neg, obj = 0., 0., 0.
    for t in sentence:
        tagged = nltk.pos_tag(nltk.word_tokenize(t))
        token, tag = convert_tag(tagged)
        if tag != '':
            pos_t, neg_t, obj_t = avg_score(token, tag)
            pos += pos_t
            neg += neg_t
            obj += obj_t
        else: 
            pass
    scoring_details.append([pos, neg, obj])
    if (pos-neg) < 0.:
        scoring.append(0.)
    else:
        scoring.append(2.)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000


In [129]:
from sklearn.metrics import accuracy_score
print(f'unsupervised accuracy: {accuracy_score(y_, np.array(scoring))}')

unsupervised accuracy: 0.6768264634707306


Checking sentences where prediction is wrong:

In [130]:
wrong = []
for i in range(len(y_)):
    if y_[i] != scoring[i]:
        wrong.append(i)

In [131]:
wrong[:10]

[2, 5, 7, 13, 14, 15, 19, 22, 28, 30]

In [132]:
scoring_details[2]

[1.5317505411255412, 1.0736494408369408, 22.39460001803752]

In [133]:
y_[2]

0.0

In [134]:
X_[2]

['taste',
 'like',
 'real',
 'white',
 'castle',
 'find',
 'jersey',
 'know',
 'harold',
 'kumar',
 'travel',
 'movie',
 'this',
 'pretty',
 'knock',
 'real',
 'white',
 'castle',
 '-pron-',
 'ketchup',
 'burger',
 'signature',
 'white',
 'castle',
 'sweet',
 'cherish',
 'white',
 'castle',
 'restaurant',
 'east',
 'coast']

#### Trying spacy

In [135]:
import spacy

In [136]:
nlp = spacy.load('en_core_web_sm')

In [141]:
text = ' '.join(str(x) for x in X_[2])

In [142]:
text

'taste like real white castle find jersey know harold kumar travel movie this pretty knock real white castle -pron- ketchup burger signature white castle sweet cherish white castle restaurant east coast'

In [157]:
doc = nlp(text)
pos, neg, obj = 0., 0., 0.
for token in doc:
    if token.is_stop:
        continue
    elif (token.pos_ == 'PUNCT') or (token.pos_ == 'ADP'):
        continue
    else:
        if token.tag_.startswith('NN'):
            tag = 'n'
        elif token.tag_.startswith('JJ'):
            tag = 'a'
        elif token.tag_.startswith('V'):
            tag = 'v'
        elif token.tag_.startswith('R'):
            tag = 'r'
        else:
            tag = ''
        print(token.lemma_, tag)       
        if tag != '':
            pos_t, neg_t, obj_t = avg_score(token.lemma_, tag)
            pos += pos_t
            neg += neg_t
            obj += obj_t
    print('--------------------------------------')
print(pos, neg)

taste v
--------------------------------------
real a
--------------------------------------
white a
--------------------------------------
castle a
--------------------------------------
find v
--------------------------------------
jersey n
--------------------------------------
know v
--------------------------------------
harold n
--------------------------------------
kumar n
--------------------------------------
travel n
--------------------------------------
movie n
--------------------------------------
pretty r
--------------------------------------
knock v
--------------------------------------
real a
--------------------------------------
white a
--------------------------------------
castle a
--------------------------------------
ketchup n
--------------------------------------
burger n
--------------------------------------
signature n
--------------------------------------
white a
--------------------------------------
castle a
--------------------------------------
swe

Spacy is giving the same result for worng sentences, however, just to compare performance I will run it with spacy:

In [160]:
scoring_spacy = []
scoring_det_spacy = []
for sent in X_:
    text = ' '.join(str(x) for x in sent)
    doc = nlp(text)
    pos, neg, obj = 0., 0., 0.
    for token in doc:
        if token.is_stop:
            continue
        elif (token.pos_ == 'PUNCT') or (token.pos_ == 'ADP'):
            continue
        else:
            if token.tag_.startswith('NN'):
                tag = 'n'
            elif token.tag_.startswith('JJ'):
                tag = 'a'
            elif token.tag_.startswith('V'):
                tag = 'v'
            elif token.tag_.startswith('R'):
                tag = 'r'
            else:
                tag = ''     
            if tag != '':
                pos_t, neg_t, obj_t = avg_score(token.lemma_, tag)
                pos += pos_t
                neg += neg_t
                obj += obj_t
    scoring_det_spacy.append([pos, neg, obj])
    if (pos - neg) < 0:
        scoring_spacy.append(0.)
    else:
        scoring_spacy.append(2.)

In [161]:
print(f'unsupervised spacy accuracy: {accuracy_score(y_, np.array(scoring_spacy))}')

unsupervised spacy accuracy: 0.7164356712865743


As we can see the overall result is better with Spacy usage.
This could be compared with results of CNN, as we had three categories there, but this is a very good result for non-labeled dataset.

Trying different inputs:

In [165]:
def un_sentiment(text):
    doc = nlp(text)
    pos, neg, obj = 0., 0., 0.
    for token in doc:
        if token.is_stop:
            continue
        elif (token.pos_ == 'PUNCT') or (token.pos_ == 'ADP'):
            continue
        else:
            if token.tag_.startswith('NN'):
                tag = 'n'
            elif token.tag_.startswith('JJ'):
                tag = 'a'
            elif token.tag_.startswith('V'):
                tag = 'v'
            elif token.tag_.startswith('R'):
                tag = 'r'
            else:
                tag = ''     
            if tag != '':
                pos_t, neg_t, obj_t = avg_score(token.lemma_, tag)
                pos += pos_t
                neg += neg_t
                obj += obj_t
    if pos > neg:
        result = 'positive'
    else:
        result = 'negative'
    return [result, pos, neg, obj]

In [169]:
#negative
un_sentiment('Real lost again. It is terrible, how they can play like this')

['negative', 0.32053571428571426, 1.1406655844155844, 2.538798701298701]

In [170]:
#positive
un_sentiment('Wow, it was amazing movie. What do you think?')

['positive', 0.8317307692307692, 0.1971153846153846, 1.9711538461538463]

In [171]:
#neutral
un_sentiment('what is it like to be rich?')

['positive', 0.5416666666666667, 0.11458333333333334, 1.34375]

In [172]:
#negative
un_sentiment('I am not quite sure that I liked it')

['positive', 0.6715555555555556, 0.24233333333333335, 1.086111111111111]

In [175]:
#positive
un_sentiment('I do not dislike cabin cruisers')

['negative', 0.0, 0.5, 2.5]