### 8.2.1 Sentiment Classification

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

In [7]:
# Load the IMDB review dataset; the path is relative to the notebook's
# working directory.
DATASET_PATH = 'IMDB-Dataset.csv'
raw_data = pd.read_csv(DATASET_PATH)
raw_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [9]:
# Select the review text by column name rather than by position, so the
# right column is picked even if the CSV gains an index column or the
# column order changes.
review_docs = raw_data['review']
review_docs

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [10]:
# Select the labels by column name rather than by position, so the right
# column is picked even if the CSV gains an index column or the column
# order changes.
senti = raw_data['sentiment']
senti

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [12]:
# Distinct label values present in the sentiment column
set(senti.unique())

{'negative', 'positive'}

In [21]:
# Learn the vocabulary and IDF weights from the training reviews only, so
# the last 10 reviews (held out for the spot check in the following cells)
# do not leak into the features. The hold-out size of 10 must match the
# slices used when fitting and predicting with the classifier.
# min_df=20 drops terms that occur in fewer than 20 reviews; max_df=30000
# drops terms that occur in more than 30000 reviews.
vectorizer = TfidfVectorizer(min_df=20, max_df=30000).fit(review_docs.iloc[:-10])

# Transform the whole corpus so `tfidf` still has one row per review, in the
# original order, and the later `tfidf[:-10]` / `tfidf[-10:]` slices work.
tfidf = vectorizer.transform(review_docs)

In [22]:
# Train on every review except the last 10, which are kept aside for a
# quick sanity check of the predictions.
train_features, train_labels = tfidf[:-10], senti[:-10]
clf = MultinomialNB()
clf.fit(train_features, train_labels)

In [23]:
# Predict the sentiment of the 10 reviews that were left out of training
held_out_features = tfidf[-10:]
clf.predict(held_out_features)

array(['negative', 'negative', 'positive', 'positive', 'negative',
       'positive', 'negative', 'negative', 'negative', 'negative'],
      dtype='<U8')

In [24]:
# True labels of the same 10 held-out reviews, for comparison with the predictions
senti.tail(10)

49990    negative
49991    negative
49992    positive
49993    negative
49994    negative
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, dtype: object

### 8.2.2 WordNet based Sentiment Analysis

In [25]:
from nltk.corpus import wordnet

In [31]:
# Reference points: the first WordNet synset of 'good' and of 'bad'.
# Every other word is later compared against these two anchors.
good, bad = (wordnet.synsets(anchor_word)[0] for anchor_word in ('good', 'bad'))
print(good, bad, sep='\n')

Synset('good.n.01')
Synset('bad.n.01')


In [35]:
# Dictionary gloss of the 'bad' reference synset (good.definition() works the same way)
bad.definition()

'that which is below standard or expectations as of ethics or decency'

In [36]:
# Example usages recorded for the 'bad' reference synset
bad.examples()

['take the bad with the good']

In [37]:
# Hypernyms (more general concepts) of the 'bad' reference synset
bad.hypernyms()

[Synset('quality.n.01')]

In [39]:
# A toy review, tokenised naively on single spaces (no punctuation or case handling)
review = 'i like a nice cup of tea'
tokens = review.split(sep=' ')

tokens

['i', 'like', 'a', 'nice', 'cup', 'of', 'tea']

In [50]:
# Look up 'i' in WordNet and keep only its first synset.
# NOTE: as the output shows, that first sense is the chemical element
# iodine, not the pronoun, so the scores computed from it are not meaningful.
syn_token = wordnet.synsets('i')[0]
syn_token

Synset('iodine.n.01')

In [51]:
# Path similarity between the 'good' reference synset and the synset chosen for 'i'
score = good.path_similarity(syn_token)
score

0.1111111111111111

In [52]:
# Path similarity between the 'bad' reference synset and the synset chosen for 'i'.
# If this value is higher than the similarity to 'good', the word is treated
# as leaning towards 'bad' under this heuristic.
bad.path_similarity(syn_token)

0.14285714285714285

In [57]:
# Repeat the comparison for 'nice', this time with Wu-Palmer ('wup')
# similarity instead of path similarity.
syn_token = wordnet.synsets('nice')[0]
print(syn_token)

for reference in (good, bad):
    print(reference.wup_similarity(syn_token))

Synset('nice.n.01')
0.11764705882352941
0.13333333333333333


In [62]:
# Score the review as the sum over tokens of
#   wup_similarity(good, token) - wup_similarity(bad, token),
# using only the first WordNet synset of each token.
total_score = 0

for t in tokens:
    candidates = wordnet.synsets(t)
    if not candidates:
        # token is unknown to WordNet (e.g. function words)
        continue
    syn_token = candidates[0]
    good_sim = wordnet.wup_similarity(good, syn_token)
    bad_sim = wordnet.wup_similarity(bad, syn_token)
    if good_sim is None or bad_sim is None:
        # wup_similarity returns None when no connecting path exists;
        # skip the token instead of failing on `None - float`.
        continue
    total_score += good_sim - bad_sim

total_score

# A negative total would mark this as a bad review, which is wrong for this
# sentence: plain WordNet similarity is a poor sentiment signal. SentiWordNet
# is better suited to the task.

-0.21097823744882574

### 8.2.3 SentiWordNet based Sentiment Analysis

In [75]:
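# One-time setup: uncomment and run these two lines if the SentiWordNet
# corpus has not been downloaded yet.
# import nltk
# nltk.download('sentiwordnet')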

In [76]:
from nltk.corpus import wordnet as wnet
from nltk.corpus import sentiwordnet as swnet

In [114]:
# Four toy reviews: clearly positive, clearly negative, neutral, and mixed
review = 'i like to have a nice cup of tea'
review2 = 'i have a very bad experience with this product'
review3 = 'it was okay'
review4 = 'i love the shape but color is bad'

# Naive tokenisation on single spaces, one token list per review
tokens, tokens2, tokens3, tokens4 = (
    text.split(' ') for text in (review, review2, review3, review4)
)

for token_list in (tokens, tokens2, tokens3, tokens4):
    print(token_list)

['i', 'like', 'to', 'have', 'a', 'nice', 'cup', 'of', 'tea']
['i', 'have', 'a', 'very', 'bad', 'experience', 'with', 'this', 'product']
['it', 'was', 'okay']
['i', 'love', 'the', 'shape', 'but', 'color', 'is', 'bad']


In [103]:
# First WordNet synset of 'like'; only this first sense is used in what follows
syn_token = wordnet.synsets('like')[0]
syn_token

Synset('like.n.01')

In [104]:
# Identifier string of the synset; this is the value handed to SentiWordNet's lookup
syn_token.name()

'like.n.01'

In [105]:
# Where the WordNet approach compared each word against reference synsets via
# path_similarity, SentiWordNet's senti_synset needs no reference points:
# it is looked up directly by the synset name.
senti_syn_token = swnet.senti_synset(syn_token.name())
senti_syn_token

SentiSynset('like.n.01')

In [106]:
# Positive and negative polarity scores stored for this sense
positive = senti_syn_token.pos_score()
negative = senti_syn_token.neg_score()
print('Positive:', positive)
print('Negative:', negative)

Positive: 0.125
Negative: 0.0


In [107]:
# Same lookup for 'bad': first WordNet sense -> SentiWordNet polarity scores
syn_token = wordnet.synsets('bad')[0]
senti_syn_token = swnet.senti_synset(syn_token.name())
for label, read_score in (
    ('Positive:', senti_syn_token.pos_score),
    ('Negative:', senti_syn_token.neg_score),
):
    print(label, read_score())

Positive: 0.0
Negative: 0.875


In [108]:
# Same lookup for 'love': first WordNet sense -> SentiWordNet polarity scores
syn_token = wordnet.synsets('love')[0]
senti_syn_token = swnet.senti_synset(syn_token.name())
positive = senti_syn_token.pos_score()
negative = senti_syn_token.neg_score()
print('Positive:', positive)
print('Negative:', negative)

Positive: 0.625
Negative: 0.0


In [112]:
# Net sentiment of the first review: sum of (positive - negative) SentiWordNet
# scores over the first WordNet sense of each token. Tokens that WordNet does
# not know are skipped.
total_score = 0
for word in tokens:
    senses = wnet.synsets(word)
    if not senses:
        continue
    word_senti = swnet.senti_synset(senses[0].name())
    total_score += word_senti.pos_score() - word_senti.neg_score()

total_score

0.125

In [113]:
# Net sentiment of the second review: sum of (positive - negative) SentiWordNet
# scores over the first WordNet sense of each token. Tokens that WordNet does
# not know are skipped.
total_score = 0
for word in tokens2:
    senses = wnet.synsets(word)
    if not senses:
        continue
    word_senti = swnet.senti_synset(senses[0].name())
    total_score += word_senti.pos_score() - word_senti.neg_score()

total_score

-0.375

In [115]:
# Net sentiment of the third review: sum of (positive - negative) SentiWordNet
# scores over the first WordNet sense of each token. Tokens that WordNet does
# not know are skipped.
total_score = 0
for word in tokens3:
    senses = wnet.synsets(word)
    if not senses:
        continue
    word_senti = swnet.senti_synset(senses[0].name())
    total_score += word_senti.pos_score() - word_senti.neg_score()

total_score

0.0

In [118]:
# Net sentiment of the fourth (mixed) review: sum of (positive - negative)
# SentiWordNet scores over the first WordNet sense of each token. Tokens that
# WordNet does not know are skipped.
total_score = 0
for word in tokens4:
    senses = wnet.synsets(word)
    if not senses:
        continue
    word_senti = swnet.senti_synset(senses[0].name())
    total_score += word_senti.pos_score() - word_senti.neg_score()

total_score

# The net score comes out negative because 'bad' carries a larger score than
# 'love', which hides the positive half of the review. Positive and negative
# totals should therefore be tracked separately.

-0.125

In [124]:
# Track positive and negative evidence separately for the mixed review.
# Each token contributes only to its dominant polarity: its positive score
# when that is strictly larger, otherwise its negative score (so ties fall
# into the negative branch).
pos_score = 0
neg_score = 0
for word in tokens4:
    senses = wnet.synsets(word)
    if not senses:
        continue
    word_senti = swnet.senti_synset(senses[0].name())
    positive, negative = word_senti.pos_score(), word_senti.neg_score()
    if positive > negative:
        pos_score += positive
    else:
        neg_score += negative

print('Pos:', pos_score)
print('Neg:', '-', neg_score)

Pos: 0.875
Neg: - 0.875
