<a href="https://colab.research.google.com/github/oonid/growth-hacking-with-nlp-sentiment-analysis/blob/master/create_dictionary_based_sentiment_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dictionary Based Sentiment Analyzer

* Word tokenization
* Sentence tokenization
* Scoring of the reviews
* Comparison of the scores with the reviews in plots
* Measuring the distribution
* Handling negation
* Adjusting your dictionary-based sentiment analyzer
* Checking your results

In [1]:
# all imports and related

%matplotlib inline

import pandas as pd
import numpy as np
import altair as alt

from nltk import download as nltk_download
from nltk.tokenize import word_tokenize, sent_tokenize

nltk_download('punkt')  # required by word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### load the small_corpus CSV

First run the process in
[create_dataset.ipynb](https://github.com/oonid/growth-hacking-with-nlp-sentiment-analysis/blob/master/create_dataset.ipynb).

Then copy the file **small_corpus.csv** into this Google Colab session's Files (via file upload or by mounting Google Drive).


In [2]:
# 'small_corpus.csv' is a relative path, so the file must sit in the
# notebook's current working directory
df = pd.read_csv('small_corpus.csv')
df

Unnamed: 0,ratings,reviews
0,1,Recently UBISOFT had to settle a huge class-ac...
1,1,"code didn't work, got me a refund."
2,1,"these do not work at all, all i get is static ..."
3,1,well let me start by saying that when i first ...
4,1,"Dont waste your money, you will just end up us..."
...,...,...
4495,5,"Nice long micro USB cable, battery lasts a lon..."
4496,5,I've been having a great time with this game. ...
4497,5,d
4498,5,"Really pretty, funny, interesting game. Works ..."


In [3]:
# check whether any column contains null values (the reviews column does)
df.isnull().any()

ratings    False
reviews     True
dtype: bool

In [4]:
# replace null entries in the reviews column with the empty string ''
df['reviews'] = df['reviews'].fillna('')

# run the null check again: no column should report True any more
df.isnull().any()

ratings    False
reviews    False
dtype: bool

### tokenize the sentences and words of the reviews

In [5]:
# split every review into a list of word/punctuation tokens (one list per row)
word_tokenized = df['reviews'].apply(word_tokenize)
word_tokenized

0       [Recently, UBISOFT, had, to, settle, a, huge, ...
1        [code, did, n't, work, ,, got, me, a, refund, .]
2       [these, do, not, work, at, all, ,, all, i, get...
3       [well, let, me, start, by, saying, that, when,...
4       [Dont, waste, your, money, ,, you, will, just,...
                              ...                        
4495    [Nice, long, micro, USB, cable, ,, battery, la...
4496    [I, 've, been, having, a, great, time, with, t...
4497                                                  [d]
4498    [Really, pretty, ,, funny, ,, interesting, gam...
4499    [i, had, a, lot, of, fun, playing, this, game,...
Name: reviews, Length: 4500, dtype: object

In [6]:
# split every review into a list of sentences (one list per row)
sent_tokenized = df['reviews'].apply(sent_tokenize)
sent_tokenized

0       [Recently UBISOFT had to settle a huge class-a...
1                    [code didn't work, got me a refund.]
2       [these do not work at all, all i get is static...
3       [well let me start by saying that when i first...
4       [Dont waste your money, you will just end up u...
                              ...                        
4495    [Nice long micro USB cable, battery lasts a lo...
4496    [I've been having a great time with this game....
4497                                                  [d]
4498    [Really pretty, funny, interesting game., Work...
4499    [i had a lot of fun playing this game, if your...
Name: reviews, Length: 4500, dtype: object

### download the opinion lexicon of NLTK

Use it with reference to its source:

https://www.nltk.org/_modules/nltk/corpus/reader/opinion_lexicon.html



In [7]:
# imports and related

nltk_download('opinion_lexicon')

from nltk.corpus import opinion_lexicon

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.


In [8]:
# sizes of the whole lexicon, then of its negative and positive word lists
print(len(opinion_lexicon.words()))
print(len(opinion_lexicon.negative()))
print(len(opinion_lexicon.positive()))
print(opinion_lexicon.words()[:10])  # first 10 words in the order the reader returns them
print(sorted(opinion_lexicon.words())[:10])  # first 10 words in alphabetical order

6789
4783
2006
['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']
['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort']


In [9]:
def simple_opinion_test(words):
    """Print how the opinion lexicon classifies a single word.

    The lookup is an exact, case-sensitive match against the lexicon, so the
    caller is responsible for lower-casing the word first.

    Args:
        words: the word to look up (a single string, despite the plural name).

    Prints one line per matching polarity list (negative first, then
    positive), or a "not covered" line when the word is absent from the
    lexicon. Returns None.
    """
    if words in opinion_lexicon.words():
        # Each polarity list is checked independently, so a word present in
        # both lists produces both messages.
        polarity_checks = (
            (opinion_lexicon.negative, '{} is negative'),
            (opinion_lexicon.positive, '{} is positive'),
        )
        for load_members, message in polarity_checks:
            if words in load_members():
                print(message.format(words))
        return

    print('{} not covered on opinion_lexicon'.format(words))

# try a few words; the lookup is case-sensitive, so 'Great' is reported as
# not covered
simple_opinion_test('awful')
simple_opinion_test('beautiful')
simple_opinion_test('useless')
simple_opinion_test('Great')  # must be lower case
simple_opinion_test('warming')


awful is negative
beautiful is positive
useless is negative
Great not covered on opinion_lexicon
warming not covered on opinion_lexicon


### classify each review on a scale of -1 to +1

In [10]:
# collect the per-review token lists in a new frame that will hold the scores
dfc = pd.DataFrame({'word_tokenized': word_tokenized})
dfc

Unnamed: 0,word_tokenized
0,"[Recently, UBISOFT, had, to, settle, a, huge, ..."
1,"[code, did, n't, work, ,, got, me, a, refund, .]"
2,"[these, do, not, work, at, all, ,, all, i, get..."
3,"[well, let, me, start, by, saying, that, when,..."
4,"[Dont, waste, your, money, ,, you, will, just,..."
...,...
4495,"[Nice, long, micro, USB, cable, ,, battery, la..."
4496,"[I, 've, been, having, a, great, time, with, t..."
4497,[d]
4498,"[Really, pretty, ,, funny, ,, interesting, gam..."


In [0]:
# Punctuation tokens that are skipped before any lexicon lookup.
_SKIPPED_TOKENS = frozenset([',', '.', ':', "'", '"', '(', ')', '!', '?', '...'])

# Lazily filled cache holding the opinion lexicon as sets, so the word lists
# are read and converted only once instead of once per token.
_OPINION_WORD_SETS = {}


def _opinion_word_sets():
    """Return (negative, positive) frozensets of the opinion lexicon, built once."""
    if not _OPINION_WORD_SETS:
        _OPINION_WORD_SETS['negative'] = frozenset(opinion_lexicon.negative())
        _OPINION_WORD_SETS['positive'] = frozenset(opinion_lexicon.positive())
    return _OPINION_WORD_SETS['negative'], _OPINION_WORD_SETS['positive']


def score_review(l, negative_words=None, positive_words=None):
    """Classify a tokenized review as negative (-1), neutral (0) or positive (+1).

    Every token except the punctuation in ``_SKIPPED_TOKENS`` is lower-cased
    and looked up in the negative and positive word sets: a negative hit
    subtracts 1 from the running total, a positive hit adds 1. The two lookups
    are independent, so a word present in both sets contributes a net 0.

    Args:
        l: iterable of string tokens for one review (e.g. word_tokenize output).
        negative_words: optional container of lower-case negative words.
            Defaults to the negative list of the opinion lexicon.
        positive_words: optional container of lower-case positive words.
            Defaults to the positive list of the opinion lexicon.

    Returns:
        1 if the total is above zero, -1 if it is below zero, otherwise 0
        (including for an empty review).
    """
    if negative_words is None or positive_words is None:
        # Only touch the lexicon when a default is actually needed.
        default_negative, default_positive = _opinion_word_sets()
        if negative_words is None:
            negative_words = default_negative
        if positive_words is None:
            positive_words = default_positive

    total = 0
    for w in l:
        if w in _SKIPPED_TOKENS:
            continue
        word = w.lower()  # the lexicon entries are matched in lower case
        if word in negative_words:
            total -= 1
        if word in positive_words:
            total += 1

    if total > 0:
        return 1
    if total < 0:
        return -1
    return 0

# score every tokenized review: -1 (negative), 0 (neutral) or +1 (positive)
dfc['score'] = dfc['word_tokenized'].apply(score_review)

In [0]:
dfc