## NLTK for sentiment analysis
Following:  https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

### Practice tokenizing

In [148]:
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords, state_union, gutenberg, wordnet, movie_reviews
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import pprint
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
pp = pprint.PrettyPrinter(indent=4)

In [2]:
example_text = "Next week my family and I are going to Yosemite National Park.  Ansel Adams took many great pictures of Yosemite.  We'll try to climb Half Dome.  It's a rather long hike and we hope that the weather holds.  Mr. S. Harris is my father and Mrs. M. Harris is my mother."
print(sent_tokenize(example_text))
print(word_tokenize(example_text))

['Next week my family and I are going to Yosemite National Park.', 'Ansel Adams took many great pictures of Yosemite.', "We'll try to climb Half Dome.", "It's a rather long hike and we hope that the weather holds.", 'Mr. S. Harris is my father and Mrs. M. Harris is my mother.']
['Next', 'week', 'my', 'family', 'and', 'I', 'are', 'going', 'to', 'Yosemite', 'National', 'Park', '.', 'Ansel', 'Adams', 'took', 'many', 'great', 'pictures', 'of', 'Yosemite', '.', 'We', "'ll", 'try', 'to', 'climb', 'Half', 'Dome', '.', 'It', "'s", 'a', 'rather', 'long', 'hike', 'and', 'we', 'hope', 'that', 'the', 'weather', 'holds', '.', 'Mr.', 'S.', 'Harris', 'is', 'my', 'father', 'and', 'Mrs.', 'M.', 'Harris', 'is', 'my', 'mother', '.']


### Stop words

In [3]:
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 'd',
 'did',
 'didn',
 'do',
 'does',
 'doesn',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 'has',
 'hasn',
 'have',
 'haven',
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 'it',
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 'more',
 'most',
 'mustn',
 'my',
 'myself',
 'needn',
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 'she',
 'should',
 'shouldn',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 'the',
 'their',
 'theirs',
 'them',
 

In [4]:
stops = set(stopwords.words('english'))
words = word_tokenize(example_text)
filtered = [w for w in words if w not in stops]
print('ALL WORDS: ', words)
print('FILTERED WORDS: ', filtered)

ALL WORDS:  ['Next', 'week', 'my', 'family', 'and', 'I', 'are', 'going', 'to', 'Yosemite', 'National', 'Park', '.', 'Ansel', 'Adams', 'took', 'many', 'great', 'pictures', 'of', 'Yosemite', '.', 'We', "'ll", 'try', 'to', 'climb', 'Half', 'Dome', '.', 'It', "'s", 'a', 'rather', 'long', 'hike', 'and', 'we', 'hope', 'that', 'the', 'weather', 'holds', '.', 'Mr.', 'S.', 'Harris', 'is', 'my', 'father', 'and', 'Mrs.', 'M.', 'Harris', 'is', 'my', 'mother', '.']
FILTERED WORDS:  ['Next', 'week', 'family', 'I', 'going', 'Yosemite', 'National', 'Park', '.', 'Ansel', 'Adams', 'took', 'many', 'great', 'pictures', 'Yosemite', '.', 'We', "'ll", 'try', 'climb', 'Half', 'Dome', '.', 'It', "'s", 'rather', 'long', 'hike', 'hope', 'weather', 'holds', '.', 'Mr.', 'S.', 'Harris', 'father', 'Mrs.', 'M.', 'Harris', 'mother', '.']


### Stemming words

In [5]:
ps = PorterStemmer()
example_words = ['rider', 'ride', 'riding', 'ridded', 'ridely']
for w in example_words:
    print(ps.stem(w))

rider
ride
ride
rid
ride


In [6]:
example_sentence = 'I was riding in the ridely vehicle and enjoying the ride from which I had ridded before'
for w in word_tokenize(example_sentence):
    print(ps.stem(w))

I
wa
ride
in
the
ride
vehicl
and
enjoy
the
ride
from
which
I
had
rid
befor


### Parts of speech tagging

Here is a list of the acronyms for POS tagging:

```text
POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when
```

PunktSentenceTokenizer tokenizer is capable of unsupervised machine learning, so you can actually train it on any body of text that you use.

In [7]:
train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")

In [8]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(test_text)
print(tokenized[:3])

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.']


In [9]:
def process_sentences(sents):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        # Pretty print
        pp.pprint(tagged)

mysents = tokenized[:2]
process_sentences(mysents)

[   ('PRESIDENT', 'NNP'),
    ('GEORGE', 'NNP'),
    ('W.', 'NNP'),
    ('BUSH', 'NNP'),
    ("'S", 'POS'),
    ('ADDRESS', 'NNP'),
    ('BEFORE', 'IN'),
    ('A', 'NNP'),
    ('JOINT', 'NNP'),
    ('SESSION', 'NNP'),
    ('OF', 'IN'),
    ('THE', 'NNP'),
    ('CONGRESS', 'NNP'),
    ('ON', 'NNP'),
    ('THE', 'NNP'),
    ('STATE', 'NNP'),
    ('OF', 'IN'),
    ('THE', 'NNP'),
    ('UNION', 'NNP'),
    ('January', 'NNP'),
    ('31', 'CD'),
    (',', ','),
    ('2006', 'CD'),
    ('THE', 'NNP'),
    ('PRESIDENT', 'NNP'),
    (':', ':'),
    ('Thank', 'NNP'),
    ('you', 'PRP'),
    ('all', 'DT'),
    ('.', '.')]
[   ('Mr.', 'NNP'),
    ('Speaker', 'NNP'),
    (',', ','),
    ('Vice', 'NNP'),
    ('President', 'NNP'),
    ('Cheney', 'NNP'),
    (',', ','),
    ('members', 'NNS'),
    ('of', 'IN'),
    ('Congress', 'NNP'),
    (',', ','),
    ('members', 'NNS'),
    ('of', 'IN'),
    ('the', 'DT'),
    ('Supreme', 'NNP'),
    ('Court', 'NNP'),
    ('and', 'CC'),
    ('diplomatic', 'JJ'),


### Chunking with NLTK

In [11]:
chunk_gram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
def process_sentences_regex(sents, chunk_gram):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        chunk_parser = nltk.RegexpParser(chunk_gram)
        chunked = chunk_parser.parse(tagged)
        # Interactive drawing - uncomment if care to see the tree
#         chunked.draw()
        
        
mysents = tokenized[:2]
process_sentences_regex(mysents, chunk_gram)

This line, broken down:

```text
<RB.?>* = "0 or more of any tense of adverb," followed by:

<VB.?>* = "0 or more of any tense of verb," followed by:

<NNP>+ = "One or more proper nouns," followed by

<NN>? = "zero or one singular noun."
```

In [16]:
chunk_gram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
chunk_parser = nltk.RegexpParser(chunk_gram)
def process_sentences_regex(sents, chunk_gram):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        chunked = chunk_parser.parse(tagged)
        subtreed = chunked.subtrees(filter=lambda t: t.label() == 'Chunk')
        for subtree in subtreed:
            print(subtree)
        
        
mysents = tokenized[:2]
process_sentences_regex(mysents, chunk_gram)

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)


### Chinking with NLTK

Chinking is a lot like chunking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

In [18]:
chunk_gram = r'''Chunk: {<.*>+}
                        }<VB.?|IN|DT|TO>+{'''
chunk_parser = nltk.RegexpParser(chunk_gram)
def process_sentences_regex(sents, chunk_gram):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        chunked = chunk_parser.parse(tagged)
        subtreed = chunked.subtrees(filter=lambda t: t.label() == 'Chunk')
        for subtree in subtreed:
            print(subtree)
        
        
mysents = tokenized[:2]
process_sentences_regex(mysents, chunk_gram)

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP)
(Chunk ./.)
(Chunk
  Mr./NNP
  Speaker/NNP
  ,/,
  Vice/NNP
  President/NNP
  Cheney/NNP
  ,/,
  members/NNS)
(Chunk Congress/NNP ,/, members/NNS)
(Chunk
  Supreme/NNP
  Court/NNP
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:)
(Chunk our/PRP$ nation/NN)
(Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
(Chunk America/NNP)
(Chunk its/PRP$ founding/NN ideals/NNS and/CC)
(Chunk noble/JJ dream/NN ./.)


This means we're removing one or more verbs, prepositions, determiners, or the word 'to'.

    }<VB.?|IN|DT|TO>+{

### Named entity recognition

In [32]:
train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(test_text)

def process_sentences_ne(sents, ne_filter='NE', binary=True):
    """This function finds named entities in sentences by tokenizing
       them into words, labeling the parts of speech (POS tags) and
       chunking by named entities.
       
       Parameters
       ----------
       
       sents : list of str
           List of sentences pre-tokenized
       
       ne_filter : str
           String to filter out the named entities by their tag which
           is either None or 'NE'
           
       binary : bool
          True or False.  Indicates whether to tag by simply 'NE' (True)
          or indicate the class of the named entity (False).
          
       Returns
       -------
       
       No return value.
       
    """
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        # Extract named entities
        named_ent = nltk.ne_chunk(tagged, binary=binary)
        if ne_filter is not None:
            subtreed = named_ent.subtrees(filter=lambda t: t.label() == ne_filter)
        else:
            subtreed = named_ent.subtrees()
        for subtree in subtreed:
            print(subtree)
        
        
mysents = tokenized[5:7]
process_sentences_ne(mysents, ne_filter=None, binary=False)

(S 31/CD ,/, 2006/CD ./.)
(S
  (FACILITY White/NNP)
  (ORGANIZATION House/NNP)
  photo/NN
  by/IN
  (PERSON Eric/NNP)
  DraperEvery/NNP
  time/NN
  I/PRP
  'm/VBP
  invited/JJ
  to/TO
  this/DT
  rostrum/NN
  ,/,
  I/PRP
  'm/VBP
  humbled/VBN
  by/IN
  the/DT
  privilege/NN
  ,/,
  and/CC
  mindful/NN
  of/IN
  the/DT
  history/NN
  we/PRP
  've/VBP
  seen/VBN
  together/RB
  ./.)
(FACILITY White/NNP)
(ORGANIZATION House/NNP)
(PERSON Eric/NNP)


**NE Type and Examples (returned when binary is set to False):**

```
ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian
```

### Lemmatizing

A very similar operation to stemming is called lemmatizing. The major difference between these is, as you saw earlier, stemming can often create non-existent words, whereas lemmas are actual words.

Using from `nltk.stem import WordNetLemmatizer`.

In [94]:
lemmatizer = WordNetLemmatizer()
print('tabbies -> ', lemmatizer.lemmatize('tabbies'))
print('cacti -> ', lemmatizer.lemmatize('cacti'))
print('better -> ', lemmatizer.lemmatize('better', pos='a'))
print('best -> ', lemmatizer.lemmatize('best', pos='a'))
print('ran -> ', lemmatizer.lemmatize('ran'))
print('ran -> ', lemmatizer.lemmatize('ran', pos='v'))

tabbies ->  tabby
cacti ->  cactus
better ->  good
best ->  best
ran ->  ran
ran ->  run


### Intro to the corpora

In [72]:
sample_text = gutenberg.raw("bible-kjv.txt")
tokenized = sent_tokenize(sample_text)
pp.pprint(tokenized[:3])

[   '[The King James Bible]\n'
    '\n'
    'The Old Testament of the King James Bible\n'
    '\n'
    'The First Book of Moses:  Called Genesis\n'
    '\n'
    '\n'
    '1:1 In the beginning God created the heaven and the earth.',
    '1:2 And the earth was without form, and void; and darkness was upon\n'
    'the face of the deep.',
    'And the Spirit of God moved upon the face of the\nwaters.']


For a list of the corpora see http://www.nltk.org/book/ch02.html#tab-corpora.

### WordNet

WordNet is a lexical database for the English language, which was created by Princeton, and is part of the NLTK corpus.

You can use WordNet alongside the NLTK module to find the meanings of words, synonyms, antonyms, and more. 

In [82]:
syns = wordnet.synsets("program")
# Synset
synset = syns[0]
print(synset.name())
# Just the word
print(synset.lemmas()[0].name())
# Definition
print(synset.definition())
# Examples
print(synset.examples())

plan.n.01
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [90]:
_synsets = wordnet.synsets('good')
synonyms = []
antonyms = []

for syn in _synsets:
    for l in syn.lemmas():
        synonyms.append(l.name())
        ants = l.antonyms()
        if ants:
            antonyms.append(ants[0].name())
            
print('SYNONYMS: ', set(synonyms))
print('ANTONYMS: ', set(antonyms))

SYNONYMS:  {'salutary', 'honest', 'unspoilt', 'unspoiled', 'soundly', 'dependable', 'skilful', 'serious', 'well', 'adept', 'upright', 'trade_good', 'secure', 'effective', 'honorable', 'commodity', 'ripe', 'right', 'sound', 'skillful', 'estimable', 'in_effect', 'full', 'beneficial', 'safe', 'good', 'proficient', 'just', 'respectable', 'goodness', 'expert', 'in_force', 'practiced', 'thoroughly', 'dear', 'near', 'undecomposed'}
ANTONYMS:  {'bad', 'badness', 'evilness', 'evil', 'ill'}


Next, we can also easily use WordNet to compare the similarity of two words and their tenses, by incorporating the Wu and Palmer method for semantic related-ness.

In [97]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [98]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('sail.n.01')
print(w1.wup_similarity(w2))

0.5263157894736842


In [99]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('truck.n.01')
print(w1.wup_similarity(w2))

0.6956521739130435


In [100]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


### Text classification and converting words to features

In [None]:
# In each category (pos/neg) take all file ids (reviews), and
# store the tokenized words for that file id along with
# pos/neg label:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents for training and testing
random.shuffle(documents)

In [None]:
# First document in the list
print(documents[0])

# Make a list of all words, lowercase
all_words = [w.lower() for w in movie_reviews.words()]

In [106]:
# Frequency distribution
all_words = nltk.FreqDist(all_words)

In [110]:
# Top 15 words
print(all_words.most_common(15))

# Count of a word
print(all_words['cat'])

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
59


In [None]:
# Top someodd words
word_features = list(all_words.keys())[:3000]

# Returns a dict with the word and its presence (bool) in document
def find_features(document):
    words = set(document)
    features = {}
    features = {w: w in words for w in word_features}
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
# List of words and their presence in document along with label for that document
featuresets = [(find_features(rev), category) for (rev, category) in documents]
print(featuresets[0])

### Training and saving naive Bayes classifier model

In [135]:
# Split into training and test set

training_set, test_set = featuresets[:1900], featuresets[1900:]

# Train
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Test
print("Classifier accuracy  = ", (nltk.classify.accuracy(classifier, test_set)))

Classifier accuracy  =  0.66


In [122]:
# What are the most valuable words when it comes to the neg/pos labels
# You'll see the ratio of occurrences neg:pos or pos:neg, depending, to 
# see how often a word appears in one label as compared to the other.
classifier.show_most_informative_features(15)

Most Informative Features
               atrocious = True              neg : pos    =     11.3 : 1.0
               ludicrous = True              neg : pos    =     10.4 : 1.0
                    gump = True              pos : neg    =      9.3 : 1.0
              incoherent = True              neg : pos    =      8.1 : 1.0
                 layered = True              pos : neg    =      7.9 : 1.0
               furniture = True              neg : pos    =      6.8 : 1.0
                musicals = True              pos : neg    =      6.5 : 1.0
                  skimpy = True              neg : pos    =      6.1 : 1.0
                  osment = True              pos : neg    =      5.8 : 1.0
                   flynt = True              pos : neg    =      5.8 : 1.0
                  rebels = True              pos : neg    =      5.8 : 1.0
            surveillance = True              neg : pos    =      5.5 : 1.0
                   dreck = True              neg : pos    =      5.5 : 1.0

Save the model with pickle

In [124]:
with open('naivebayes.pickle', 'wb') as f:
    pickle.dump(classifier, f)

Reload model

In [125]:
with open('naivebayes.pickle', 'rb') as f:
    classifier = pickle.load(f)

In [132]:
# Test on one document (#5) without the label (just the first element of our tuple)
classifier.classify(test_set[4][0])

'neg'

### NLTK and Scikit-Learn
There's an API made by the nltk folks for leveraging sklearn.

In [136]:
mnb_classifer = SklearnClassifier(MultinomialNB())
mnb_classifer.train(training_set)
print('MultinomialNB accuracy = ',
      nltk.classify.accuracy(mnb_classifer, test_set))

MultinomialNB accuracy =  0.69


In [146]:
bnb_classifer = SklearnClassifier(BernoulliNB())
bnb_classifer.train(training_set)
print('BernoulliNB accuracy = ',
      nltk.classify.accuracy(bnb_classifer, test_set))

BernoulliNB accuracy =  0.65


In [140]:
lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(training_set)
print('LogisticRegression accuracy = ',
      nltk.classify.accuracy(lr_classifier, test_set))

LogisticRegression accuracy =  0.71


In [141]:
sgd_classifier = SklearnClassifier(SGDClassifier())
sgd_classifier.train(training_set)
print('SGDClassifier accuracy = ',
      nltk.classify.accuracy(sgd_classifier, test_set))

SGDClassifier accuracy =  0.59


In [143]:
svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(training_set)
print('SVC accuracy = ',
      nltk.classify.accuracy(svc_classifier, test_set))

SVC accuracy =  0.36


In [144]:
linearsvc_classifier = SklearnClassifier(LinearSVC())
linearsvc_classifier.train(training_set)
print('LinearSVC accuracy = ',
      nltk.classify.accuracy(linearsvc_classifier, test_set))

LinearSVC accuracy =  0.65


In [145]:
nusvc_classifier = SklearnClassifier(NuSVC())
nusvc_classifier.train(training_set)
print('NuSVC accuracy = ',
      nltk.classify.accuracy(nusvc_classifier, test_set))

NuSVC accuracy =  0.64


### Combining algos

In [160]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        """Iterates through classifiers and classifies
           based on features, returning the most popular "vote"
           or classification."""
        votes = [c.classify(features) for c in self._classifiers]
        try:
            m = mode(votes)
        except Exception as e:
            m = None
        return m
    
    def vote_ratio(self, features):
        """For or against."""
        votes = [c.classify(features) for c in self._classifiers]
        try:
            vote_ratio = votes.count(mode(votes)) / len(votes)
        except Exception as e:
            vote_ratio = None
        return vote_ratio

def find_features(document):
    """Returns a dict with the word and its presence (bool) in document"""
    words = set(document)
    features = {}
    features = {w: w in words for w in word_features}
    return features
   

#### Putting it all together

In [164]:
     
# In each category (pos/neg) take all file ids (reviews), and
# store the tokenized words for that file id along with
# pos/neg label:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents for training and testing
random.shuffle(documents)

# List of words in movie reviews
all_words = [w.lower() for w in movie_reviews.words()]

# Frequency distribution
all_words = nltk.FreqDist(all_words)

# Top someodd words
word_features = list(all_words.keys())[:3000]

# List of words and their presence in document along with label for that document
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Split into test and training data sets
training_set, test_set = featuresets[:1900], featuresets[1900:]

# Reload model
with open('naivebayes.pickle', 'rb') as f:
    classifier = pickle.load(f)
    

# Train and test
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy  = ", (nltk.classify.accuracy(classifier, test_set)))


mnb_classifer = SklearnClassifier(MultinomialNB())
mnb_classifer.train(training_set)
print('MultinomialNB accuracy = ',
      nltk.classify.accuracy(mnb_classifer, test_set))
                                  
bnb_classifer = SklearnClassifier(BernoulliNB())
bnb_classifer.train(training_set)
print('BernoulliNB accuracy = ',
      nltk.classify.accuracy(bnb_classifer, test_set))

lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(training_set)
print('LogisticRegression accuracy = ',
      nltk.classify.accuracy(lr_classifier, test_set))

sgd_classifier = SklearnClassifier(SGDClassifier())
sgd_classifier.train(training_set)
print('SGDClassifier accuracy = ',
      nltk.classify.accuracy(sgd_classifier, test_set))
                                  
svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(training_set)
print('SVC accuracy = ',
      nltk.classify.accuracy(svc_classifier, test_set))
                                  
linearsvc_classifier = SklearnClassifier(LinearSVC())
linearsvc_classifier.train(training_set)
print('LinearSVC accuracy = ',
      nltk.classify.accuracy(linearsvc_classifier, test_set))
                                  
nusvc_classifier = SklearnClassifier(NuSVC())
nusvc_classifier.train(training_set)
print('NuSVC accuracy = ',
      nltk.classify.accuracy(nusvc_classifier, test_set))

# Voting

voted_classifier = VoteClassifier(classifier,
                                  mnb_classifer,
                                  bnb_classifer,
                                  lr_classifier,
                                  sgd_classifier,
                                  svc_classifier,
                                  linearsvc_classifier,
                                  nusvc_classifier)

print('voted_classifier accuracy = ', (nltk.classify.accuracy(voted_classifier, test_set)))
print("Classification:", voted_classifier.classify(test_set[0][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[0][0]))
print("Classification:", voted_classifier.classify(test_set[1][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[1][0]))
print("Classification:", voted_classifier.classify(test_set[2][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[2][0]))
print("Classification:", voted_classifier.classify(test_set[3][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[3][0]))
print("Classification:", voted_classifier.classify(test_set[4][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[4][0]))
print("Classification:", voted_classifier.classify(test_set[5][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[5][0]))

Classifier accuracy  =  0.64
MultinomialNB accuracy =  0.7
BernoulliNB accuracy =  0.64
LogisticRegression accuracy =  0.67
SGDClassifier accuracy =  0.65
SVC accuracy =  0.48
LinearSVC accuracy =  0.63
NuSVC accuracy =  0.67
voted_classifier accuracy =  0.58
Classification: neg Vote ratio: 0.75
Classification: None Vote ratio: None
Classification: pos Vote ratio: 0.875
Classification: pos Vote ratio: 1.0
Classification: neg Vote ratio: 0.875
Classification: neg Vote ratio: 0.875
