## NLTK for sentiment analysis
Following:  https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

### Practice tokenizing

In [21]:
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords, state_union, gutenberg, wordnet, movie_reviews
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import pprint
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
pp = pprint.PrettyPrinter(indent=4)

In [20]:
example_text = "Bayes’ theorem was named after Thomas Bayes (1701–1761), who studied how to compute a distribution for the probability parameter of a binomial distribution (in modern terminology). Bayes’ unpublished manuscript was significantly edited by Richard Price before it was posthumously read at the Royal Society. Price edited[3] Bayes’ major work “An Essay towards solving a Problem in the Doctrine of Chances” (1763), which appeared in “Philosophical Transactions,”[4] and contains Bayes’ Theorem. Price wrote an introduction to the paper which provides some of the philosophical basis of Bayesian statistics. In 1765 he was elected a Fellow of the Royal Society in recognition of his work on the legacy of Bayes."
print(sent_tokenize(example_text))
print(word_tokenize(example_text))

['Bayes’ theorem was named after Thomas Bayes (1701–1761), who studied how to compute a distribution for the probability parameter of a binomial distribution (in modern terminology).', 'Bayes’ unpublished manuscript was significantly edited by Richard Price before it was posthumously read at the Royal Society.', 'Price edited[3] Bayes’ major work “An Essay towards solving a Problem in the Doctrine of Chances” (1763), which appeared in “Philosophical Transactions,”[4] and contains Bayes’ Theorem.', 'Price wrote an introduction to the paper which provides some of the philosophical basis of Bayesian statistics.', 'In 1765 he was elected a Fellow of the Royal Society in recognition of his work on the legacy of Bayes.']
['Bayes', '’', 'theorem', 'was', 'named', 'after', 'Thomas', 'Bayes', '(', '1701–1761', ')', ',', 'who', 'studied', 'how', 'to', 'compute', 'a', 'distribution', 'for', 'the', 'probability', 'parameter', 'of', 'a', 'binomial', 'distribution', '(', 'in', 'modern', 'terminology

### Stop words

In [None]:
set(stopwords.words('english'))

In [None]:
stops = set(stopwords.words('english'))
words = word_tokenize(example_text)
filtered = [w for w in words if w not in stops]
print('ALL WORDS: ', words)
print('FILTERED WORDS: ', filtered)

### Stemming words

In [None]:
ps = PorterStemmer()
example_words = ['rider', 'ride', 'riding', 'ridded', 'ridely']
for w in example_words:
    print(ps.stem(w))

In [None]:
example_sentence = 'I was riding in the ridely vehicle and enjoying the ride from which I had ridded before'
for w in word_tokenize(example_sentence):
    print(ps.stem(w))

### Parts of speech tagging

Here is a list of the acronyms for POS tagging:

```text
POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when
```

PunktSentenceTokenizer tokenizer is capable of unsupervised machine learning, so you can actually train it on any body of text that you use.

In [None]:
train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")

In [None]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(test_text)
print(tokenized[:3])

In [None]:
def process_sentences(sents):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        # Pretty print
        pp.pprint(tagged)

mysents = tokenized[:2]
process_sentences(mysents)

### Chunking with NLTK

In [None]:
chunk_gram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
def process_sentences_regex(sents, chunk_gram):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        chunk_parser = nltk.RegexpParser(chunk_gram)
        chunked = chunk_parser.parse(tagged)
        # Interactive drawing - uncomment if care to see the tree
#         chunked.draw()
        
        
mysents = tokenized[:2]
process_sentences_regex(mysents, chunk_gram)

This line, broken down:

```text
<RB.?>* = "0 or more of any tense of adverb," followed by:

<VB.?>* = "0 or more of any tense of verb," followed by:

<NNP>+ = "One or more proper nouns," followed by

<NN>? = "zero or one singular noun."
```

In [None]:
chunk_gram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
chunk_parser = nltk.RegexpParser(chunk_gram)
def process_sentences_regex(sents, chunk_gram):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        chunked = chunk_parser.parse(tagged)
        subtreed = chunked.subtrees(filter=lambda t: t.label() == 'Chunk')
        for subtree in subtreed:
            print(subtree)
        
        
mysents = tokenized[:2]
process_sentences_regex(mysents, chunk_gram)

### Chinking with NLTK

Chinking is a lot like chunking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

In [None]:
chunk_gram = r'''Chunk: {<.*>+}
                        }<VB.?|IN|DT|TO>+{'''
chunk_parser = nltk.RegexpParser(chunk_gram)
def process_sentences_regex(sents, chunk_gram):
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        chunked = chunk_parser.parse(tagged)
        subtreed = chunked.subtrees(filter=lambda t: t.label() == 'Chunk')
        for subtree in subtreed:
            print(subtree)
        
        
mysents = tokenized[:2]
process_sentences_regex(mysents, chunk_gram)

This means we're removing one or more verbs, prepositions, determiners, or the word 'to'.

    }<VB.?|IN|DT|TO>+{

### Named entity recognition

In [None]:
train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(test_text)

def process_sentences_ne(sents, ne_filter='NE', binary=True):
    """This function finds named entities in sentences by tokenizing
       them into words, labeling the parts of speech (POS tags) and
       chunking by named entities.
       
       Parameters
       ----------
       
       sents : list of str
           List of sentences pre-tokenized
       
       ne_filter : str
           String to filter out the named entities by their tag which
           is either None or 'NE'
           
       binary : bool
          True or False.  Indicates whether to tag by simply 'NE' (True)
          or indicate the class of the named entity (False).
          
       Returns
       -------
       
       No return value.
       
    """
    for sent in sents:
        # Tokenize by word and get POS tags for each
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        # Extract named entities
        named_ent = nltk.ne_chunk(tagged, binary=binary)
        if ne_filter is not None:
            subtreed = named_ent.subtrees(filter=lambda t: t.label() == ne_filter)
        else:
            subtreed = named_ent.subtrees()
        for subtree in subtreed:
            print(subtree)
        
        
mysents = tokenized[5:7]
process_sentences_ne(mysents, ne_filter=None, binary=False)

**NE Type and Examples (returned when binary is set to False):**

```
ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian
```

### Lemmatizing

A very similar operation to stemming is called lemmatizing. The major difference between these is, as you saw earlier, stemming can often create non-existent words, whereas lemmas are actual words.

Using from `nltk.stem import WordNetLemmatizer`.

In [None]:
lemmatizer = WordNetLemmatizer()
print('tabbies -> ', lemmatizer.lemmatize('tabbies'))
print('cacti -> ', lemmatizer.lemmatize('cacti'))
print('better -> ', lemmatizer.lemmatize('better', pos='a'))
print('best -> ', lemmatizer.lemmatize('best', pos='a'))
print('ran -> ', lemmatizer.lemmatize('ran'))
print('ran -> ', lemmatizer.lemmatize('ran', pos='v'))

### Intro to the corpora

In [None]:
sample_text = gutenberg.raw("bible-kjv.txt")
tokenized = sent_tokenize(sample_text)
pp.pprint(tokenized[:3])

For a list of the corpora see http://www.nltk.org/book/ch02.html#tab-corpora.

### WordNet

WordNet is a lexical database for the English language, which was created by Princeton, and is part of the NLTK corpus.

You can use WordNet alongside the NLTK module to find the meanings of words, synonyms, antonyms, and more. 

In [None]:
syns = wordnet.synsets("program")
# Synset
synset = syns[0]
print(synset.name())
# Just the word
print(synset.lemmas()[0].name())
# Definition
print(synset.definition())
# Examples
print(synset.examples())

In [None]:
_synsets = wordnet.synsets('good')
synonyms = []
antonyms = []

for syn in _synsets:
    for l in syn.lemmas():
        synonyms.append(l.name())
        ants = l.antonyms()
        if ants:
            antonyms.append(ants[0].name())
            
print('SYNONYMS: ', set(synonyms))
print('ANTONYMS: ', set(antonyms))

Next, we can also easily use WordNet to compare the similarity of two words and their tenses, by incorporating the Wu and Palmer method for semantic related-ness.

In [None]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

In [None]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('sail.n.01')
print(w1.wup_similarity(w2))

In [None]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('truck.n.01')
print(w1.wup_similarity(w2))

In [None]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

### Text classification and converting words to features

In [None]:
# In each category (pos/neg) take all file ids (reviews), and
# store the tokenized words for that file id along with
# pos/neg label:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents for training and testing
random.shuffle(documents)

In [None]:
# First document in the list
print(documents[0])

# Make a list of all words, lowercase
all_words = [w.lower() for w in movie_reviews.words()]

In [None]:
# Frequency distribution
all_words = nltk.FreqDist(all_words)

In [None]:
# Top 15 words
print(all_words.most_common(15))

# Count of a word
print(all_words['cat'])

In [None]:
# Top someodd words
word_features = list(all_words.keys())[:3000]

# Returns a dict with the word and its presence (bool) in document
def find_features(document):
    words = set(document)
    features = {}
    features = {w: w in words for w in word_features}
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
# List of words and their presence in document along with label for that document
featuresets = [(find_features(rev), category) for (rev, category) in documents]
print(featuresets[0])

### Training and saving naive Bayes classifier model

In [None]:
# Split into training and test set

training_set, test_set = featuresets[:1900], featuresets[1900:]

# Train
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Test
print("Classifier accuracy  = ", (nltk.classify.accuracy(classifier, test_set)))

In [None]:
# What are the most valuable words when it comes to the neg/pos labels
# You'll see the ratio of occurrences neg:pos or pos:neg, depending, to 
# see how often a word appears in one label as compared to the other.
classifier.show_most_informative_features(15)

Save the model with pickle

In [None]:
with open('naivebayes.pickle', 'wb') as f:
    pickle.dump(classifier, f)

Reload model

In [None]:
with open('naivebayes.pickle', 'rb') as f:
    classifier = pickle.load(f)

In [None]:
# Test on one document (#5) without the label (just the first element of our tuple)
classifier.classify(test_set[4][0])

### NLTK and Scikit-Learn
There's an API made by the nltk folks for leveraging sklearn.

In [None]:
mnb_classifer = SklearnClassifier(MultinomialNB())
mnb_classifer.train(training_set)
print('MultinomialNB accuracy = ',
      nltk.classify.accuracy(mnb_classifer, test_set))

In [None]:
bnb_classifer = SklearnClassifier(BernoulliNB())
bnb_classifer.train(training_set)
print('BernoulliNB accuracy = ',
      nltk.classify.accuracy(bnb_classifer, test_set))

In [None]:
lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(training_set)
print('LogisticRegression accuracy = ',
      nltk.classify.accuracy(lr_classifier, test_set))

In [None]:
sgd_classifier = SklearnClassifier(SGDClassifier())
sgd_classifier.train(training_set)
print('SGDClassifier accuracy = ',
      nltk.classify.accuracy(sgd_classifier, test_set))

In [None]:
svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(training_set)
print('SVC accuracy = ',
      nltk.classify.accuracy(svc_classifier, test_set))

In [None]:
linearsvc_classifier = SklearnClassifier(LinearSVC())
linearsvc_classifier.train(training_set)
print('LinearSVC accuracy = ',
      nltk.classify.accuracy(linearsvc_classifier, test_set))

In [None]:
nusvc_classifier = SklearnClassifier(NuSVC())
nusvc_classifier.train(training_set)
print('NuSVC accuracy = ',
      nltk.classify.accuracy(nusvc_classifier, test_set))

### Combining algos

In [None]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        """Iterates through classifiers and classifies
           based on features, returning the most popular "vote"
           or classification."""
        votes = [c.classify(features) for c in self._classifiers]
        try:
            m = mode(votes)
        except Exception as e:
            m = None
        return m
    
    def vote_ratio(self, features):
        """For or against."""
        votes = [c.classify(features) for c in self._classifiers]
        try:
            vote_ratio = votes.count(mode(votes)) / len(votes)
        except Exception as e:
            vote_ratio = None
        return vote_ratio

def find_features(document):
    """Returns a dict with the word and its presence (bool) in document"""
    words = set(document)
    features = {}
    features = {w: w in words for w in word_features}
    return features
   

#### Putting it all together

In [17]:
     
# In each category (pos/neg) take all file ids (reviews), and
# store the tokenized words for that file id along with
# pos/neg label:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents for training and testing
random.shuffle(documents)

# List of words in movie reviews
all_words = [w.lower() for w in movie_reviews.words()]

# Frequency distribution
all_words = nltk.FreqDist(all_words)

# Top someodd words
word_features = list(all_words.keys())[:3000]

# List of words and their presence in document along with label for that document
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Split into test and training data sets
training_set, test_set = featuresets[:1900], featuresets[1900:]

# Reload model
with open('naivebayes.pickle', 'rb') as f:
    classifier = pickle.load(f)
    

# Train and test
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy  = ", (nltk.classify.accuracy(classifier, test_set)))


mnb_classifer = SklearnClassifier(MultinomialNB())
mnb_classifer.train(training_set)
print('MultinomialNB accuracy = ',
      nltk.classify.accuracy(mnb_classifer, test_set))
                                  
bnb_classifer = SklearnClassifier(BernoulliNB())
bnb_classifer.train(training_set)
print('BernoulliNB accuracy = ',
      nltk.classify.accuracy(bnb_classifer, test_set))

lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(training_set)
print('LogisticRegression accuracy = ',
      nltk.classify.accuracy(lr_classifier, test_set))

sgd_classifier = SklearnClassifier(SGDClassifier())
sgd_classifier.train(training_set)
print('SGDClassifier accuracy = ',
      nltk.classify.accuracy(sgd_classifier, test_set))
                                  
svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(training_set)
print('SVC accuracy = ',
      nltk.classify.accuracy(svc_classifier, test_set))
                                  
linearsvc_classifier = SklearnClassifier(LinearSVC())
linearsvc_classifier.train(training_set)
print('LinearSVC accuracy = ',
      nltk.classify.accuracy(linearsvc_classifier, test_set))
                                  
nusvc_classifier = SklearnClassifier(NuSVC())
nusvc_classifier.train(training_set)
print('NuSVC accuracy = ',
      nltk.classify.accuracy(nusvc_classifier, test_set))

# Voting

voted_classifier = VoteClassifier(classifier,
                                  mnb_classifer,
                                  bnb_classifer,
                                  lr_classifier,
                                  sgd_classifier,
                                  svc_classifier,
                                  linearsvc_classifier,
                                  nusvc_classifier)

print('voted_classifier accuracy = ', (nltk.classify.accuracy(voted_classifier, test_set)))
print("Classification:", voted_classifier.classify(test_set[0][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[0][0]))
print("Classification:", voted_classifier.classify(test_set[1][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[1][0]))
print("Classification:", voted_classifier.classify(test_set[2][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[2][0]))
print("Classification:", voted_classifier.classify(test_set[3][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[3][0]))
print("Classification:", voted_classifier.classify(test_set[4][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[4][0]))
print("Classification:", voted_classifier.classify(test_set[5][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[5][0]))

Classifier accuracy  =  0.64
MultinomialNB accuracy =  0.63
BernoulliNB accuracy =  0.65
LogisticRegression accuracy =  0.67
SGDClassifier accuracy =  0.64
SVC accuracy =  0.45
LinearSVC accuracy =  0.66
NuSVC accuracy =  0.63
voted_classifier accuracy =  0.63
Classification: pos Vote ratio: 1.0
Classification: pos Vote ratio: 0.875
Classification: pos Vote ratio: 0.875
Classification: neg Vote ratio: 0.875
Classification: pos Vote ratio: 0.875
Classification: neg Vote ratio: 0.875


### Short reviews - full code sample

In [19]:
with open('../data/positive.txt', encoding='latin-1', mode='r') as f:
    short_pos = f.read()
with open('../data/negative.txt', encoding='latin-1', mode='r') as f:
    short_neg = f.read()
    
documents = []

for r in short_pos.split('\n'):
    documents.append( (r, 'pos') )
for r in short_neg.split('\n'):
    documents.append( (r, 'neg') )

all_words = []

short_pos_words = word_tokenize(short_pos.replace('\n', ' '))
short_neg_words = word_tokenize(short_neg.replace('\n', ' '))

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        """Iterates through classifiers and classifies
           based on features, returning the most popular "vote"
           or classification."""
        votes = [c.classify(features) for c in self._classifiers]
        try:
            m = mode(votes)
        except Exception as e:
            m = None
        return m
    
    def vote_ratio(self, features):
        """For or against."""
        votes = [c.classify(features) for c in self._classifiers]
        try:
            vote_ratio = votes.count(mode(votes)) / len(votes)
        except Exception as e:
            vote_ratio = None
        return vote_ratio

def find_features(document):
    """Returns a dict with the word and its presence (bool) in document"""
    words = set(document)
    features = {}
    features = {w: w in words for w in word_features}
    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)

# Split into test and training data sets
training_set, test_set = featuresets[:4000], featuresets[4000:]

# Reload model
with open('naivebayes.pickle', 'rb') as f:
    classifier = pickle.load(f)
    

# Train and test
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy  = ", (nltk.classify.accuracy(classifier, test_set)))
classifier.show_most_informative_features(15)

mnb_classifer = SklearnClassifier(MultinomialNB())
mnb_classifer.train(training_set)
print('MultinomialNB accuracy = ',
      nltk.classify.accuracy(mnb_classifer, test_set))
                                  
bnb_classifer = SklearnClassifier(BernoulliNB())
bnb_classifer.train(training_set)
print('BernoulliNB accuracy = ',
      nltk.classify.accuracy(bnb_classifer, test_set))

lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(training_set)
print('LogisticRegression accuracy = ',
      nltk.classify.accuracy(lr_classifier, test_set))

sgd_classifier = SklearnClassifier(SGDClassifier())
sgd_classifier.train(training_set)
print('SGDClassifier accuracy = ',
      nltk.classify.accuracy(sgd_classifier, test_set))
                                  
svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(training_set)
print('SVC accuracy = ',
      nltk.classify.accuracy(svc_classifier, test_set))
                                  
linearsvc_classifier = SklearnClassifier(LinearSVC())
linearsvc_classifier.train(training_set)
print('LinearSVC accuracy = ',
      nltk.classify.accuracy(linearsvc_classifier, test_set))
                                  
nusvc_classifier = SklearnClassifier(NuSVC())
nusvc_classifier.train(training_set)
print('NuSVC accuracy = ',
      nltk.classify.accuracy(nusvc_classifier, test_set))

# Voting

voted_classifier = VoteClassifier(classifier,
                                  mnb_classifer,
                                  bnb_classifer,
                                  lr_classifier,
                                  sgd_classifier,
                                  svc_classifier,
                                  linearsvc_classifier,
                                  nusvc_classifier)

print('voted_classifier accuracy = ', (nltk.classify.accuracy(voted_classifier, test_set)))

print("A few test sample predictions...")
print("Classification:", voted_classifier.classify(test_set[0][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[0][0]))
print("Classification:", voted_classifier.classify(test_set[1][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[1][0]))
print("Classification:", voted_classifier.classify(test_set[2][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[2][0]))
print("Classification:", voted_classifier.classify(test_set[3][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[3][0]))
print("Classification:", voted_classifier.classify(test_set[4][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[4][0]))
print("Classification:", voted_classifier.classify(test_set[5][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[5][0]))

Classifier accuracy  =  0.5078031212484994
Most Informative Features
                       4 = True              neg : pos    =      2.5 : 1.0
                       a = False             neg : pos    =      1.8 : 1.0
                       1 = True              neg : pos    =      1.6 : 1.0
                       ! = True              neg : pos    =      1.6 : 1.0
                       r = False             neg : pos    =      1.5 : 1.0
                       9 = True              neg : pos    =      1.4 : 1.0
                       6 = True              neg : pos    =      1.3 : 1.0
                       [ = True              neg : pos    =      1.2 : 1.0
                       u = False             neg : pos    =      1.1 : 1.0
                       o = False             pos : neg    =      1.1 : 1.0
                       : = True              pos : neg    =      1.1 : 1.0
                       u = True              pos : neg    =      1.0 : 1.0
                       1 = Fals

In [16]:
print("A few test sample predictions...")
print("Classification:", voted_classifier.classify(test_set[10][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[0][0]))
print(test_set[0][1])
print("Classification:", voted_classifier.classify(test_set[11][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[1][0]))
print(test_set[1][1])
print("Classification:", voted_classifier.classify(test_set[12][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[2][0]))
print(test_set[2][1])
print("Classification:", voted_classifier.classify(test_set[13][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[3][0]))
print(test_set[3][1])
print("Classification:", voted_classifier.classify(test_set[14][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[4][0]))
print(test_set[4][1])
print("Classification:", voted_classifier.classify(test_set[15][0]), "Vote ratio:",voted_classifier.vote_ratio(test_set[5][0]))
print(test_set[5][1])

A few test sample predictions...
Classification: neg Vote ratio: 0.625
neg
Classification: neg Vote ratio: 0.625
pos
Classification: neg Vote ratio: 0.625
pos
Classification: neg Vote ratio: 0.625
pos
Classification: neg Vote ratio: 0.625
pos
Classification: neg Vote ratio: 0.625
pos
