In [69]:
#https://pythonprogramming.net/combine-classifier-algorithms-nltk-tutorial/?completed=/sklearn-scikit-learn-nltk-tutorial/
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [70]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

print documents[1]
random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]
#print (word_features)


([u'the', u'happy', u'bastard', u"'", u's', u'quick', u'movie', u'review', u'damn', u'that', u'y2k', u'bug', u'.', u'it', u"'", u's', u'got', u'a', u'head', u'start', u'in', u'this', u'movie', u'starring', u'jamie', u'lee', u'curtis', u'and', u'another', u'baldwin', u'brother', u'(', u'william', u'this', u'time', u')', u'in', u'a', u'story', u'regarding', u'a', u'crew', u'of', u'a', u'tugboat', u'that', u'comes', u'across', u'a', u'deserted', u'russian', u'tech', u'ship', u'that', u'has', u'a', u'strangeness', u'to', u'it', u'when', u'they', u'kick', u'the', u'power', u'back', u'on', u'.', u'little', u'do', u'they', u'know', u'the', u'power', u'within', u'.', u'.', u'.', u'going', u'for', u'the', u'gore', u'and', u'bringing', u'on', u'a', u'few', u'action', u'sequences', u'here', u'and', u'there', u',', u'virus', u'still', u'feels', u'very', u'empty', u',', u'like', u'a', u'movie', u'going', u'for', u'all', u'flash', u'and', u'no', u'substance', u'.', u'we', u'don', u"'", u't', u'know'

In [71]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]
#print training_set[0]
#print testing_set[0]

In [72]:
NB_classifier = nltk.NaiveBayesClassifier.train(training_set)
print ("NB Classifier Accuracy percent:", (nltk.classify.accuracy(NB_classifier, testing_set))*100)
NB_classifier.show_most_informative_features(20)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print ("MNB Classifier Accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print ("Bernoulli NB Classifier Accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print ("Logistic Regression Classifier Accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print ("SGDClassifier_classifier Accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print ("SVC Classifier Accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print ("LinearSVC Classifier Accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print ("NuSVC Classifier Accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)



('NB Classifier Accuracy percent:', 71.0)
Most Informative Features
               insulting = True              neg : pos    =     16.7 : 1.0
                    sans = True              neg : pos    =      8.2 : 1.0
            refreshingly = True              pos : neg    =      7.8 : 1.0
              mediocrity = True              neg : pos    =      7.5 : 1.0
                 wasting = True              neg : pos    =      7.5 : 1.0
               dismissed = True              pos : neg    =      7.1 : 1.0
                  fabric = True              pos : neg    =      6.5 : 1.0
             overwhelmed = True              pos : neg    =      6.5 : 1.0
             bruckheimer = True              neg : pos    =      6.2 : 1.0
              unoriginal = True              neg : pos    =      6.1 : 1.0
                  doubts = True              pos : neg    =      5.9 : 1.0
                    lang = True              pos : neg    =      5.8 : 1.0
                 topping = True 

In [73]:
voted_classifier = VoteClassifier(NB_classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier, 
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("Voted Classifier Accuracy Percent: ", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100)


('Voted Classifier Accuracy Percent: ', 71.0)
('Classification:', u'neg', 'Confidence %:', 100)
('Classification:', u'pos', 'Confidence %:', 0)
('Classification:', u'neg', 'Confidence %:', 100)
('Classification:', u'neg', 'Confidence %:', 100)
('Classification:', u'pos', 'Confidence %:', 0)
('Classification:', u'neg', 'Confidence %:', 0)
