# 用nltk和scikit-learn里的naive bayes等分类算法，在nltk的movie_reviews语料库上做情感分类测试

In [7]:
import pickle
import random
from collections import Counter

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC, NuSVC

首先导入相关的库；pickle用于之后把训练好的分类器序列化保存。

In [9]:
# Build one (token_list, label) pair per review, walking the corpus
# category by category and, within a category, file by file.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

In [14]:
# Sanity check: number of (tokens, label) pairs that were built.
print(len(documents)) 
# Show the first pair in full: the review's token list followed by its label.
print(documents[0])

2000
(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and

以上的list comprehension读取语料库里2000篇已经tokenize好的影评：把每篇影评的word token组成一个list，再把这个list和该影评的category（positive/negative）组成一个tuple；documents里就有2000个这样的tuple。

In [15]:
# Print the label of every review in list order to show how the corpus is
# arranged. Iterating over the pairs themselves (rather than range(2000))
# keeps the cell correct if the number of documents is ever not exactly 2000.
for _tokens, label in documents:
    print(label)

neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg
neg


我们将这2000篇影评的category都print出来，发现是全部negative之后跟着全部的positive。如果直接按顺序切分training/testing set，会给之后的model training带来bias，所以需要先打乱它们的顺序，这里要用到random库里的shuffle。

In [16]:
# Shuffle in place so the label-grouped order of the corpus does not carry
# over into a positional train/test split. A dedicated generator with a fixed
# seed makes a fresh "Restart & Run All" produce the same order every time;
# note that re-running only this cell shuffles the already-shuffled list again.
random.Random(42).shuffle(documents)

In [17]:
# Frequency distribution over every lower-cased token in the whole corpus.
all_words = nltk.FreqDist(map(str.lower, movie_reviews.words()))
# Keep only the words (dropping their counts) of the 3000 most frequent entries.
word_features = [word for word, _count in all_words.most_common(3000)]

那么用什么来当作我们分类的feature呢？我们挑选了语料库里出现频率最高的3000个词。

In [18]:
# A feature is True when the corresponding vocabulary word occurs in the document.
def find_features(document, vocabulary=None):
    """Build a word-presence feature dict for one document.

    Args:
        document: Iterable of word tokens making up the document.
        vocabulary: Iterable of words to use as feature names. When omitted,
            the module-level ``word_features`` list is used (looked up at
            call time), which is what the original one-argument call did.

    Returns:
        A dict mapping each vocabulary word to ``True`` if it occurs in
        ``document`` and ``False`` otherwise, in vocabulary order.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set gives constant-time membership tests for each vocabulary word.
    present = set(document)
    return {word: (word in present) for word in vocabulary}

# Convert every (tokens, label) pair into (feature_dict, label), where
# feature_dict maps word -> True/False and label is the review's category.
featuresets = [(find_features(tokens), label) for tokens, label in documents]

将之前标注好的数据集分成training和testing set

In [20]:
# Positional split: the first 1900 featuresets train the models, the rest test them.
training_set, testing_set = featuresets[:1900], featuresets[1900:]

train the model and test

In [22]:
# Train NLTK's own Naive Bayes classifier on the (feature_dict, label) training pairs.
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [23]:
# Score the classifier on the testing slice; the result of nltk.classify.accuracy
# is multiplied by 100 so it is displayed as a percentage.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 83.0


我们用pickle保存我们之前做的分类器，下次run可以save time

In [25]:
# Serialize the trained classifier to disk. The context manager guarantees the
# file handle is closed even if pickle.dump raises part-way through.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

In [26]:
# How to load the saved classifier back and use it. The context manager closes
# the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files that
# you created yourself or otherwise trust.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

尝试一下别的分类器

In [27]:
def train_and_report(name, estimator):
    """Wrap a scikit-learn estimator for NLTK, train it and print its accuracy.

    Reads the module-level ``training_set`` and ``testing_set``.

    Args:
        name: Label used as the prefix of the printed accuracy line.
        estimator: An unfitted scikit-learn estimator instance.

    Returns:
        The trained ``SklearnClassifier`` wrapper around ``estimator``.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(name + " accuracy percent:", (nltk.classify.accuracy(wrapped, testing_set)) * 100)
    return wrapped


# Same six models, same variable names and same printed labels as before; the
# wrap/train/print steps live in one helper instead of being copy-pasted.
MNB_classifier = train_and_report("MNB_classifier", MultinomialNB())

BernoulliNB_classifier = train_and_report("BernoulliNB_classifier", BernoulliNB())

LogisticRegression_classifier = train_and_report("LogisticRegression_classifier", LogisticRegression())

SGDClassifier_classifier = train_and_report("SGDClassifier_classifier", SGDClassifier())

LinearSVC_classifier = train_and_report("LinearSVC_classifier", LinearSVC())

NuSVC_classifier = train_and_report("NuSVC_classifier", NuSVC())


Original Naive Bayes Algo accuracy percent: 83.0
Most Informative Features
             wonderfully = True              pos : neg    =     11.6 : 1.0
             outstanding = True              pos : neg    =     10.5 : 1.0
                   mulan = True              pos : neg    =      8.9 : 1.0
                  seagal = True              neg : pos    =      8.3 : 1.0
                  finest = True              pos : neg    =      7.5 : 1.0
              schumacher = True              neg : pos    =      7.1 : 1.0
                 idiotic = True              neg : pos    =      6.4 : 1.0
                   damon = True              pos : neg    =      6.3 : 1.0
                  wasted = True              neg : pos    =      6.0 : 1.0
                   jolie = True              neg : pos    =      5.8 : 1.0
             beautifully = True              pos : neg    =      5.7 : 1.0
                lebowski = True              pos : neg    =      5.6 : 1.0
                   flynt 

做一个voting机制，把这些分类器都综合起来

In [28]:
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed group of classifiers.

    Every wrapped object must provide ``classify(features)``. When several
    labels receive the same number of votes, the label that was voted first
    (in the order the classifiers were passed in) wins, so the result is
    deterministic and never raises on a tie.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that take part in the vote.

        Args:
            *classifiers: One or more objects exposing ``classify(features)``.

        Raises:
            ValueError: If no classifier is supplied; a vote over zero
                classifiers has no winner and no defined confidence.
        """
        if not classifiers:
            raise ValueError("VoteClassifier needs at least one classifier")
        self._classifiers = classifiers

    def _tally(self, features):
        """Return ``(winning_label, winning_votes, total_votes)`` for one sample."""
        votes = [c.classify(features) for c in self._classifiers]
        # most_common(1) yields the top label with its count; equal counts
        # keep first-encountered order, which gives the tie-break rule above.
        winner, winning_votes = Counter(votes).most_common(1)[0]
        return winner, winning_votes, len(votes)

    def classify(self, features):
        """Return the label chosen by the largest number of classifiers."""
        return self._tally(features)[0]

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        _winner, winning_votes, total_votes = self._tally(features)
        return winning_votes / total_votes


In [29]:
# Combine the NLTK Naive Bayes model with the six scikit-learn based models.
voted_classifier = VoteClassifier(
    classifier,
    NuSVC_classifier,
    LinearSVC_classifier,
    SGDClassifier_classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
)

# Accuracy of the ensemble on the testing slice, shown as a percentage.
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

# Show the vote and its confidence for the first two test samples.
for sample_idx in range(2):
    sample_features = testing_set[sample_idx][0]
    print("Classification:", voted_classifier.classify(sample_features), "Confidence %:",voted_classifier.confidence(sample_features)*100)


voted_classifier accuracy percent: 85.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 100.0
