In [1]:
#importing libraries
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
import collections

In [2]:
#Create list of movie review document
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]


random.shuffle(documents)

print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

counter=collections.Counter(all_words)
print(counter.most_common())
          
 

(['well', 'there', 'goes', 'another', 'one', '.', 'sadly', 'this', 'like', 'other', 'movies', 'this', 'year', 'wasn', "'", 't', 'good', '.', 'this', 'one', 'being', 'almost', 'as', 'bad', 'as', "'", 'the', 'omega', 'code', "'", 'but', 'not', 'quite', '.', 'from', 'the', 'opening', 'credits', 'i', 'had', 'a', 'good', 'feeling', 'this', 'would', 'be', 'bad', ',', 'and', 'well', 'i', 'guess', 'i', 'was', 'right', '.', 'with', 'bad', 'excuses', 'for', 'acting', ',', 'a', 'horrible', 'screenplay', 'and', 'straight', '-', 'out', 'bad', 'direction', "'", 'the', 'bachelor', "'", 'is', 'a', 'terribly', 'unfunny', 'movie', 'that', 'doesn', "'", 't', 'work', 'on', 'any', 'levels', 'accept', 'that', 'fact', 'that', 'rene', 'zellwegar', 'who', 'does', 'give', 'a', 'good', 'performance', '.', 'the', 'two', 'cameos', 'by', 'brooke', 'shields', 'and', 'mariah', 'carey', 'are', 'also', 'good', 'with', 'brooke', 'being', 'the', 'best', '.', 'the', 'movie', 'is', 'troubled', 'from', 'the', 'start', 'beca

In [3]:
#converting words to features
word_features = list(all_words.keys())[:3000]
print(word_features)



In [4]:
#we write a function that will be used to create feature set. The feature set is used to train the classifier.
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [5]:
# print one feature set.
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))



In [6]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [7]:
# split the data with:
#creating train and test dataset

training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]


In [8]:
#create pickle file:

file = open('test.pickle', 'wb')

# dump information to that file
pickle.dump(testing_set, file)

# close the file
file.close()

In [9]:
#train our classifier , using naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)
print(classifier)

<nltk.classify.naivebayes.NaiveBayesClassifier object at 0x7f97c2084668>


In [10]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)


Classifier accuracy percent: 77.0


In [11]:
#show 15 most informative features.
features= classifier.show_most_informative_features(15)
features

Most Informative Features
                   sucks = True              neg : pos    =     10.7 : 1.0
                     ugh = True              neg : pos    =      9.7 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                  welles = True              neg : pos    =      8.4 : 1.0
                  annual = True              pos : neg    =      8.3 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.5 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                  turkey = True              neg : pos    =      6.8 : 1.0
                 kidding = True              neg : pos    =      6.4 : 1.0
                 idiotic = True              neg : pos    =      6.4 : 1.0
                  suvari = True              neg : pos    =      6.4 : 1.0

In [12]:
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()