In [17]:
#sentiment analyzer

In [18]:
import nltk
import random
from nltk.corpus import movie_reviews, stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import word_tokenize

In [19]:
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\mshza\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mshza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mshza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)),category))

random.shuffle(documents)

print("total documents:", len(documents))

total documents: 2000


In [21]:
#text cleaning

stop_words = set(stopwords.words('english'))

def clean_text(words):
    words = [w.lower() for w in words]
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if w not in stop_words]

    return words

In [22]:
#feature extraction

all_words = []

for words, _ in documents:
    all_words.extend(clean_text(words))

all_words_freq = nltk.FreqDist(all_words)

word_features = list(all_words_freq.keys())[:8000]

In [23]:
def exact_features(document_words):
    document_words = set(clean_text(document_words))

    features = {}

    for word in word_features:
        features[word] = (word in document_words)

    return features

In [24]:
#create training and testing data

featuresets = [(exact_features(words), label) for (words, label) in documents]

train_set = featuresets[:1600]
test_set = featuresets[1600:]

In [25]:
classifier = NaiveBayesClassifier.train(train_set)


In [26]:
print("Accuracy:", accuracy(classifier, test_set))


Accuracy: 0.8175


In [27]:
classifier.show_most_informative_features(10)


Most Informative Features
             beautifully = True              pos : neg    =     12.2 : 1.0
               ludicrous = True              neg : pos    =     12.2 : 1.0
             outstanding = True              pos : neg    =     12.0 : 1.0
               maintains = True              pos : neg    =     11.3 : 1.0
                 idiotic = True              neg : pos    =     10.9 : 1.0
                  seagal = True              neg : pos    =     10.6 : 1.0
                  darker = True              pos : neg    =     10.0 : 1.0
                  elliot = True              pos : neg    =     10.0 : 1.0
            lighthearted = True              pos : neg    =     10.0 : 1.0
               strongest = True              pos : neg    =     10.0 : 1.0


In [28]:
import pickle

with open("sentiment_model_1.pkl", "wb") as f:
    pickle.dump(classifier, f)

with open("word_features_1.pkl","wb") as f:
    pickle.dump(word_features,f)