In [1]:
# p.245 10.9 感情分析器
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

In [2]:
def extract_features(words):
    return dict([word, True] for word in words)

In [3]:
fileids_pos = movie_reviews.fileids('pos')
fileids_neg = movie_reviews.fileids('neg')

features_pos = [(extract_features(movie_reviews.words(fileids=[f])),
               'Positive') for f in fileids_pos]
features_neg = [(extract_features(movie_reviews.words(fileids=[f])),
               'Negative') for f in fileids_neg]

In [4]:
threshold = 0.8
num_pos = int(threshold * len(features_pos))
num_neg = int(threshold * len(features_neg))

features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]
print('Number of training datapoints:', len(features_train))
print('Number of test datapoints:', len(features_test))

Number of training datapoints: 1600
Number of test datapoints: 400


In [5]:
classifier = NaiveBayesClassifier.train(features_train)
print('Accuracy of the classifier:',
     nltk_accuracy(classifier, features_test))

Accuracy of the classifier: 0.735


In [6]:
N = 15
print('Top ' + str(N) + ' most informative words:')

for i, item in enumerate(classifier.most_informative_features()[:N]):
    print(str(i + 1) + '. ' + item[0])

Top 15 most informative words:
1. outstanding
2. insulting
3. vulnerable
4. ludicrous
5. uninvolving
6. astounding
7. avoids
8. fascination
9. seagal
10. anna
11. animators
12. darker
13. affecting
14. symbol
15. idiotic


In [7]:
input_reviews = [
    'The costumes in this movie were great',
    'I think the story was terrible and the characters were very weak',
    'People say that the director of the movie is amazing',
    'Thies is such an idiotic movie. I will not recommend it to anyone.'
]

In [8]:
print("Movie review predictions:")
for review in input_reviews:
    print("\nReview:", review)
    features = extract_features(review.split())
    probabilities = classifier.prob_classify(features)
    predicted_sentiment = probabilities.max()
    
    print("Predicted sentiment:", predicted_sentiment)
    print("Probability:", round(probabilities.prob(predicted_sentiment), 2))

Movie review predictions:

Review: The costumes in this movie were great
Predicted sentiment: Positive
Probability: 0.59

Review: I think the story was terrible and the characters were very weak
Predicted sentiment: Negative
Probability: 0.8

Review: People say that the director of the movie is amazing
Predicted sentiment: Positive
Probability: 0.6

Review: Thies is such an idiotic movie. I will not recommend it to anyone.
Predicted sentiment: Negative
Probability: 0.87
