# Sentiment analysis
* Experimenting with NLTK's Naive Bayes classifier to label text as expressing either positive or negative sentiment
* Reference: http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/

In [35]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import pickle

In [36]:
def word_feats(words):
    """Build a bag-of-words feature dict for the classifier.

    Each distinct item in ``words`` becomes a key mapped to ``True``;
    repeated items collapse into a single entry.
    """
    return {token: True for token in words}

In [37]:
# File ids of the corpus review files in the 'pos' and 'neg' categories
posids = movie_reviews.fileids(categories='pos')
negids = movie_reviews.fileids(categories='neg')

In [38]:
# Build the labelled data fed to the classifier: one (features, label) tuple
# per review file, where features is the word_feats() dict (word -> True) of
# the words returned by movie_reviews.words(fileids=[file_id]).
def _labeled_feats(fileids, label):
    """Return a list of (word_feats dict, label) tuples, one per file id."""
    return [(word_feats(movie_reviews.words(fileids=[fid])), label)
            for fid in fileids]


negfeats = _labeled_feats(negids, 'neg')
posfeats = _labeled_feats(posids, 'pos')

In [39]:
# Use the first three quarters of each class for training; the remainder is
# held out for testing. Floor division keeps the cutoffs integral so they are
# valid slice indices on Python 3 as well as Python 2.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

In [40]:
# Training set: the leading slice of each class, negatives first.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Test set: whatever remains of each class after its cutoff.
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

# A single parenthesised argument prints the same text on Python 2 and 3.
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

train on 1500 instances, test on 500 instances


In [41]:
# Train the Naive Bayes classifier on the training feature sets.
classifier = NaiveBayesClassifier.train(trainfeats)

# Report accuracy on the held-out test feature sets. The value is formatted
# into one string so the printed line is identical on Python 2 and 3
# (print('accuracy:', x) would print a tuple on Python 2).
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))

# Show the features with the most skewed pos/neg likelihood ratios.
classifier.show_most_informative_features()

accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


# Saving and loading the trained classifier

In [42]:
# Persist the trained classifier to disk. The with-block closes the file
# even if pickling raises.
with open('my_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

In [43]:
# Reload the classifier from my_classifier.pickle. The with-block closes the
# file even if unpickling raises.
# NOTE: unpickling can execute arbitrary code; only load pickle files you
# created yourself or otherwise trust.
with open('my_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

In [44]:
# Sample thread titles from EJMR; uncomment one of the alternatives to try it.
# test_thread_title = "Economic sanctions are dumb"
# test_thread_title = "Does a taco count as a sandwich?"
test_thread_title = "White hair at a young age?"

# Tokenize the title, then convert the tokens into the same word -> True
# feature dict shape the classifier was trained on.
# NOTE(review): tokens keep their original case (e.g. 'White'); presumably the
# training vocabulary is lowercase, in which case such tokens would not match
# any learned feature -- confirm and consider lowercasing.
title_tokens = nltk.word_tokenize(test_thread_title)
test_thread_title_feat = word_feats(title_tokens)
test_thread_title_feat



{'?': True,
 'White': True,
 'a': True,
 'age': True,
 'at': True,
 'hair': True,
 'young': True}

In [45]:
# Predict the sentiment label ('pos' or 'neg') for the sample title's features
classifier.classify(test_thread_title_feat)

'pos'