# Sentiment analysis
* Experimenting with NLTK's Naive Bayes classifier to label a piece of text as expressing either a positive or a negative sentiment
* reference: http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/

In [21]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import pickle

In [20]:
def word_feats(words):
    """Build a bag-of-words feature dict for the classifier.

    Args:
        words: iterable of hashable tokens (e.g. the words of one review).

    Returns:
        dict mapping every distinct token to ``True``. Duplicate tokens
        collapse into one key; an empty iterable gives an empty dict.
    """
    # A dict comprehension avoids building a throwaway list of tuples first.
    return {word: True for word in words}

In [10]:
# File ids of the corpus text files, one list per review category
# ('neg' = negative reviews, 'pos' = positive reviews).
negids, posids = movie_reviews.fileids('neg'), movie_reviews.fileids('pos')

[u'neg/cv000_29416.txt',
 u'neg/cv001_19502.txt',
 u'neg/cv002_17424.txt',
 u'neg/cv003_12683.txt',
 u'neg/cv004_12641.txt',
 u'neg/cv005_29357.txt',
 u'neg/cv006_17022.txt',
 u'neg/cv007_4992.txt',
 u'neg/cv008_29326.txt',
 u'neg/cv009_29417.txt',
 u'neg/cv010_29063.txt',
 u'neg/cv011_13044.txt',
 u'neg/cv012_29411.txt',
 u'neg/cv013_10494.txt',
 u'neg/cv014_15600.txt',
 u'neg/cv015_29356.txt',
 u'neg/cv016_4348.txt',
 u'neg/cv017_23487.txt',
 u'neg/cv018_21672.txt',
 u'neg/cv019_16117.txt',
 u'neg/cv020_9234.txt',
 u'neg/cv021_17313.txt',
 u'neg/cv022_14227.txt',
 u'neg/cv023_13847.txt',
 u'neg/cv024_7033.txt',
 u'neg/cv025_29825.txt',
 u'neg/cv026_29229.txt',
 u'neg/cv027_26270.txt',
 u'neg/cv028_26964.txt',
 u'neg/cv029_19943.txt',
 u'neg/cv030_22893.txt',
 u'neg/cv031_19540.txt',
 u'neg/cv032_23718.txt',
 u'neg/cv033_25680.txt',
 u'neg/cv034_29446.txt',
 u'neg/cv035_3343.txt',
 u'neg/cv036_18385.txt',
 u'neg/cv037_19798.txt',
 u'neg/cv038_9781.txt',
 u'neg/cv039_5963.txt',
 u'neg/

In [7]:
# Build the labelled data that is fed into the classifier.
# movie_reviews.words(fileids=[fileid]) gives the words contained in that file.
# Each list entry is a (feature dict, label) tuple: the feature dict maps every
# word of one review to True, and the label is 'neg' or 'pos'.
negfeats = [
    (word_feats(movie_reviews.words(fileids=[fileid])), 'neg')
    for fileid in negids
]
posfeats = [
    (word_feats(movie_reviews.words(fileids=[fileid])), 'pos')
    for fileid in posids
]

In [None]:
# Select samples for training: the first 3/4 of each class is used for
# training and the remainder for testing.
# Floor division (//) keeps the cutoffs integers so they remain valid slice
# indices under true division as well (Python 3, or
# `from __future__ import division`); with plain `/` they would become floats.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

In [19]:
# Training set: the leading slice (up to the cutoff) of each class.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Test set: whatever is left of each class after the cutoff.
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

# Single-argument print(...) produces the same text on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

train on 1500 instances, test on 500 instances


In [26]:
# Train the Naive Bayes classifier on the labelled training features.
classifier = NaiveBayesClassifier.train(trainfeats)

# Check accuracy on the held-out test features.
# The message is formatted into one string so that print(...) emits
# "accuracy: <value>" on both Python 2 and Python 3 (a two-item print(...)
# would display a tuple on Python 2).
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))

# Display the most informative features, i.e. the words whose presence has
# the most lopsided pos : neg (or neg : pos) ratio.
classifier.show_most_informative_features()

accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


# Saving and loading the trained classifier

In [22]:
# Persist the trained classifier to disk.
# The with-block closes the file even if pickle.dump raises, which the
# explicit open()/close() pair did not guarantee.
with open('my_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

In [23]:
# Restore the previously saved classifier from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by a trusted source (here: the file written by this notebook).
# The with-block closes the file even if pickle.load raises.
with open('my_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

In [33]:
# Sample thread titles from EJMR; exactly one assignment is active at a time,
# the commented-out lines are alternative inputs to try.
# test_thread_title = "Economic sanctions are dumb"
# test_thread_title = "Does a taco count as a sandwich?"
test_thread_title = "White hair at a young age?"

# Tokenize the title, then convert the tokens into the same kind of
# {token: True} feature dict that the classifier was trained on.
title_tokens = nltk.word_tokenize(test_thread_title)
test_thread_title_feat = word_feats(title_tokens)

# Bare expression so the notebook displays the resulting feature dict.
test_thread_title_feat



{'?': True,
 'White': True,
 'a': True,
 'age': True,
 'at': True,
 'hair': True,
 'young': True}

In [34]:
# Predict the sentiment label for the sample title's feature dict; as a bare
# expression the returned label is displayed as the cell output.
classifier.classify(test_thread_title_feat)

'pos'