In [1]:
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import pandas as pd
# Fetch the NLTK resources used below: the stopword list and the labelled
# movie review corpus.
for corpus_name in ('stopwords', 'movie_reviews'):
    nltk.download(corpus_name)

# Load the tab-separated sentiment file. It has no header row, so the two
# column names are supplied explicitly.
df = pd.read_csv(
    'sentiment.tsv',
    delimiter='\t',
    header=None,
    names=["sentiment", "text"],
)
 
def evaluate_classifier(featx, show_precision_recall=False):
    """Train and evaluate a Naive Bayes classifier on the movie_reviews corpus.

    Each review in NLTK's ``movie_reviews`` corpus is turned into a feature
    set with ``featx`` and labelled 'neg' or 'pos'. Per label, the first
    three quarters of the reviews are used for training and the remainder
    for testing. The accuracy on the test split and the classifier's most
    informative features are printed.

    Parameters
    ----------
    featx : callable
        Receives the words of a single review and returns the feature set
        for it (the extractors in this notebook return ``{feature: True}``
        dicts).
    show_precision_recall : bool, optional
        When true, additionally print precision and recall for the 'pos'
        and 'neg' labels. Defaults to False, which prints only the accuracy
        and the most informative features.

    Returns
    -------
    None
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division keeps the cutoffs integers, so they remain valid slice
    # bounds whether `/` means integer division (Python 2) or true division.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # A single pre-formatted string prints identically with the Python 2
    # print statement and the Python 3 print function.
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))

    if show_precision_recall:
        # refsets: test indices grouped by true label;
        # testsets: test indices grouped by predicted label.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            testsets[classifier.classify(feats)].add(i)

        for label in ('pos', 'neg'):
            print('%s precision: %s' % (
                label, nltk.metrics.precision(refsets[label], testsets[label])))
            print('%s recall: %s' % (
                label, nltk.metrics.recall(refsets[label], testsets[label])))

    classifier.show_most_informative_features()

[nltk_data] Downloading package stopwords to /opt/pynb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /opt/pynb/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [2]:

def word_feats(words):
    """Build bag-of-words features: every word in ``words`` maps to True."""
    features = {}
    for token in words:
        features[token] = True
    return features
 
# evaluate_classifier(word_feats)

In [3]:
from nltk.corpus import stopwords
# English stopword list from NLTK, held in a set for membership tests.
stopset = set(stopwords.words('english'))


def stopword_filtered_word_feats(words):
    """Build bag-of-words features, skipping words found in ``stopset``.

    Every word of ``words`` that is not in the module-level ``stopset``
    maps to True in the returned dict.
    """
    features = {}
    for token in words:
        if token in stopset:
            continue
        features[token] = True
    return features
 
# evaluate_classifier(stopword_filtered_word_feats)

In [4]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Build features from all words plus the top-scoring bigrams.

    Parameters
    ----------
    words : iterable
        The words of one document. It may be a one-shot iterator; it is
        materialized internally.
    score_fn : callable, optional
        Bigram association measure passed to the collocation finder's
        ``nbest``. Defaults to ``BigramAssocMeasures.chi_sq``.
    n : int, optional
        Number of best-scoring bigrams to keep. Defaults to 200.

    Returns
    -------
    dict
        Maps every word and every selected bigram (a tuple) to True.
    """
    # `words` is traversed twice below (by the collocation finder and by
    # chain); materialize it so a generator is not exhausted after the
    # first pass, which would silently drop all single-word features.
    words = list(words)
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return {ngram: True for ngram in itertools.chain(words, bigrams)}
 
# evaluate_classifier(bigram_word_feats)


In [5]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
# NOTE: this redefines bigram_word_feats from the earlier cell; the definition
# here is the one bound to `featx` below.
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Build features from all words plus the top-scoring bigrams.

    Parameters
    ----------
    words : iterable
        The words of one document. It may be a one-shot iterator; it is
        materialized internally.
    score_fn : callable, optional
        Bigram association measure passed to the collocation finder's
        ``nbest``. Defaults to ``BigramAssocMeasures.chi_sq``.
    n : int, optional
        Number of best-scoring bigrams to keep. Defaults to 200.

    Returns
    -------
    dict
        Maps every word and every selected bigram (a tuple) to True.
    """
    # `words` is traversed twice below (by the collocation finder and by
    # chain); materialize it so a generator is not exhausted after the
    # first pass, which would silently drop all single-word features.
    words = list(words)
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return {ngram: True for ngram in itertools.chain(words, bigrams)}

# Evaluate the word + bigram feature extractor at notebook scope, so the
# intermediate objects (classifier, feature lists, ...) stay available to
# later cells.
featx = bigram_word_feats
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One (features, label) pair per review.
negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Per label, the first 3/4 of the reviews train and the rest test. Floor
# division keeps the cutoffs integers, so they remain valid slice bounds
# whether `/` means integer division (Python 2) or true division.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

classifier = NaiveBayesClassifier.train(trainfeats)

# refsets groups test indices by true label, testsets by predicted label.
# They are the inputs for per-label metrics, e.g.
# nltk.metrics.precision(refsets['pos'], testsets['pos']) or
# nltk.metrics.recall(refsets['neg'], testsets['neg']).
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

# NOTE: the loop variables (`feats`, `label`, `observed`) remain in the
# notebook namespace afterwards; a later cell displays `feats`.
for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

# A single pre-formatted string prints identically with the Python 2 print
# statement and the Python 3 print function.
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()
    

accuracy: 0.816
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
     (u'matt', u'damon') = True              pos : neg    =     12.3 : 1.0
        (u'give', u'us') = True              neg : pos    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
  (u'absolutely', u'no') = True              neg : pos    =     10.6 : 1.0


In [7]:
# `feats` is the loop variable left behind by the evaluation loop in the
# earlier cell, i.e. the feature dict of the last test review it visited.
# NOTE(review): this only works after that cell has run, and it displays the
# entire dict; consider inspecting a small sample instead.
feats

{(u"'", u's'): True,
 (u'-', u'chosen'): True,
 (u'-', u'seeing'): True,
 (u'/', u'director'): True,
 (u';', u'neither'): True,
 (u'address', u'them'): True,
 (u'airplane', u'started'): True,
 (u'almost', u'felt'): True,
 (u'also', u'recalls'): True,
 (u'although', u'these'): True,
 (u'amazing', u'coincidence'): True,
 (u'andrew', u'niccol'): True,
 (u'announcer', u'warns'): True,
 (u'answer', u'them'): True,
 (u'asked', u'them'): True,
 (u'at', u'every'): True,
 (u'became', u'known'): True,
 (u'becomes', u'determined'): True,
 (u'been', u'fake'): True,
 (u'best', u'not'): True,
 (u'big', u'hoax'): True,
 (u'both', u'films'): True,
 (u'built', u'by'): True,
 (u'by', u'andrew'): True,
 (u'by', u'government'): True,
 (u'cage', u'-'): True,
 (u'cameo', u'as'): True,
 (u'carefully', u'scripted'): True,
 (u'certainly', u'didn'): True,
 (u'chaos', u'theory'): True,
 (u'coffee', u'breaks'): True,
 (u'comes', u'back'): True,
 (u'comes', u'under'): True,
 (u'coming', u'apart'): True,
 (u'commun