In [1]:
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

It has been shown that the inclusion of word bigram features leads to imporved sentiment analysis for various classifiers (Wang and Manning, 2012).
In this example, we will see whether the use of bigrams as features improve the accuracy of our sentiment classification. 

References:
Wang and Manning. Baselines and Bigrams: Simple, Good Sentiment and Topic Classification. In ACL, 2012. 

In [2]:
#import these tools
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

Here, we will need to create a function called "bigrams_as_features" which returns the 200 most frequent bigrams. 

One way to do so is to find significant bigrams by using nltk.collocations.BigramCollocationFinder along with nltk.metrics.BigramAssocMeasures. 

From NLTK: Finding collocations requires first calculating the frequencies of words and
their appearance in the context of other words. Often the collection of words
will then requiring filtering to only retain useful content terms. Each ngram
of words may then be scored according to some association measure, in order
to determine the relative likelihood of each ngram being a collocation.

The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
these functionalities, dependent on being provided a function which scores a
ngram given appropriate frequency counts. A number of standard association
measures are provided in bigram_measures and trigram_measures.

For more details and a demo see here: http://www.nltk.org/_modules/nltk/collocations.html 

In [3]:
def bigrams_as_features(words, score_bg=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_bg, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

In [4]:
#Note, this is just the same code as previously. The only thing that changes is the feature extraction method, 
#which is user specified.
def evaluate_classifier(feature_extraction):
    
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negreviews = [(feature_extraction(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posreviews = [(feature_extraction(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    negsplit = int(len(negreviews)*0.75)
    possplit = int(len(posreviews)*0.75)

    trainingset = negreviews[:negsplit] + posreviews[:possplit]
    testset = negreviews[negsplit:] + posreviews[possplit:]
    
    classifier = NaiveBayesClassifier.train(trainingset)
 
    print('accuracy:', nltk.classify.util.accuracy(classifier, testset))
    classifier.show_most_informative_features()

After defining the function lets evaluate the classfier using bigrams as features. Note that this can take up to several minutes to run.

In [5]:
evaluate_classifier(bigrams_as_features)

accuracy: 0.816
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
       ('matt', 'damon') = True              pos : neg    =     12.3 : 1.0
          ('give', 'us') = True              neg : pos    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
    ('absolutely', 'no') = True              neg : pos    =     10.6 : 1.0


**Question**

Is the accuracy improved? What do you think it would happen if we had used all bigrams rather than the top-200? 

In [8]:
evaluate_classifier(lambda x: bigrams_as_features(x, n = 2000))

accuracy: 0.74
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
        ('not', 'funny') = True              neg : pos    =     13.7 : 1.0
       ('matt', 'damon') = True              pos : neg    =     13.7 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
       ('and', 'boring') = True              neg : pos    =     13.0 : 1.0
    ('a', 'wonderfully') = True              pos : neg    =     13.0 : 1.0
      ('is', 'terrific') = True              pos : neg    =     13.0 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
      ('perfect', 'for') = True              pos : neg    =     12.3 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0


**Question**

Evaluate the classifier which uses trigrams as features.

In [7]:
def trigrams_as_features(words, score_bg=BigramAssocMeasures.chi_sq, n=200):
    trigram_finder = nltk.TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_bg, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, trigrams)])

In [9]:
evaluate_classifier(trigrams_as_features)

NameError: name 'TrigramCollocationFinder' is not defined