In [1]:
import nltk
# movie review sentences
from nltk.corpus import sentence_polarity
import random

In [2]:
## repeat the setup of the movie review sentences for classification
# for each sentence(document), get its words and category (positive/negative)
documents = [(sent, cat) for cat in sentence_polarity.categories() 
    for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)

In [3]:
# get all words from all movie_reviews and put into a frequency distribution
# note lowercase, but no stemming or stopwords
word_features = [word for (sent,cat) in documents for word in sent]

In [5]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [6]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

# training using naive Baysian classifier, training set is 90% of data
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

0.737

In [7]:
# evaluation measures showing performance of classifier
from nltk.metrics import *
reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label)
    testlist.append(classifier.classify(features))
    
# Confusion matrix gives true positives, false negatives, false positives, and true negatives
#   where we interpret female as "yes" and male as "no"
cm = ConfusionMatrix(reflist, testlist)
print(cm)

# define a set of item identifiers that are gold labels and a set of item identifiers that are predicted labels
# this uses index numbers for the labels
refpos = set([i for i,label in enumerate(reflist) if label == 'pos'])
refneg = set([i for i,label in enumerate(reflist) if label == 'neg'])
testpos = set([i for i,label in enumerate(testlist) if label == 'pos'])
testneg = set([i for i,label in enumerate(testlist) if label == 'neg'])

# compute precision, recall and F-measure for each label

def printmeasures(label, refset, testset):
    print(label, 'precision:', precision(refset, testset))
    print(label, 'recall:', recall(refset, testset)) 
    print(label, 'F-measure:', f_measure(refset, testset))

printmeasures('pos', refpos, testpos)
printmeasures('neg', refneg, testneg)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<372>123 |
pos | 140<365>|
----+---------+
(row = reference; col = test)

pos precision: 0.7479508196721312
pos recall: 0.7227722772277227
pos F-measure: 0.7351460221550856
neg precision: 0.7265625
neg recall: 0.7515151515151515
neg F-measure: 0.7388282025819265


In [8]:
# this list of negation words includes some "approximate negators" like hardly and rarely
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 
                 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [9]:
# One strategy with negation words is to negate the word following the negation word
# other strategies negate all words up to the next punctuation
# Strategy is to go through the document words in order adding the word features,
#   but if the word follows a negation words, change the feature to negated word
# Start the feature set with all 2000 word features and 2000 Not word features set to false
def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = False
        features['contains(NOT{})'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['contains(NOT{})'.format(document[i])] = (document[i] in word_features)
        else:
            features['contains({})'.format(word)] = (word in word_features)
    return features

In [10]:
# define the feature sets
NOT_featuresets = [(NOT_features(d, word_features, negationwords), c) for (d, c) in documents]
# show the values of a couple of example features
print(NOT_featuresets[0][0]['contains(NOTlike)'])
print(NOT_featuresets[0][0]['contains(always)'])

False
False


In [11]:
train_set, test_set = NOT_featuresets[1000:], NOT_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.777

In [12]:
# evaluation measures showing performance of classifier
from nltk.metrics import *
reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label)
    testlist.append(classifier.classify(features))
    
# Confusion matrix gives true positives, false negatives, false positives, and true negatives
#   where we interpret female as "yes" and male as "no"
cm = ConfusionMatrix(reflist, testlist)
print(cm)

# define a set of item identifiers that are gold labels and a set of item identifiers that are predicted labels
# this uses index numbers for the labels
refpos = set([i for i,label in enumerate(reflist) if label == 'pos'])
refneg = set([i for i,label in enumerate(reflist) if label == 'neg'])
testpos = set([i for i,label in enumerate(testlist) if label == 'pos'])
testneg = set([i for i,label in enumerate(testlist) if label == 'neg'])

# compute precision, recall and F-measure for each label

def printmeasures(label, refset, testset):
    print(label, 'precision:', precision(refset, testset))
    print(label, 'recall:', recall(refset, testset)) 
    print(label, 'F-measure:', f_measure(refset, testset))

printmeasures('pos', refpos, testpos)
printmeasures('neg', refneg, testneg)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<395>100 |
pos | 123<382>|
----+---------+
(row = reference; col = test)

pos precision: 0.7925311203319502
pos recall: 0.7564356435643564
pos F-measure: 0.7740628166160081
neg precision: 0.7625482625482626
neg recall: 0.797979797979798
neg F-measure: 0.7798617966436328


In [13]:
####   adding Bigram features   ####
# set up for using bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [14]:
# create the bigram finder on all the words in sequence
finder = BigramCollocationFinder.from_words(all_words_list)

In [15]:
# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.raw_freq, 500)
print(bigram_features[:50])

[('.', '.'), ('.', 'the'), ('.', 'a'), ('of', 'the'), (',', 'but'), (',', 'and'), ('in', 'the'), ('the', 'film'), ('is', 'a'), ('.', "it's"), (',', 'the'), ('of', 'a'), ('to', 'the'), ('and', 'the'), ('to', 'be'), ('the', 'movie'), ('.', 'this'), ('.', 'it'), ('for', 'the'), ('it', 'is'), ('with', 'a'), ('.', 'an'), ('as', 'a'), ('in', 'a'), ('on', 'the'), ('one', 'of'), ('and', 'a'), ('this', 'is'), ('a', 'movie'), ("it's", 'a'), (',', 'it'), ('.', 'if'), ('with', 'the'), ('film', 'is'), ('like', 'a'), (',', 'a'), ('for', 'a'), ('it', '.'), ('the', 'most'), ('film', '.'), ('.', 'but'), ('but', 'it'), ('of', 'its'), ('movie', '.'), (',', "it's"), (',', 'this'), ('a', 'film'), ('as', 'the'), ('from', 'the'), ('.', 'i')]


In [16]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['bigram({} {})'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [17]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

# number of features for document 0
print(len(bigram_featuresets[0][0].keys()))

# features in document 0
print(bigram_featuresets[0][0])

2000
{'contains(.)': True, 'contains(the)': False, 'contains(,)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': False, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(that)': False, 'contains(it)': True, 'contains(as)': False, 'contains(but)': True, 'contains(with)': False, 'contains(film)': False, 'contains(this)': False, 'contains(for)': False, 'contains(its)': False, 'contains(an)': False, 'contains(movie)': False, "contains(it's)": True, 'contains(be)': False, 'contains(on)': False, 'contains(you)': True, 'contains(not)': False, 'contains(by)': False, 'contains(about)': False, 'contains(one)': False, 'contains(more)': False, 'contains(like)': True, 'contains(has)': False, 'contains(are)': False, 'contains(at)': False, 'contains(from)': False, 'contains(than)': False, 'contains(")': False, 'contains(all)': False, 'contains(--)': False, 'contains(his)': True, 'contains(have)': False, 'contains(so)': False, 'contains(if)': False, 'con

In [18]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.738

In [19]:
# evaluation measures showing performance of classifier
from nltk.metrics import *
reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label)
    testlist.append(classifier.classify(features))
    
# Confusion matrix gives true positives, false negatives, false positives, and true negatives
#   where we interpret female as "yes" and male as "no"
cm = ConfusionMatrix(reflist, testlist)
print(cm)

# define a set of item identifiers that are gold labels and a set of item identifiers that are predicted labels
# this uses index numbers for the labels
refpos = set([i for i,label in enumerate(reflist) if label == 'pos'])
refneg = set([i for i,label in enumerate(reflist) if label == 'neg'])
testpos = set([i for i,label in enumerate(testlist) if label == 'pos'])
testneg = set([i for i,label in enumerate(testlist) if label == 'neg'])

# compute precision, recall and F-measure for each label

def printmeasures(label, refset, testset):
    print(label, 'precision:', precision(refset, testset))
    print(label, 'recall:', recall(refset, testset)) 
    print(label, 'F-measure:', f_measure(refset, testset))

printmeasures('pos', refpos, testpos)
printmeasures('neg', refneg, testneg)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<371>124 |
pos | 138<367>|
----+---------+
(row = reference; col = test)

pos precision: 0.7474541751527495
pos recall: 0.7267326732673267
pos F-measure: 0.7369477911646586
neg precision: 0.7288801571709234
neg recall: 0.7494949494949495
neg F-measure: 0.7390438247011953


In [20]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [21]:
# define feature sets using this function
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]
# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

1504


In [22]:
# the first sentence
print(documents[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])

(["it's", 'coherent', ',', 'well', 'shot', ',', 'and', 'tartly', 'acted', ',', 'but', 'it', 'wears', 'you', 'down', 'like', 'a', 'dinner', 'guest', 'showing', 'off', 'his', 'doctorate', '.'], 'neg')
num nouns 4
num verbs 3
num adjectives 2
num adverbs 4


In [23]:
# train and test the classifier
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.725

In [24]:
# evaluation measures showing performance of classifier
from nltk.metrics import *
reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label)
    testlist.append(classifier.classify(features))
    
# Confusion matrix gives true positives, false negatives, false positives, and true negatives
#   where we interpret female as "yes" and male as "no"
cm = ConfusionMatrix(reflist, testlist)
print(cm)

# define a set of item identifiers that are gold labels and a set of item identifiers that are predicted labels
# this uses index numbers for the labels
refpos = set([i for i,label in enumerate(reflist) if label == 'pos'])
refneg = set([i for i,label in enumerate(reflist) if label == 'neg'])
testpos = set([i for i,label in enumerate(testlist) if label == 'pos'])
testneg = set([i for i,label in enumerate(testlist) if label == 'neg'])

# compute precision, recall and F-measure for each label

def printmeasures(label, refset, testset):
    print(label, 'precision:', precision(refset, testset))
    print(label, 'recall:', recall(refset, testset)) 
    print(label, 'F-measure:', f_measure(refset, testset))

printmeasures('pos', refpos, testpos)
printmeasures('neg', refneg, testneg)


    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<365>130 |
pos | 145<360>|
----+---------+
(row = reference; col = test)

pos precision: 0.7346938775510204
pos recall: 0.7128712871287128
pos F-measure: 0.7236180904522613
neg precision: 0.7156862745098039
neg recall: 0.7373737373737373
neg F-measure: 0.7263681592039801


In [25]:
def Combined_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['bigram({} {})'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

# use this function to create feature sets for all sentences
comb_featuresets = [(Combined_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

# features in document 0
print(comb_featuresets[0][0])

# the first sentence
print(documents[0])
# the pos tag features for this sentence
print('num nouns', comb_featuresets[0][0]['nouns'])
print('num verbs', comb_featuresets[0][0]['verbs'])
print('num adjectives', comb_featuresets[0][0]['adjectives'])
print('num adverbs', comb_featuresets[0][0]['adverbs'])

{'contains(.)': True, 'contains(the)': False, 'contains(,)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': False, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(that)': False, 'contains(it)': True, 'contains(as)': False, 'contains(but)': True, 'contains(with)': False, 'contains(film)': False, 'contains(this)': False, 'contains(for)': False, 'contains(its)': False, 'contains(an)': False, 'contains(movie)': False, "contains(it's)": True, 'contains(be)': False, 'contains(on)': False, 'contains(you)': True, 'contains(not)': False, 'contains(by)': False, 'contains(about)': False, 'contains(one)': False, 'contains(more)': False, 'contains(like)': True, 'contains(has)': False, 'contains(are)': False, 'contains(at)': False, 'contains(from)': False, 'contains(than)': False, 'contains(")': False, 'contains(all)': False, 'contains(--)': False, 'contains(his)': True, 'contains(have)': False, 'contains(so)': False, 'contains(if)': False, 'contains

In [26]:
# train and test the classifier
train_set, test_set = comb_featuresets[1000:], comb_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.726

In [27]:
# evaluation measures showing performance of classifier
from nltk.metrics import *
reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label)
    testlist.append(classifier.classify(features))
    
# Confusion matrix gives true positives, false negatives, false positives, and true negatives
#   where we interpret pos as "yes" and neg as "no"
cm = ConfusionMatrix(reflist, testlist)
print(cm)

# define a set of item identifiers that are gold labels and a set of item identifiers that are predicted labels
# this uses index numbers for the labels
refpos = set([i for i,label in enumerate(reflist) if label == 'pos'])
refneg = set([i for i,label in enumerate(reflist) if label == 'neg'])
testpos = set([i for i,label in enumerate(testlist) if label == 'pos'])
testneg = set([i for i,label in enumerate(testlist) if label == 'neg'])

# compute precision, recall and F-measure for each label

def printmeasures(label, refset, testset):
    print(label, 'precision:', precision(refset, testset))
    print(label, 'recall:', recall(refset, testset)) 
    print(label, 'F-measure:', f_measure(refset, testset))

printmeasures('pos', refpos, testpos)
printmeasures('neg', refneg, testneg)


    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<366>129 |
pos | 145<360>|
----+---------+
(row = reference; col = test)

pos precision: 0.7361963190184049
pos recall: 0.7128712871287128
pos F-measure: 0.7243460764587525
neg precision: 0.7162426614481409
neg recall: 0.7393939393939394
neg F-measure: 0.7276341948310139
