In [None]:
import nltk 
nltk.download('names')
nltk.download('brown')
nltk.download('movie_reviews')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
# define a feature extraction function for each name
def gender_features(word):
    """Return the feature dict for a name: its final character.

    Args:
        word: the name to extract features from (a string).

    Returns:
        A dict with the single key 'last_letter'. For an empty string the
        value is '' rather than an IndexError being raised.
    """
    # Slicing (rather than indexing) yields '' for an empty word and the same
    # one-character string as word[-1] for every non-empty word.
    return {'last_letter': word[-1:]}

In [None]:
print(gender_features('Shrek'))

{'last_letter': 'k'}


In [None]:
# resource for male and female first names
from nltk.corpus import names
print(names.words('male.txt')[:20])
print(names.words('female.txt')[:20])


['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner', 'Abraham', 'Abram', 'Ace', 'Adair', 'Adam']
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline', 'Adara', 'Addie', 'Addis', 'Adel', 'Adela']


In [None]:
# make list of male and female names paired with gender
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')])
print(len(namesgender))
print(namesgender[:20])
print(namesgender[7924:])

7944
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male'), ('Abdullah', 'male'), ('Abe', 'male'), ('Abel', 'male'), ('Abelard', 'male'), ('Abner', 'male'), ('Abraham', 'male'), ('Abram', 'male'), ('Ace', 'male'), ('Adair', 'male'), ('Adam', 'male')]
[('Zena', 'female'), ('Zenia', 'female'), ('Zia', 'female'), ('Zilvia', 'female'), ('Zita', 'female'), ('Zitella', 'female'), ('Zoe', 'female'), ('Zola', 'female'), ('Zonda', 'female'), ('Zondra', 'female'), ('Zonnya', 'female'), ('Zora', 'female'), ('Zorah', 'female'), ('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]


In [None]:
# put the list into random order
import random
random.shuffle(namesgender)
print(namesgender[:20])

[('Grant', 'male'), ('Donielle', 'female'), ('Babette', 'female'), ('Inez', 'female'), ('Max', 'male'), ('Mason', 'male'), ('Ajai', 'male'), ('Elane', 'female'), ('Edi', 'female'), ('Berni', 'female'), ('Nicole', 'female'), ('Terese', 'female'), ('Perceval', 'male'), ('Marian', 'female'), ('Torry', 'male'), ('Charmain', 'female'), ('Kingsley', 'male'), ('Carolan', 'female'), ('Quentin', 'male'), ('Christina', 'female')]


In [None]:
# separate the names into training and test
train_names = namesgender[500:]
test_names = namesgender[:500]


In [None]:
# use our features to train a classifier and test on the development test set
train_set = [(gender_features(n), g) for (n, g) in train_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]
print(train_set[:20])


[({'last_letter': 'd'}, 'male'), ({'last_letter': 's'}, 'male'), ({'last_letter': 'd'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'i'}, 'female'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 's'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'h'}, 'male'), ({'last_letter': 's'}, 'male'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'h'}, 'male'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female')]


In [None]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# classify new instances
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

male
female


In [None]:
# classify accuracy function runs the classifier on the test set and reports
#   comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

0.758


In [None]:
# this function available for naive bayes classifiers
print(classifier.show_most_informative_features(20))


Most Informative Features
             last_letter = 'a'            female : male   =     33.1 : 1.0
             last_letter = 'k'              male : female =     32.3 : 1.0
             last_letter = 'f'              male : female =     16.7 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0
             last_letter = 'd'              male : female =      9.5 : 1.0
             last_letter = 'm'              male : female =      8.8 : 1.0
             last_letter = 'o'              male : female =      8.2 : 1.0
             last_letter = 'r'              male : female =      8.0 : 1.0
             last_letter = 'w'              male : female =      5.1 : 1.0
             last_letter = 'g'              male : female =      4.9 : 1.0
             last_letter = 'z'              male : female =      4.3 : 1.0
             last_letter = 's'              male : female =      4.2 : 1.0

In [None]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    """Collect the examples that the current classifier labels incorrectly.

    Reads the notebook-level `classifier` and extracts features with
    `gender_features`.

    Args:
        test: iterable of (name, gold_label) pairs.

    Returns:
        A list of (gold_label, predicted_label, name) tuples, one for each
        pair whose prediction differs from the gold label, in input order.
    """
    # Pair every example with its prediction first, then keep the mismatches.
    predicted = (
        (name, tag, classifier.classify(gender_features(name)))
        for (name, tag) in test
    )
    return [(tag, guess, name) for (name, tag, guess) in predicted if guess != tag]

errors = geterrors(test_names)
print(len(errors))

109


In [None]:
# define a function to print the errors
def printerrors(errors):
    """Print one aligned line per misclassified example, in sorted order.

    Args:
        errors: iterable of (gold_label, predicted_label, name) string tuples.
    """
    template = 'correct={:<8s} guess={:<8s} name={:<30s}'
    for record in sorted(errors):
        tag, guess, name = record
        print(template.format(tag, guess, name))

printerrors(errors)


correct=female   guess=male     name=Annabell                      
correct=female   guess=male     name=Arden                         
correct=female   guess=male     name=Arlen                         
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Beitris                       
correct=female   guess=male     name=Brandais                      
correct=female   guess=male     name=Calypso                       
correct=female   guess=male     name=Cameo                         
correct=female   guess=male     name=Carilyn                       
correct=female   guess=male     name=Carolan                       
correct=female   guess=male     name=Caryl                         
correct=female   guess=male     name=Charil                        
correct=female   guess=male     name=Charleen                      
correct=female   guess=male     name=Charmain                      
correct=female   guess=male     name=Christal   

## Exercise 1: Define a new feature extraction function that includes features for two-letter suffixes

In [None]:
def gender_features3(word):
    """Return suffix features for a name: last and second-to-last letters.

    Note that 'suffix2' is the single penultimate character, not the
    two-character suffix.

    Args:
        word: the name to extract features from (a string).

    Returns:
        A dict with keys 'suffix1' (last character) and 'suffix2'
        (second-to-last character). A position that does not exist in a
        short word is reported as '' instead of raising IndexError.
    """
    # Slices give the same single characters as word[-1] / word[-2] when they
    # exist and degrade to '' for names shorter than one / two characters.
    return {'suffix1': word[-1:], 'suffix2': word[-2:-1]}

In [None]:
print(gender_features3('Shrek'))

{'suffix1': 'k', 'suffix2': 'e'}


In [None]:
# use our features to train a classifier and test on the development test set
train_set = [(gender_features3(n), g) for (n, g) in train_names]
test_set = [(gender_features3(n), g) for (n, g) in test_names]
print(train_set[:20])


[({'suffix1': 'e', 'suffix2': 'i'}, 'female'), ({'suffix1': 'a', 'suffix2': 'r'}, 'female'), ({'suffix1': 'a', 'suffix2': 'n'}, 'female'), ({'suffix1': 'e', 'suffix2': 'b'}, 'male'), ({'suffix1': 'g', 'suffix2': 'e'}, 'male'), ({'suffix1': 'n', 'suffix2': 'i'}, 'female'), ({'suffix1': 'a', 'suffix2': 'd'}, 'female'), ({'suffix1': 'n', 'suffix2': 'a'}, 'male'), ({'suffix1': 's', 'suffix2': 'd'}, 'male'), ({'suffix1': 's', 'suffix2': 'a'}, 'male'), ({'suffix1': 'a', 'suffix2': 'h'}, 'female'), ({'suffix1': 'i', 'suffix2': 'n'}, 'female'), ({'suffix1': 'a', 'suffix2': 'n'}, 'female'), ({'suffix1': 'e', 'suffix2': 's'}, 'female'), ({'suffix1': 'e', 'suffix2': 't'}, 'female'), ({'suffix1': 'a', 'suffix2': 'e'}, 'female'), ({'suffix1': 's', 'suffix2': 'i'}, 'male'), ({'suffix1': 'e', 'suffix2': 'l'}, 'male'), ({'suffix1': 't', 'suffix2': 'r'}, 'male'), ({'suffix1': 'e', 'suffix2': 'k'}, 'female')]


In [None]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# classify accuracy function runs the classifier on the test set and reports
#   comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

0.766


In [None]:
# this function available for naive bayes classifiers
print(classifier.show_most_informative_features(20))


Most Informative Features
                 suffix1 = 'a'            female : male   =     34.3 : 1.0
                 suffix1 = 'k'              male : female =     29.5 : 1.0
                 suffix1 = 'f'              male : female =     16.1 : 1.0
                 suffix1 = 'p'              male : female =     12.0 : 1.0
                 suffix1 = 'v'              male : female =      9.9 : 1.0
                 suffix1 = 'd'              male : female =      9.7 : 1.0
                 suffix1 = 'o'              male : female =      8.9 : 1.0
                 suffix1 = 'm'              male : female =      8.4 : 1.0
                 suffix2 = 'o'              male : female =      7.1 : 1.0
                 suffix1 = 'r'              male : female =      7.1 : 1.0
                 suffix2 = 'u'              male : female =      6.9 : 1.0
                 suffix1 = 'z'              male : female =      5.6 : 1.0
                 suffix1 = 'w'              male : female =      5.1 : 1.0

In [None]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    """Collect the examples that the current classifier labels incorrectly.

    Reads the notebook-level `classifier` and extracts features with
    `gender_features3`.

    Args:
        test: iterable of (name, gold_label) pairs.

    Returns:
        A list of (gold_label, predicted_label, name) tuples for every pair
        whose prediction differs from the gold label, in input order.
    """
    def predict(name):
        # Classify a single name with the suffix feature extractor.
        return classifier.classify(gender_features3(name))

    labelled = [(tag, predict(name), name) for (name, tag) in test]
    # triple = (gold, guess, name); keep only the disagreements.
    return [triple for triple in labelled if triple[1] != triple[0]]

errors = geterrors(test_names)
print(len(errors))

117


In [None]:
def gender_features2(word):
    """Return prefix and suffix letter features for a name.

    Args:
        word: the name to extract features from (a string).

    Returns:
        A dict with keys 'prefix1' (first character), 'prefix2' (second
        character), 'suffix1' (last character) and 'suffix2' (second-to-last
        character). A position that does not exist in a short word is
        reported as '' instead of raising IndexError.
    """
    # Each slice selects exactly the character the original index selected,
    # but yields '' when the word is too short to have that position.
    return {'prefix1': word[:1],
            'prefix2': word[1:2],
            'suffix1': word[-1:],
            'suffix2': word[-2:-1]}

In [None]:
print(gender_features2('Shrek'))

{'prefix1': 'S', 'prefix2': 'h', 'suffix1': 'k', 'suffix2': 'e'}


In [None]:
# use our features to train a classifier and test on the development test set
train_set = [(gender_features2(n), g) for (n, g) in train_names]
test_set = [(gender_features2(n), g) for (n, g) in test_names]
print(train_set[:20])

[({'prefix1': 'D', 'prefix2': 'a', 'suffix1': 'e', 'suffix2': 'i'}, 'female'), ({'prefix1': 'A', 'prefix2': 's', 'suffix1': 'a', 'suffix2': 'r'}, 'female'), ({'prefix1': 'M', 'prefix2': 'o', 'suffix1': 'a', 'suffix2': 'n'}, 'female'), ({'prefix1': 'A', 'prefix2': 'b', 'suffix1': 'e', 'suffix2': 'b'}, 'male'), ({'prefix1': 'O', 'prefix2': 'l', 'suffix1': 'g', 'suffix2': 'e'}, 'male'), ({'prefix1': 'M', 'prefix2': 'u', 'suffix1': 'n', 'suffix2': 'i'}, 'female'), ({'prefix1': 'H', 'prefix2': 'a', 'suffix1': 'a', 'suffix2': 'd'}, 'female'), ({'prefix1': 'S', 'prefix2': 'h', 'suffix1': 'n', 'suffix2': 'a'}, 'male'), ({'prefix1': 'R', 'prefix2': 'e', 'suffix1': 's', 'suffix2': 'd'}, 'male'), ({'prefix1': 'B', 'prefix2': 'a', 'suffix1': 's', 'suffix2': 'a'}, 'male'), ({'prefix1': 'L', 'prefix2': 'e', 'suffix1': 'a', 'suffix2': 'h'}, 'female'), ({'prefix1': 'W', 'prefix2': 'i', 'suffix1': 'i', 'suffix2': 'n'}, 'female'), ({'prefix1': 'T', 'prefix2': 'r', 'suffix1': 'a', 'suffix2': 'n'}, 'femal

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# classify accuracy function runs the classifier on the test set and reports
#   comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

0.782


In [None]:
# this function available for naive bayes classifiers
print(classifier.show_most_informative_features(20))

Most Informative Features
                 suffix1 = 'a'            female : male   =     34.3 : 1.0
                 suffix1 = 'k'              male : female =     29.5 : 1.0
                 suffix1 = 'f'              male : female =     16.1 : 1.0
                 suffix1 = 'p'              male : female =     12.0 : 1.0
                 suffix1 = 'v'              male : female =      9.9 : 1.0
                 suffix1 = 'd'              male : female =      9.7 : 1.0
                 prefix2 = 'k'              male : female =      9.2 : 1.0
                 suffix1 = 'o'              male : female =      8.9 : 1.0
                 suffix1 = 'm'              male : female =      8.4 : 1.0
                 prefix2 = 'z'              male : female =      7.2 : 1.0
                 suffix2 = 'o'              male : female =      7.1 : 1.0
                 suffix1 = 'r'              male : female =      7.1 : 1.0
                 suffix2 = 'u'              male : female =      6.9 : 1.0

In [None]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    """Collect the examples that the current classifier labels incorrectly.

    Reads the notebook-level `classifier`, which at this point was trained on
    `gender_features2` feature sets, so the same extractor is used here.

    Args:
        test: iterable of (name, gold_label) pairs.

    Returns:
        A list of (gold_label, predicted_label, name) tuples for every pair
        whose prediction differs from the gold label, in input order.
    """
    errors = []
    for (name, tag) in test:
        # Use the extractor the classifier was trained with; feeding it
        # gender_features3 output drops the prefix features and makes the
        # error count disagree with the reported accuracy.
        guess = classifier.classify(gender_features2(name))
        if guess != tag:
            errors.append( (tag, guess, name) )
    return errors

errors = geterrors(test_names)
print(len(errors))

117


In [None]:
def gender_features4(word):
    """Return first-letter, last-letter and two-letter-suffix features.

    Args:
        word: the name to extract features from (a string).

    Returns:
        For words of two or more characters, a dict with keys 'prefix1',
        'last_letter' and 'last_two_letters'. For shorter words, a dict with
        only 'last_two_letters' (whatever characters exist, possibly '').
        A dict is always returned, never None.
    """
    if len(word) >= 2:
        return {'prefix1': word[0],
                'last_letter': word[-1],
                'last_two_letters': word[-2:]}
    # Short-word fallback. The dict must sit on the same line as `return`;
    # a bare `return` followed by the dict on the next line returns None.
    return {'last_two_letters': word[-2:]}

In [None]:
# use our features to train a classifier and test on the development test set
train_set = [(gender_features4(n), g) for (n, g) in train_names]
test_set = [(gender_features4(n), g) for (n, g) in test_names]
print(train_set[:20])

[({'prefix1': 'D', 'last_letter': 'e', 'last_two_letters': 'ie'}, 'female'), ({'prefix1': 'A', 'last_letter': 'a', 'last_two_letters': 'ra'}, 'female'), ({'prefix1': 'M', 'last_letter': 'a', 'last_two_letters': 'na'}, 'female'), ({'prefix1': 'A', 'last_letter': 'e', 'last_two_letters': 'be'}, 'male'), ({'prefix1': 'O', 'last_letter': 'g', 'last_two_letters': 'eg'}, 'male'), ({'prefix1': 'M', 'last_letter': 'n', 'last_two_letters': 'in'}, 'female'), ({'prefix1': 'H', 'last_letter': 'a', 'last_two_letters': 'da'}, 'female'), ({'prefix1': 'S', 'last_letter': 'n', 'last_two_letters': 'an'}, 'male'), ({'prefix1': 'R', 'last_letter': 's', 'last_two_letters': 'ds'}, 'male'), ({'prefix1': 'B', 'last_letter': 's', 'last_two_letters': 'as'}, 'male'), ({'prefix1': 'L', 'last_letter': 'a', 'last_two_letters': 'ha'}, 'female'), ({'prefix1': 'W', 'last_letter': 'i', 'last_two_letters': 'ni'}, 'female'), ({'prefix1': 'T', 'last_letter': 'a', 'last_two_letters': 'na'}, 'female'), ({'prefix1': 'E', 'la

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# classify accuracy function runs the classifier on the test set and reports
#   comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

0.818


In [None]:
# define a function to print the errors
def printerrors(errors):
    """Print the misclassified examples as aligned columns, sorted.

    Args:
        errors: iterable of (gold_label, predicted_label, name) string tuples.
    """
    row = 'correct={:<8s} guess={:<8s} name={:<30s}'
    ordered = list(errors)
    ordered.sort()
    for tag, guess, name in ordered:
        print(row.format(tag, guess, name))

printerrors(errors)

correct=female   guess=male     name=Arden                         
correct=female   guess=male     name=Arlen                         
correct=female   guess=male     name=Aurel                         
correct=female   guess=male     name=Beitris                       
correct=female   guess=male     name=Brandais                      
correct=female   guess=male     name=Briney                        
correct=female   guess=male     name=Britaney                      
correct=female   guess=male     name=Calypso                       
correct=female   guess=male     name=Cameo                         
correct=female   guess=male     name=Carolan                       
correct=female   guess=male     name=Charleen                      
correct=female   guess=male     name=Charmain                      
correct=female   guess=male     name=Christal                      
correct=female   guess=male     name=Chrysler                      
correct=female   guess=male     name=Coral      

## Lab Part 2

In [None]:
## classify part of speech based on sentence context
from nltk.corpus import brown

# define features for the "i"th word in the sentence, including three types of suffix 
#     and one pre-word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1
def pos_features(sentence, i):
    """Build the feature dict for the word at position `i` of a sentence.

    Args:
        sentence: sequence of untagged word strings.
        i: index of the word to describe.

    Returns:
        A dict with the word's last one, two and three characters under
        'suffix(1)', 'suffix(2)', 'suffix(3)', plus 'prev-word': the word at
        i-1, or the marker "<START>" when i is 0.
    """
    token = sentence[i]
    previous = "<START>" if i == 0 else sentence[i-1]
    return {
        "suffix(1)": token[-1:],
        "suffix(2)": token[-2:],
        "suffix(3)": token[-3:],
        "prev-word": previous,
    }

In [None]:
# look at features of a specific word in a specific sentence
# first sentence of brown corpus
sentence0 = brown.sents()[0]
print(sentence0)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [None]:
# word 8 of sentence 0
print(sentence0[8])


investigation


In [None]:
# pos features of the word 8 
print(pos_features(sentence0, 8))

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}


In [None]:
# get the POS tagged sentences with categories of news
tagged_sents = brown.tagged_sents(categories='news')
tag_sent0 = tagged_sents[0]
tag_sent0


[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [None]:
# the function nltk.tag.untag will take the tags off
nltk.tag.untag(tag_sent0)


['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [None]:
# the python enumerate function generates an index number for each item in a list
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)


0 The AT
1 Fulton NP-TL
2 County NN-TL
3 Grand JJ-TL
4 Jury NN-TL
5 said VBD
6 Friday NR
7 an AT
8 investigation NN
9 of IN
10 Atlanta's NP$
11 recent JJ
12 primary NN
13 election NN
14 produced VBD
15 `` ``
16 no AT
17 evidence NN
18 '' ''
19 that CS
20 any DTI
21 irregularities NNS
22 took VBD
23 place NN
24 . .


In [None]:
# get feature sets of words appearing in the corpus, from untagged sentences.
# and then get their tags from corresponding tagged sentence
# use the Python function enumerate to pair the index numbers with sentence words 
#   for the pos features function
# One (features, tag) pair per token. Features are computed from the untagged
# word list, so pos_features only ever sees the words, never the tags.
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(untagged_sent, i), tag)
        for i, (word, tag) in enumerate(tagged_sent)
    )

In [None]:
# look at the feature sets of the first 10 words
for f in featuresets[:10]:
	print (f)
	

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [None]:
# using naive Bayesian as classifier
# split data into a training set and a test set, using a 90%/10% split
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
print(len(train_set))
print(len(test_set))

90499
10055


In [None]:
# train classifier on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)


In [None]:
# evaluate the accuracy (this will take a little while)
print(nltk.classify.accuracy(classifier, test_set))
# the result is reasonable for features without the previous tag


0.7891596220785678


## Part 3

In [None]:
### classify documents based on keywords
from nltk.corpus import movie_reviews
import random

In [None]:
# movie reviews are labeled either positive or negative (by human annotators)
print(movie_reviews.categories())


['neg', 'pos']


In [None]:
# for each document in movie_reviews, get its words and category (positive/negative)
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
print(len(documents))

random.shuffle(documents)

2000


In [None]:
# look at the first document - consists of a list of all the words in the review
# followed by the category
print(documents[0])


(['assume', 'nothing', '.', 'the', 'phrase', 'is', 'perhaps', 'one', 'of', 'the', 'most', 'used', 'of', 'the', '1990', "'", 's', ',', 'as', 'first', 'impressions', 'and', 'rumors', 'are', 'hardly', 'ever', 'what', 'they', 'seem', 'to', 'be', '.', 'the', 'phrase', 'especially', 'goes', 'for', 'oscar', 'novak', ',', 'an', 'architect', 'who', 'is', 'the', 'main', 'focus', 'of', 'three', 'to', 'tango', ',', 'a', 'delightful', ',', 'funny', 'romantic', 'comedy', 'about', 'assumptions', 'and', 'being', 'yourself', '.', 'novak', '(', 'matthew', 'perry', ')', ',', 'a', 'shy', ',', 'clumsy', ',', 'chicago', 'based', 'architect', ',', 'along', 'with', 'openly', 'gay', 'partner', ',', 'peter', 'steinberg', '(', 'oliver', 'platt', ')', ',', 'fights', 'for', 'projects', 'day', 'in', 'and', 'day', 'out', '.', 'one', 'of', 'these', 'is', 'the', 'job', 'of', 'restoring', 'a', 'popular', 'building', 'for', 'charles', 'newman', '(', 'dylan', 'mcdermott', ')', ',', 'a', 'rich', ',', 'well', '-', 'known',

In [None]:
## use words from all documents to define the word vector for features
# get all words from all movie_reviews and put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print(len(all_words))


39768


In [None]:
# get the 3500 most frequently appearing keywords in the corpus
word_items = all_words.most_common(3500)
word_features = [word for (word, freq) in word_items]   # just the words

In [None]:
# look at the first 100 words
print(word_features[:100])


[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']


In [None]:
# define features (keywords) of a document
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Map each keyword to whether it occurs in the document.

    Args:
        document: iterable of the document's words.
        word_features: iterable of keywords to test for.

    Returns:
        A dict with one 'V_<keyword>' key per keyword, whose value is True
        when the keyword appears in the document and False otherwise.
    """
    # A set makes each membership test constant time.
    vocabulary = set(document)
    return {'V_%s' % word: (word in vocabulary) for word in word_features}


In [None]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

In [None]:
# each feature set has one entry per keyword (3500 here), so printing one is optional
print(featuresets[0])



In [None]:
# training using naive Bayes classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)


In [None]:
# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier, test_set))
# the accuracy result may vary since we randomized the documents

0.86


In [None]:
# show which features of classifier are most informative
print(classifier.show_most_informative_features(30))


Most Informative Features
           V_outstanding = True              pos : neg    =     10.4 : 1.0
                 V_mulan = True              pos : neg    =      8.4 : 1.0
                V_seagal = True              neg : pos    =      7.7 : 1.0
           V_wonderfully = True              pos : neg    =      6.3 : 1.0
                 V_damon = True              pos : neg    =      6.2 : 1.0
                V_wasted = True              neg : pos    =      5.6 : 1.0
            V_ridiculous = True              neg : pos    =      5.4 : 1.0
                 V_awful = True              neg : pos    =      5.3 : 1.0
                 V_waste = True              neg : pos    =      5.2 : 1.0
                  V_lame = True              neg : pos    =      5.1 : 1.0
                 V_flynt = True              pos : neg    =      5.0 : 1.0
                V_poorly = True              neg : pos    =      4.8 : 1.0
                V_allows = True              pos : neg    =      4.5 : 1.0