In [2]:
from nltk.corpus import names
import random
import nltk

labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
               [(name, 'female') for name in names.words('female.txt')])

random.shuffle(labeled_names)

In [3]:
def gender_features(word):
    feat = dict()
    
    nvow = len([let for let in word if let in ['a','e','i','o','u', 'y']]) #(Kane, 2024)
    ot = len(word) - nvow
    
    #feat['last_letter'] = word[-1]
    feat['first_letter'] = word[0]
    #feat['next_to_last_letter'] = word[-2]
    feat['name_length'] = len(word)
    feat['vowels'] = nvow 
    feat['last_letter_vowel'] = (word[-1] in ['a','e','i','o','u', 'y'])
    feat['more_vowels_than_consonants'] = (nvow >= ot)
    return feat

In [4]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [62]:
classifier.classify(gender_features('Dave'))


'female'

In [63]:
classifier.classify(gender_features('Ash'))

'male'

In [64]:
print(nltk.classify.accuracy(classifier, test_set))

0.76


In [66]:
classifier.show_most_informative_features(20)


Most Informative Features
                  vowels = 0                male : female =      5.8 : 1.0
            first_letter = 'W'              male : female =      4.8 : 1.0
       last_letter_vowel = False            male : female =      3.2 : 1.0
                  vowels = 1                male : female =      2.8 : 1.0
            first_letter = 'Q'              male : female =      2.8 : 1.0
            first_letter = 'U'              male : female =      2.7 : 1.0
                  vowels = 4              female : male   =      2.6 : 1.0
            first_letter = 'K'            female : male   =      2.4 : 1.0
            first_letter = 'H'              male : female =      2.3 : 1.0
            first_letter = 'X'              male : female =      2.3 : 1.0
       last_letter_vowel = True           female : male   =      2.2 : 1.0
             name_length = 3                male : female =      2.1 : 1.0
             name_length = 2                male : female =      2.0 : 1.0

In [68]:
labeled_names

[('Zenia', 'female'),
 ('Annelise', 'female'),
 ('Ruperto', 'male'),
 ('Katrinka', 'female'),
 ('Jasmine', 'female'),
 ('Aamir', 'male'),
 ('Alla', 'female'),
 ('Lucia', 'female'),
 ('Davoud', 'male'),
 ('Carlota', 'female'),
 ('Georgianna', 'female'),
 ('Mandie', 'female'),
 ('Lana', 'female'),
 ('Danny', 'male'),
 ('Sheffy', 'male'),
 ('Kynthia', 'female'),
 ('Rori', 'female'),
 ('Marcella', 'female'),
 ('Hilda', 'female'),
 ('Issi', 'female'),
 ('Burnaby', 'male'),
 ('Valencia', 'female'),
 ('Arvy', 'male'),
 ('Mikhail', 'male'),
 ('Phil', 'male'),
 ('Toddie', 'male'),
 ('Dawn', 'female'),
 ('Calley', 'female'),
 ('Cam', 'female'),
 ('Kaiser', 'male'),
 ('Winni', 'female'),
 ('Deana', 'female'),
 ('Luke', 'male'),
 ('Gallagher', 'male'),
 ('Kati', 'female'),
 ('Niels', 'male'),
 ('Katalin', 'female'),
 ('Kai', 'female'),
 ('Liana', 'female'),
 ('Ninnetta', 'female'),
 ('Maria', 'female'),
 ('Grace', 'male'),
 ('Dalenna', 'female'),
 ('Tarrance', 'male'),
 ('Wendall', 'male'),
 ('Lev

In [5]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features


In [8]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features('Ben'))

'female'

In [7]:
print(nltk.classify.accuracy(classifier, test_set))


0.774


In [9]:
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'k'              male : female =     75.6 : 1.0
             last_letter = 'a'            female : male   =     33.1 : 1.0
             last_letter = 'f'              male : female =     16.7 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'd'              male : female =      9.2 : 1.0
             last_letter = 'v'              male : female =      9.2 : 1.0
             last_letter = 'm'              male : female =      9.1 : 1.0
             last_letter = 'o'              male : female =      8.0 : 1.0
             last_letter = 'r'              male : female =      6.9 : 1.0
                count(a) = 3              female : male   =      5.7 : 1.0
             last_letter = 'g'              male : female =      5.3 : 1.0
             last_letter = 'w'              male : female =      4.8 : 1.0
            first_letter = 'w'              male : female =      4.5 : 1.0

In [11]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [29]:
def gender_features(word):
    feat = dict()
    
    nvow = len([let for let in word if let in ['a','e','i','o','u', 'y']]) #(Kane, 2024)
    ot = len(word) - nvow
    
    feat['last_letter'] = word[-1]
    feat['first_letter'] = word[0]
    feat['first_2_letters'] = word[:2]
    feat['middle_letter'] = word[len(word) // 2]
    if (len(word) > 2): feat['last_3_letters'] = word[-3:]
    if (len(word) > 1): feat['last_2_letters'] = word[-2:]
    feat['next_to_last_letter'] = word[-2]
    feat['name_length'] = len(word)
    feat['vowels'] = nvow 
    feat['last_letter_vowel'] = (word[-1] in ['a','e','i','o','u', 'y'])
    feat['more_vowels_than_consonants'] = (nvow >= ot)
    return feat

In [30]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.82


In [31]:
classifier.show_most_informative_features(20)


Most Informative Features
          last_2_letters = 'na'           female : male   =     85.8 : 1.0
             last_letter = 'k'              male : female =     68.6 : 1.0
          last_2_letters = 'la'           female : male   =     66.2 : 1.0
          last_2_letters = 'ia'           female : male   =     35.2 : 1.0
             last_letter = 'a'            female : male   =     32.1 : 1.0
          last_2_letters = 'ra'           female : male   =     31.1 : 1.0
          last_2_letters = 'sa'           female : male   =     31.0 : 1.0
          last_2_letters = 'ta'           female : male   =     29.4 : 1.0
          last_3_letters = 'ana'          female : male   =     24.4 : 1.0
          last_2_letters = 'do'             male : female =     23.9 : 1.0
          last_3_letters = 'tta'          female : male   =     22.2 : 1.0
          last_2_letters = 'us'             male : female =     22.1 : 1.0
          last_2_letters = 'ch'             male : female =     21.7 : 1.0

In [32]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [28]:
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Abbe                          
correct=female   guess=male     name=Adelind                       
correct=female   guess=male     name=Aeriell                       
correct=female   guess=male     name=Ag                            
correct=female   guess=male     name=Alis                          
correct=female   guess=male     name=Alison                        
correct=female   guess=male     name=Allsun                        
correct=female   guess=male     name=Bab                           
correct=female   guess=male     name=Bo                            
correct=female   guess=male     name=Brier                         
correct=female   guess=male     name=Carolin                       
correct=female   guess=male     name=Carrol                        
correct=female   guess=male     name=Charlot                       
correct=female   guess=male     name=Charmion                      
correct=female   guess=male     name=Chery      

In [42]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [43]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [44]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [45]:
print(nltk.classify.accuracy(classifier, test_set))

0.78


In [46]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
         contains(mulan) = True              pos : neg    =      8.3 : 1.0
        contains(seagal) = True              neg : pos    =      7.9 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.3 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0


In [47]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [48]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [49]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [50]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [51]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [52]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)


0.6270512182993535

In [53]:
classifier.classify(pos_features('cats'))

'NNS'

In [54]:
classifier.classify(pos_features('cross'))

'NN'

In [55]:
classifier.classify(pos_features('was'))

'BEDZ'

In [58]:
classifier.classify(pos_features('breed'))

'VBN'

In [57]:
print(classifier.pseudocode(depth=4))


if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [60]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [61]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [62]:
def pos_features(sentence, i, history): 
     features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
     if i == 0:
         features["prev-word"] = "<START>"
         features["prev-tag"] = "<START>"
     else:
         features["prev-word"] = sentence[i-1]
         features["prev-tag"] = history[i-1]
     return features

class ConsecutivePosTagger(nltk.TaggerI): 

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [64]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.accuracy(test_sents))

0.7980528511821975


In [65]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [66]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

In [67]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.667


In [68]:
classifier.most_informative_features(20)

[('contains(hi)', True),
 ('contains(>)', True),
 ('contains(empty)', True),
 ('contains(part)', True),
 ('contains(brb)', True),
 ('contains(no)', True),
 ('contains(<)', True),
 ('contains(yes)', True),
 ('contains(0)', True),
 ('contains(are)', True),
 ('contains(na)', True),
 ('contains(ok)', True),
 ('contains(lol)', True),
 ('contains(tc)', True),
 ('contains(what)', True),
 ('contains(wan)', True),
 ('contains(where)', True),
 ('contains(right)', True),
 ('contains(and)', True),
 ('contains(u)', True)]

In [78]:
classifier.classify(dialogue_act_features("Put your hands up."))

'Statement'