In [7]:
from nltk.corpus import names

import random

# WARMING UP EXERCISE: BUILD A GENDER CLASSIFIER

names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])

# Create random name list
random.shuffle(names)

In [25]:
# Show examples
print([name for (name, gender) in names[:10]])    

['Dudley', 'Joyan', 'Maud', 'Ainslie', 'Marius', 'Tobey', 'Urbain', 'Shara', 'Kathryn', 'Bernita']


In [28]:
# Gender Feature function
def gender_features(word):
    return {'last_letter': word[-1]}

# Try gender_feature
gender_features('Thinh')

{'last_letter': 'h'}

In [36]:
# Generate a training set
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test classifier
classifier.classify(gender_features('Adam'))

# Test accuracy
print(nltk.classify.accuracy(classifier, test_set))

0.728


In [38]:
# Statistic on features
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     34.3 : 1.0
             last_letter = 'k'              male : female =     30.4 : 1.0
             last_letter = 'v'              male : female =     17.6 : 1.0
             last_letter = 'f'              male : female =     17.4 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0


In [53]:
# -------------------------------
# ERROR ANALYSIS TECHNIQUE
# -------------------------------

# Create name list
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

# Create development set and test sest
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]

# Train classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test accuracy
print(nltk.classify.accuracy(classifier, devtest_set))

# Generate a list of ERRORs
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

# Print out ERRORs list
count = 0
for (tag, guess, name) in sorted(errors[:10]): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print('correct=%-8s guess=%-8s namme=%-30s' % (tag, guess, name)) 
    

0.781
correct=female   guess=male     namme=Lust                          
correct=female   guess=male     namme=Robinet                       
correct=female   guess=male     namme=Roselyn                       
correct=male     guess=female   namme=Abel                          
correct=male     guess=female   namme=Benjie                        
correct=male     guess=female   namme=Boyce                         
correct=male     guess=female   namme=Dana                          
correct=male     guess=female   namme=Darcy                         
correct=male     guess=female   namme=Jodie                         
correct=male     guess=female   namme=Kalil                         


In [58]:
# Base on ERROR ANALYSIS, adjusting gender_features
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2' : word[-2:]}

# Rebuild the classifier
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test accuracy
print(nltk.classify.accuracy(classifier, devtest_set))

# => The accuracy increase thanks to adding more feature

0.791


In [71]:
# DOCUMENT CLASSIFICATION

from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Examples
print([c for (f,c) in documents[:5]])

['neg', 'pos', 'pos', 'neg', 'neg']


In [83]:
# FEATURE EXTRACTOR

# Find 2k most frequent words
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) # sorted
word_features = list(all_words.keys())[:2000]

# Generate features
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# Example
# print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

In [86]:
# TRAINING AND TESTING A CLASSIFIER FOR DOCUMENT CLASSIFICATION
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test accuracy
print(nltk.classify.accuracy(classifier, test_set))

# Find most informative features
classifier.show_most_informative_features(5)

0.81
Most Informative Features
           contains(ugh) = True              neg : pos    =      9.5 : 1.0
 contains(unimaginative) = True              neg : pos    =      8.2 : 1.0
          contains(mena) = True              neg : pos    =      6.9 : 1.0
        contains(shoddy) = True              neg : pos    =      6.9 : 1.0
        contains(suvari) = True              neg : pos    =      6.9 : 1.0


In [1]:
# ---------------------------------
# PART OF SPEECH TAGGING
# ---------------------------------
import nltk
from  nltk.corpus import brown

# Find the most common suffixes
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

common_suffixes = list(suffix_fdist.keys())[:100]
print(common_suffixes)

['e', 'he', 'the', 'n', 'on', 'ton', 'y', 'ty', 'nty', 'd', 'nd', 'and', 'ry', 'ury', 'id', 'aid', 'ay', 'day', 'an', 'ion', 'f', 'of', 's', "'s", "a's", 't', 'nt', 'ent', 'ary', 'ed', 'ced', '`', '``', 'o', 'no', 'ce', 'nce', "'", "''", 'at', 'hat', 'ny', 'any', 'es', 'ies', 'k', 'ok', 'ook', 'ace', '.', 'r', 'er', 'her', 'in', 'end', 'ts', 'nts', 'ity', 've', 'ive', 'ee', 'tee', ',', 'h', 'ch', 'ich', 'ad', 'had', 'l', 'll', 'all', 'ge', 'rge', 'ves', 'se', 'ise', 'ks', 'nks', 'a', 'ta', 'nta', 'or', 'for', 'ner', 'as', 'was', 'ted', 'ber', 'm', 'rm', 'erm', 'en', 'een', 'ged', 'by', 'ior', 'rt', 'urt', 'dge', 'od']


In [2]:
# Create a feature extractor
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

# Note: word.lower().endswith(suffix) is a boolean
'thinh'.lower().endswith('nh')

True

In [3]:
# Train a 'decision tree' classifier
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

classifier = nltk.DecisionTreeClassifier.train(train_set)

# Test accuracy
print(nltk.classify.accuracy(classifier, test_set))

0.5689706613625062


In [13]:
# Try the classifier
print(classifier.classify(pos_features('cats')))
print(classifier.classify(pos_features('missed')))
print(classifier.classify(pos_features('him')) + '\n')

# See the decision Tree
print(classifier.pseudocode(depth=5))

# NOTE: The classifer has low accuracy partly because it does not take into account the context of the sentence

NNS
VBN
NN

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: 
        if endswith(of) == False: return '.'
        if endswith(of) == True: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(was) == False: 
        if endswith(as) == False: return 'PP$'
        if endswith(as) == True: return 'CS'
      if endswith(was) == True: return 'BEDZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [16]:
# Add a context-dependent feature extractor
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

# Try the new extractor
pos_features(brown.sents()[0],8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [19]:
# Create new feature sets
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))
        
# NOTE: i represents the index using enumerate() method
#       nltk.tag.untag feature

In [20]:
# Train the classifer again
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test the accuracy
print(nltk.classify.accuracy(classifier, test_set))

# NOTE: The accuracy increases thanks to introducing context, however the classifier
# only check the actual previous words, not their tags. This can make our context-dependent feature less effective  

0.7891596220785678


In [22]:
# SEQUENCE CLASSIFICATION
# Strategy: consecutive classification (greedy sequence classification)

# Introduce tagged prev_word to feature extractor by adding history factor
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset,tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [23]:
# Train the new classifier
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)

# Test the accuracy
print(tagger.evaluate(test_sents))

# NOTE: the accuracy increases thanks to adding history factor. However, the limitation of this approach is that when we label,
# a word, we cannot go back to relabel it when we have more evidence to make better decision. 
# => OPTION 1: Consider a transformational strategy (checking the Brill tagger)
# => OPTION 2: HIDDEN MARKOV MODELS 
#              assign scores to all possible sequences of POS tags, and choose the highest (generate probability distribution) 
#              use dynamic programming to reduce complexity 
#              (checking Maximum Entropy Markov Models & Linear-Chain Conditional Random Field Models)

0.7980528511821975


In [24]:
# -------------------------------
# SENTENCE SEGMENTATION TECHNIQUE
# -------------------------------

# Generating necessary features
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

# NOTE: tokens is a merged list of individual sentences
#       boundaries is a set containing indexes of all sentence-boundary tokens

In [25]:
# Punctuation Feature Extractor
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prevword': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

# Generate a feature set
featuresets = [(punct_features(tokens, i), (i in boundaries))
                    for i in range(1, len(tokens)-1)
                    if tokens[i] in '.?!']

In [26]:
# Train the punctuatioin classifier
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test the accuracy
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [32]:
classifier.show_most_informative_features(5)

Most Informative Features
                prevword = 'mr'            False : True   =    136.9 : 1.0
                prevword = '3'             False : True   =     43.2 : 1.0
                prevword = '7'             False : True   =     40.4 : 1.0
                prevword = '2'             False : True   =     37.7 : 1.0
                prevword = '6'             False : True   =     25.6 : 1.0


In [43]:
# Using the punctuatioin classifier to build a sentence segmenter
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words[:-1]):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [46]:
# Test sentence segmenter
# text = "Nice to meet you ! my name is Thinh . Can we be friends ?"
text = ["Nice", "to", "meet", "you", "!", "my", "name", "is", "Thinh", ".", "Can", "we", "be", "friends", "?"]
print(text)
segments = segment_sentences(text)
print(segments)

['Nice', 'to', 'meet', 'you', '!', 'my', 'name', 'is', 'Thinh', '.', 'Can', 'we', 'be', 'friends', '?']
[['Nice', 'to', 'meet', 'you', '!'], ['my', 'name', 'is', 'Thinh', '.'], ['Can', 'we', 'be', 'friends', '?']]


In [50]:
# Recreate sentece from BOW
s = ' '
sentences = [s.join(segment) for segment in segments]
print(sentences)

['Nice to meet you !', 'my name is Thinh .', 'Can we be friends ?']
