In [1]:
# Import libraries
import nltk
import pprint
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

The first thing to do is pick a tagged corpus that we want to train our tagger on. Here, we use the nltk treebank corpus.

In [2]:
# Fetch the treebank corpus only when it is not already installed, so the
# notebook survives Restart & Run All on a fresh environment.
try:
    nltk.data.find('corpora/treebank')
except LookupError:
    nltk.download('treebank')
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [3]:
# Peek at the first sentence: a list of (word, POS tag) tuples
print(tagged_sentences[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [4]:
# Number of tagged sentences available in the corpus
print("Tagged sentences: ", len(tagged_sentences))

Tagged sentences:  3914


In [5]:
# Number of individual (word, tag) tokens across the whole corpus
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

Tagged words: 100676


Next, we need to determine what features our tagger will take into consideration when determining what tag to assign to a word. These can include whether the word starts with a capital letter, is entirely uppercase or entirely lowercase, contains capitals after its first letter, as well as its prefixes, suffixes, and neighbouring words:

In [6]:
# Define the features for each word
def features(sentence, index):
    """Build the feature dictionary for one word of a sentence.

    sentence: [w1, w2, ...] -- list of word strings
    index: the index of the word to describe

    Returns a dict mapping feature names to string or boolean values.

    The case flags are plain string comparisons, so a token without cased
    characters (digits, punctuation) equals both its upper- and lower-cased
    form and reports True for 'is_all_caps' and 'is_all_lower' (and for
    'is_capitalized' when non-empty).
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # An empty token has no first character; report False instead of raising.
        'is_capitalized': bool(word) and word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        # Slices (rather than word[0] / word[-1]) stay valid for an empty token.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Neighbouring words; the empty string marks a sentence boundary.
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first changes under lower-casing.
        'capitals_inside': word[1:].lower() != word[1:]
    }

In [7]:
# Test out the feature creator on a word: index 2 selects 'a' in this toy sentence
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'prev_word': 'is',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'word': 'a'}


Create a function to strip the tagged words of their tags so that we can feed them into our tagger:

In [8]:
# Create tag stripper function
def untag(tagged_sentence):
    """Return only the words of a sentence given as (word, tag) pairs."""
    words = []
    for token, _tag in tagged_sentence:
        words.append(token)
    return words

Now we need to build our training set. Our tagger takes features for each word individually, but our corpus is organized as sentences, so we need to do a little transforming. First, split the data into training and testing sets; the transformation function defined below is then applied to the training set.

In [9]:
# Split the dataset for training and testing
TRAIN_FRACTION = .75  # share of sentences, taken from the start of the corpus, used for training
cutoff = int(TRAIN_FRACTION * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]
# Sanity check: the two slices together cover the corpus with nothing lost or duplicated
assert len(training_sentences) + len(test_sentences) == len(tagged_sentences)
print(len(training_sentences)) # 2935
print(len(test_sentences)) # 979

2935
979


Create a function that collects the per-word feature dictionaries into `X` and the corresponding POS tags into `y`.

In [10]:
# Create a function that will assign the features to X and append the POS tags to Y
def transform_to_dataset(tagged_sentences):
    """Flatten tagged sentences into per-word training examples.

    tagged_sentences: iterable of sentences, each a sequence of (word, tag) pairs

    Returns (X, y): X is a list with one feature dict per word, y is the list
    of the corresponding tags, in the same order.
    """
    X, y = [], []
    for tagged in tagged_sentences:
        # Strip the tags once per sentence instead of once per word.
        words = untag(tagged)
        for index, (_word, tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(tag)
    return X, y

In [11]:
# Create the X and y dataset: one feature dict (X) and one POS tag (y) per training word
X_train, y_train = transform_to_dataset(training_sentences)

In [12]:
# Check size of data: number of per-word samples, and number of feature keys in the first sample
print(f'Datapoints: {len(X_train)}') 
print(f'Features: {len(X_train[0])}')

Datapoints: 75784
Features: 17


With the function applied to the training set, we can now train our tagger. It's basically a classifier since it's categorizing words into classes, so we can use a classification algorithm. You can use any that you like or try out a bunch of them to see which works best. Here, we'll use a random forest classifier (an ensemble of decision trees). Build a pipeline that vectorizes the feature dictionaries and passes them to the classifier, fit it on the training data, and print the accuracy score on the test set.

In [13]:
# Create a pipeline for training: the vectorizer encodes each feature dict as a
# numeric row, which the random forest then classifies.
clf = Pipeline([
    # NOTE(review): sparse=False produces a dense array, which can get large
    # with many distinct string feature values -- consider sparse output.
    ('vectorizer', DictVectorizer(sparse=False)),
    # A fixed random_state makes the forest, and so the reported accuracy,
    # reproducible across runs.
    ('classifier', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42))
])

In [14]:
# Perform the training - this will take a while
# Only the first 50000 training samples are used for fitting. The trailing
# semicolon suppresses the notebook's display of the fitted pipeline.
clf.fit(X_train[0:50000], y_train[0:50000]);

In [15]:
# Create the test set by applying the same feature extraction to the held-out sentences
X_test, y_test = transform_to_dataset(test_sentences)

In [16]:
# Check size of data: number of per-word test samples, and number of feature keys in the first sample
print(f'Datapoints: {len(X_test)}') 
print(f'Features: {len(X_test[0])}')

Datapoints: 24892
Features: 17


In [17]:
# Score the fitted pipeline on the held-out test set
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.9446408484653704
