# Sequential Labeling

In machine learning, sequence labeling is a type of pattern recognition task that involves the algorithmic assignment of a categorical label to each member of a sequence of observed values. A common example of a sequence labeling task is part of speech tagging, which seeks to assign a part of speech to each word in an input sentence or document.

There are two well-known algorithms for sequence labeling:
* HMM (Hidden Markov Model)
* CRF (Conditional Random Field)

This tutorial uses a CRF to train a part-of-speech (POS) tagger.

In [1]:
from __future__ import unicode_literals
import pycrfsuite

In [2]:
from nltk.corpus import mac_morpho

In [3]:
# Preview the first ten tagged sentences of the mac_morpho corpus.
list(mac_morpho.tagged_sents()[:10])

[[(u'Jersei', u'N')],
 [(u'atinge', u'V')],
 [(u'm\xe9dia', u'N')],
 [(u'de', u'PREP')],
 [(u'Cr$', u'CUR')],
 [(u'1,4', u'NUM')],
 [(u'milh\xe3o', u'N')],
 [(u'em', u'PREP|+')],
 [(u'a', u'ART')],
 [(u'venda', u'N')]]

In [4]:
# Keep only the first 1,000 tagged sentences so the example trains quickly.
tagged_sents = mac_morpho.tagged_sents()[:1000]

In [5]:
# could you come up with better features?
def word2features(sent, i):
    """Return the feature strings for the token at index ``i`` of ``sent``.

    ``sent`` is indexed as a sequence of word strings.

    The result starts with the token's own features: a constant ``'bias'``,
    the lower-cased form, the last three and last two characters, and the
    upper-case / title-case / digit flags.  These are followed by the
    previous token's features (or ``'BOS'`` when ``i`` is not greater than
    zero) and then the next token's features (or ``'EOS'`` when ``i`` is
    not below the last index).
    """
    token = sent[i]
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
    ]

    # Left context: mark the beginning of the sentence when there is none.
    if i <= 0:
        feats.append('BOS')
    else:
        previous = sent[i - 1]
        feats += [
            '-1:word.lower=' + previous.lower(),
            '-1:word.istitle=%s' % previous.istitle(),
            '-1:word.isupper=%s' % previous.isupper(),
        ]

    # Right context: mark the end of the sentence when there is none.
    last_index = len(sent) - 1
    if i >= last_index:
        feats.append('EOS')
    else:
        following = sent[i + 1]
        feats += [
            '+1:word.lower=' + following.lower(),
            '+1:word.istitle=%s' % following.istitle(),
            '+1:word.isupper=%s' % following.isupper(),
        ]

    return feats
    
def sent2features(sent):
    """Return one ``word2features`` result per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the labels of a sentence given as ``(token, label)`` pairs."""
    labels = []
    for _word, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the tokens of a sentence given as ``(token, label)`` pairs."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens

In [6]:
# Each tagged sentence is a sequence of (token, tag) pairs: the bare tokens
# feed the feature extractor and the tags become the training labels.
# sent2tokens is reused here instead of repeating its unpacking loop inline.
sentences = [sent2tokens(s) for s in tagged_sents]

X_train = [sent2features(s) for s in sentences]
y_train = [sent2labels(s) for s in tagged_sents]

In [7]:
# Create a CRFsuite trainer with verbose output turned off, then select the
# 'lbfgs' training algorithm and the 'crf1d' model type.
trainer = pycrfsuite.Trainer(verbose=False)
trainer.select(algorithm='lbfgs', type='crf1d')

In [8]:
# Hand every (feature sequence, label sequence) pair to the trainer, then
# train; 'pos.model' is the model file name passed to train().
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.train('pos.model')

In [9]:
# Open the 'pos.model' file (the name given to trainer.train) in a tagger.
tagger = pycrfsuite.Tagger()
tagger.open('pos.model')

<contextlib.closing at 0x7f88a0d260d0>

In [10]:
# Tag a whitespace-tokenised example sentence with the loaded model.
sent = 'O menino jogou a bola azul no gol .'.split()
X_test = sent2features(sent)
guess = tagger.tag(X_test)
# print(...) with a single argument behaves the same on Python 2 and 3
# (the original `print zip(...)` statement is a syntax error on Python 3);
# list() materialises the pairs because zip() is lazy on Python 3.
print(list(zip(sent, guess)))

[(u'O', 'ART'), (u'menino', 'N'), (u'jogou', 'V'), (u'a', 'ART'), (u'bola', 'ADJ'), (u'azul', 'N'), (u'no', 'N'), (u'gol', 'N'), (u'.', 'N')]
