In [232]:
import xml.etree.ElementTree as ET

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers 
from sklearn_crfsuite import metrics

import os
import random
from random import shuffle

In [233]:
# nltk.download_shell()

# XML Parser for NLTK Corpus
## What is this for
To train our CRF we have to prepare the data.  
It should have a format of list of sentences,   
with each word being represented by the following tuple:  
('word', 'some tag', 'maybe another tag as well', 'etc')

For starters, we are going to use these tags:
* ctag
* msd

## Parsing our Data

Unfortunately, ann_words.xml does not say whether a word is a named entity or not.
Because of that, we have to use both the ann_named.xml and ann_words.xml files.

First, we have to drop every folder that is missing either ann_words.xml or ann_named.xml.

In [234]:
wordsFilename = 'ann_words.xml'
namedFilename = 'ann_named.xml'

# Corpus root, relative to the notebook's working directory; each sub-folder
# is expected to hold the annotation files of one text.
corpusDir = '_'

# os.listdir() returns entries in arbitrary order, so sort them to get the
# same folder order on every run.
folders = sorted(os.listdir(corpusDir))

################################
# for debug purposes
# folders = folders[:100]
################################

# Keep only the folders that contain BOTH annotation files.
# Paths are built with os.path.join so the cell is not tied to Windows
# separators, and entries that are not directories are skipped instead of
# making os.listdir() raise.
requiredFiles = {wordsFilename, namedFilename}
folders = [
    folder for folder in folders
    if os.path.isdir(os.path.join(corpusDir, folder))
    and requiredFiles.issubset(os.listdir(os.path.join(corpusDir, folder)))
]

Now we can parse the files into a list of sentences.

In [235]:
# Every sentence of the corpus, as a list of (word, ctag, msd, label) tuples.
train_sents = list()

# Running count of 'orth' features seen in the ann_words.xml files.
numberTokens = 0

# All elements queried below live in the TEI namespace.
TEI_NS = '{http://www.tei-c.org/ns/1.0}'

for folder in folders:
    # os.path.join keeps the cell independent of the platform's path separator.
    folderPath = os.path.join('.', '_', folder)

    root = ET.parse(os.path.join(folderPath, wordsFilename)).getroot()
    namedRoot = ET.parse(os.path.join(folderPath, namedFilename)).getroot()

    # Orthographic forms listed in this folder's ann_named.xml.
    # A set gives constant-time membership tests in the token loop below.
    named_words = set()
    for child in namedRoot.iter(TEI_NS + 'f'):
        if child.get('name') == 'orth':
            # child[0] is the first sub-element; Element.getchildren() was
            # removed from ElementTree in Python 3.9.
            named_words.add(child[0].text)

    for sentence in root.iter(TEI_NS + 's'):
        data_arr = []

        # The features of one token are read in sequence: 'orth' sets the word
        # and its label, 'ctag' sets the tag, and 'msd' completes the tuple.
        for child in sentence.iter(TEI_NS + 'f'):
            featName = child.attrib['name']
            if featName == 'orth':
                numberTokens += 1
                word = child[0].text
                # 'B' if the form occurs among this folder's named-entity
                # forms, 'O' otherwise.
                named = 'B' if word in named_words else 'O'
            elif featName == 'ctag':
                ctag = child[0].attrib['value']
            elif featName == 'msd':
                msd = child[0].attrib['value']
                data_arr.append((word, ctag, msd, named))

        train_sents.append(data_arr)

## Data preparation
Let's divide the data into a training set (90%) and a test set (10%).

In [236]:
# Fixed seed so the shuffle, and therefore the train/test split and every
# score computed from it, is the same on every run.
SPLIT_SEED = 42

# One tenth of the sentences go to the test set, the rest stay for training.
division = len(train_sents) // 10
# A dedicated Random instance leaves the global random state untouched.
random.Random(SPLIT_SEED).shuffle(train_sents)
test_sents = train_sents[:division]
# NOTE: train_sents is rebound to the remaining 90%, so re-running this cell
# without re-running the parsing cell splits the already reduced set again.
train_sents = train_sents[division:]

In [237]:
# Report the token count and the number of sentences in each split.
summary_rows = [
    ('Total number of tokens: ', numberTokens, ''),
    ('Training set: ', len(train_sents), " sentences."),
    ('Testing set: ', len(test_sents), " sentences."),
]
for prefix, count, suffix in summary_rows:
    print(prefix + str(count) + suffix)

Total number of tokens: 983657
Training set: 65494 sentences.
Testing set: 7277 sentences.


# CRF training

## Learning
We have to define the features based on which our classifier will learn.

In [238]:
def _neighbour_features(token, prefix):
    """Return the context features of a neighbouring token.

    `token` is a (word, ctag, msd, ...) tuple and `prefix` ('-1:' or '+1:')
    is prepended to every feature name.
    """
    form, tag, morph = token[0], token[1], token[2]
    morph_parts = morph.split(':')
    return {
        prefix + 'word.lower()': form.lower(),
        prefix + 'word.istitle()': form.istitle(),
        prefix + 'word.isupper()': form.isupper(),
        prefix + 'ctag': tag,
        prefix + 'msg': morph,
        prefix + 'msg start': morph_parts[0],
        prefix + 'msg end': morph_parts[-1],
    }


def word2features(sent, i):
    """Build the feature dict for the token at position `i` of `sent`.

    `sent` is a sequence of tuples whose first three items are the word, its
    ctag and its msd string. The result describes the token itself (casing,
    suffixes, tags, first and last ':'-separated msd field) plus the same kind
    of information for the previous and next token. 'BOS' / 'EOS' are set
    instead when the token is the first / last one in the sentence.
    """
    form, tag, morph = sent[i][0], sent[i][1], sent[i][2]
    morph_parts = morph.split(':')

    features = {
        'bias': 1.0,
        'word.lower()': form.lower(),
        'word[-3:]': form[-3:],
        'word[-2:]': form[-2:],
        'word.isupper()': form.isupper(),
        'word.istitle()': form.istitle(),
        'word.isdigit()': form.isdigit(),
        'ctag': tag,
        'msg': morph,
        'msg start': morph_parts[0],
        'msg end': morph_parts[-1],
    }

    # Left context, or the beginning-of-sentence marker.
    if i > 0:
        features.update(_neighbour_features(sent[i - 1], '-1:'))
    else:
        features['BOS'] = True

    # Right context, or the end-of-sentence marker.
    if i < len(sent) - 1:
        features.update(_neighbour_features(sent[i + 1], '+1:'))
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return one feature dict per token of `sent`, in sentence order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the label (fourth field) of every 4-tuple in `sent`."""
    labels = []
    for entry in sent:
        # Unpacking keeps the requirement that every entry has exactly four fields.
        _token, _ctag, _msd, label = entry
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every 4-tuple in `sent`."""
    tokens = []
    for entry in sent:
        # Unpacking keeps the requirement that every entry has exactly four fields.
        token, _ctag, _msd, _label = entry
        tokens.append(token)
    return tokens

In [245]:
# Convert both splits into what the CRF consumes: per-sentence lists of
# feature dicts (X_*) and per-sentence lists of labels (y_*).
X_train, y_train = [], []
for sentence in train_sents:
    X_train.append(sent2features(sentence))
    y_train.append(sent2labels(sentence))

X_test, y_test = [], []
for sentence in test_sents:
    X_test.append(sent2features(sentence))
    y_test.append(sent2labels(sentence))

## 10-fold cross-validation

In [246]:
%%time

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted')

rs = RandomizedSearchCV(crf, 
                        params_space,
                        cv=10,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=1,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  8.6min finished


Wall time: 10min 16s


In [247]:
# Label the held-out sentences with the fitted search object.
y_pred = rs.predict(X_test)

# Per-label precision / recall / F1, computed over the flattened sequences.
report = metrics.flat_classification_report(y_test, y_pred, digits=3)
print(report)

             precision    recall  f1-score   support

          B      0.933     0.885     0.908      5063
          O      0.994     0.997     0.995     94082

avg / total      0.991     0.991     0.991     99145

