In [38]:
import xml.etree.ElementTree as ET

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers 
from sklearn_crfsuite import metrics

import os
import random
from random import shuffle

In [39]:
# nltk.download_shell()

# XML Parser for NLTK Corpus
## What is this for
To train our CRF we have to prepare the data.  
It should have a format of list of sentences,   
with each word being represented by the following tuple:  
('word', 'some tag', 'maybe another tag as well', 'etc')

For starters, we gonna use these tags:
* ctag
* msd

## Parsing our Data

Unfortunately, ann_words.xml doesn't have info whether a word is a named entity or not.
Because of that, we have to use both ann_named.xml and ann_words.xml files.

In [40]:
# in this very ugly bit of code we remove every folder that doesnt contain ann_words.xml or ann_named.xml

wordsFilename = 'ann_words.xml'
namedFilename = 'ann_named.xml'
folders = [name for name in os.listdir("_")]

################################
folders = folders[:100]
################################

toRemove = []
i = 0
for folder in folders:
    folderPath = '.\\_\\' + folder + '\\'
    name = [name for name in os.listdir(folderPath)]
    if wordsFilename not in name or namedFilename not in name:
        toRemove.append(i)
    i += 1
    
i = 0
for j in toRemove:
    del folders[j-i]
    i += 1

noword = 0
for folder in folders:
    folderPath = '.\\_\\' + folder + '\\'
    if wordsFilename not in [name for name in os.listdir(folderPath)]:
        noword += 1

print(noword, len(folders), noword/len(folders))

0 100 0.0


In [41]:
train_sents = list()

for folder in folders:
    folderPath = '.\\_\\' + folder + '\\'

    tree = ET.parse(folderPath + wordsFilename)
    root = tree.getroot()
    tree = ET.parse(folderPath + namedFilename)
    namedRoot = tree.getroot()

    named_words = []

    for child in namedRoot.iter('{http://www.tei-c.org/ns/1.0}f'):
        if child.get('name') == 'orth':
            named_words.append(child.getchildren()[0].text)

    sent_arr = []

    for sentence in root.iter('{http://www.tei-c.org/ns/1.0}s'):
        data_arr = []
    
        for child in sentence.iter('{http://www.tei-c.org/ns/1.0}f'):
            if child.attrib['name'] == 'orth':
                word = child.getchildren()[0].text
                if word in named_words:
                    named = 'B'
                else:
                    named = 'O'
            elif child.attrib['name'] == 'ctag':
                ctag = child.getchildren()[0].attrib['value']
            elif child.attrib['name'] == 'msd':
                msd = child.getchildren()[0].attrib['value']
                data_tuple = (word, ctag, msd, named)
                #data_tuple = (word, str(random.randint(100,500)), msd, named)
                data_arr.append(data_tuple)

        train_sent = list(data_arr)
        sent_arr.append(train_sent)

    train_sents.extend(sent_arr)

## Data preparation
Let's divide data into train set (90%) and test set (10%)

In [42]:
division = len(train_sents) // 10
shuffle(train_sents)
test_sents = train_sents[:division]
train_sents = train_sents[division:]

# CRF training

## Learning
We have to define the features based on which our classifier will learn.

In [43]:
def word2features(sent, i):
  word = sent[i][0]
  postag = sent[i][1]
  
  features = {
      'bias': 1.0,
      'word.lower()': word.lower(),
      'word[-3:]': word[-3:],
      'word[-2:]': word[-2:],
      'word.isupper()': word.isupper(),
      'word.istitle()': word.istitle(),
      'word.isdigit()': word.isdigit(),
      'postag': postag,
      'postag[:2]': postag[:2],
  }
  
  if i > 0:
    prev_word = sent[i-1][0]
    prev_postag = sent[i-1][1]
    features.update({
        '-1:word.lower()': prev_word.lower(),
        '-1:word.istitle()': prev_word.istitle(),
        '-1:word.isupper()': prev_word.isupper(),
        '-1:postag': prev_postag,
        '-1:postag[:2]': prev_postag[:2],
    })
  else:
    features['BOS'] = True
    
  if i < len(sent)-1:
    next_word = sent[i+1][0]
    next_postag = sent[i+1][1]
    features.update({
        '+1:word.lower()': next_word.lower(),
        '+1:word.istitle()': next_word.istitle(),
        '+1:word.isupper()': next_word.isupper(),
        '+1:postag': next_postag,
        '+1:postag[:2]': next_postag[:2],
    })
  else:
    features['EOS'] = True
    
  return features

def sent2features(sent):
  return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
  return [label for token, ctag, msd, label in sent]

def sent2tokens(sent):
  return [token for token, ctag, msd, label in sent]

In [44]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

## Training

In [45]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
_ = crf.fit(X_train, y_train)

## Evaluation

In [47]:
y_pred = crf.predict(X_test)

print(metrics.flat_classification_report(
    y_test, y_pred, digits=3
))

             precision    recall  f1-score   support

          B      0.833     0.723     0.774       173
          O      0.989     0.994     0.992      4533

avg / total      0.984     0.984     0.984      4706



## 10-fold cross-validation

In [49]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.0min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017046169198>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001704B89B9E8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [50]:
y_pred = rs.predict(X_test)

print(metrics.flat_classification_report(
    y_test, y_pred, digits=3
))

             precision    recall  f1-score   support

          B      0.833     0.751     0.790       173
          O      0.991     0.994     0.992      4533

avg / total      0.985     0.985     0.985      4706

