In [131]:
import xml.etree.ElementTree as ET

In [132]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers 
from sklearn_crfsuite import metrics

In [133]:
# nltk.download_shell()

In [134]:
import os
import random
from random import shuffle

# XML Parser for NLTK Corpus
## What is this for
To train our CRF we have to prepare the data.  
It should have a format of list of sentences,   
with each word being represented by the following tuple:  
('word', 'some tag', 'maybe another tag as well', 'etc')

For starters, we gonna use these tags:
* ctag
* msd

## Parsing our Data

Unfortunately, ann_words.xml doesn't have info whether a word is a named entity or not.
Because of that, we have to use both ann_named.xml and ann_words.xml files.

In [135]:
# in this very ugly bit of code we remove every folder that doesnt contain ann_words.xml or ann_named.xml

wordsFilename = 'ann_words.xml'
namedFilename = 'ann_named.xml'
folders = [name for name in os.listdir("_")]

################################
folders = folders[:100]
################################

toRemove = []
i = 0
for folder in folders:
    folderPath = '.\\_\\' + folder + '\\'
    name = [name for name in os.listdir(folderPath)]
    if wordsFilename not in name or namedFilename not in name:
        toRemove.append(i)
    i += 1
    
i = 0
for j in toRemove:
    del folders[j-i]
    i += 1

noword = 0
for folder in folders:
    folderPath = '.\\_\\' + folder + '\\'
    if wordsFilename not in [name for name in os.listdir(folderPath)]:
        noword += 1

print(noword, len(folders), noword/len(folders))

0 100 0.0


In [136]:
train_sents = list()

for folder in folders:
    folderPath = '.\\_\\' + folder + '\\'

    tree = ET.parse(folderPath + wordsFilename)
    root = tree.getroot()
    tree = ET.parse(folderPath + namedFilename)
    namedRoot = tree.getroot()

    named_words = []

    for child in namedRoot.iter('{http://www.tei-c.org/ns/1.0}f'):
        if child.get('name') == 'orth':
            named_words.append(child.getchildren()[0].text)

    sent_arr = []

    for sentence in root.iter('{http://www.tei-c.org/ns/1.0}s'):
        data_arr = []
    
        for child in sentence.iter('{http://www.tei-c.org/ns/1.0}f'):
            if child.attrib['name'] == 'orth':
                word = child.getchildren()[0].text
                if word in named_words:
                    named = 'B'
                else:
                    named = 'O'
            elif child.attrib['name'] == 'ctag':
                ctag = child.getchildren()[0].attrib['value']
            elif child.attrib['name'] == 'msd':
                msd = child.getchildren()[0].attrib['value']
                data_tuple = (word, ctag, msd, named)
                data_arr.append(data_tuple)

        train_sent = list(data_arr)
        sent_arr.append(train_sent)

    train_sents.extend(sent_arr)

## Data preparation
Let's divide data into train set (90%) and test set (10%)

In [137]:
division = len(train_sents) // 10
shuffle(train_sents)
test_sents = train_sents[:division]
train_sents = train_sents[division:]

# CRF training

## Learning
We have to define the features based on which our classifier will learn.

In [138]:
def word2features(sent, i):
  word = sent[i][0]
  postag = sent[i][1]
  
  features = {
      'bias': 1.0,
      'word.lower()': word.lower(),
      'word[-3:]': word[-3:],
      'word[-2:]': word[-2:],
      'word.isupper()': word.isupper(),
      'word.istitle()': word.istitle(),
      'word.isdigit()': word.isdigit(),
      'postag': postag,
      'postag[:2]': postag[:2],
  }
  
  if i > 0:
    prev_word = sent[i-1][0]
    prev_postag = sent[i-1][1]
    features.update({
        '-1:word.lower()': prev_word.lower(),
        '-1:word.istitle()': prev_word.istitle(),
        '-1:word.isupper()': prev_word.isupper(),
        '-1:postag': prev_postag,
        '-1:postag[:2]': prev_postag[:2],
    })
  else:
    features['BOS'] = True
    
  if i < len(sent)-1:
    next_word = sent[i+1][0]
    next_postag = sent[i+1][1]
    features.update({
        '+1:word.lower()': next_word.lower(),
        '+1:word.istitle()': next_word.istitle(),
        '+1:word.isupper()': next_word.isupper(),
        '+1:postag': next_postag,
        '+1:postag[:2]': next_postag[:2],
    })
  else:
    features['EOS'] = True
    
  return features

def sent2features(sent):
  return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
  return [label for token, ctag, msd, label in sent]

def sent2tokens(sent):
  return [token for token, ctag, msd, label in sent]

In [139]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

## Training

In [140]:
crf = sklearn_crfsuite.CRF(
    algorithm='arow', 
#     c1=0.1, 
#     c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
_ = crf.fit(X_train, y_train)

## Evaluation

In [141]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted')

0.9783418116933347

In [142]:
print(metrics.flat_classification_report(
    y_test, y_pred, digits=3
))

             precision    recall  f1-score   support

          B      0.737     0.730     0.734       200
          O      0.989     0.989     0.989      4706

avg / total      0.978     0.978     0.978      4906

