# Conditional Markov Model 
### A.k.a. Maximum Entropy Markov Model (MEMM)
### Author: Omer Waseem
#### Description: This Python notebook trains and evaluates a CMM on the CoNLL 2003 and NEEL datasets.

In [1]:
from dataprep import conll_sentences, conll_words, neel_sentences, neel_words
from helper import accuracy, entity_count
from nltk import MaxentClassifier
from sklearn.metrics import precision_recall_fscore_support

## Using CoNLL 2003 Dataset

In [2]:
def get_conll_features(index, sentence, pos, chunk):
    """Build the feature dictionary for one token of a CoNLL sentence.

    Feature-name conventions:
    'w' represents word feature
    't' represents POS tag feature
    'c' represents chunk tag feature
    '-n' represents the feature of the n-th previous token
    '+n' represents the feature of the n-th following token

    Positions before the start of the sentence are padded with '<B>' and
    positions past the end of the sentence are padded with '<E>'.

    Args:
        index: position of the token inside ``sentence``; expected to lie in
            ``range(len(sentence))``.
        sentence: sequence of token strings.
        pos: POS tags, parallel to ``sentence``.
        chunk: chunk tags, parallel to ``sentence``.

    Returns:
        dict mapping each feature name to its value (str, int or bool).

    Raises:
        IndexError: if ``index`` is past the end of ``sentence``, or if
            ``pos`` / ``chunk`` are shorter than the positions looked up.
    """

    last_index = len(sentence) - 1
    word = sentence[index]
    word_lc = word.lower()
    # Several surface features are only meaningful for multi-character tokens.
    multi_char = len(word) > 1

    def neighbour(seq, offset, pad):
        """Return seq[index + offset], or ``pad`` when that falls outside the sentence."""
        position = index + offset
        return seq[position] if 0 <= position <= last_index else pad

    features = {}

    # features from current word:
    features['w'] = word
    features['t'] = pos[index]
    features['length'] = len(word)
    features['uppercase'] = any(ch.isupper() for ch in word)
    # The length guard comes first so that an empty token yields False
    # instead of raising IndexError on word[0].
    features['firstletter'] = multi_char and word[0].isupper()
    features['hasdigits'] = any(ch.isdigit() for ch in word)
    features['c'] = chunk[index]
    features['loc_flag'] = any(part in word_lc for part in ('field', 'land', 'burgh', 'shire'))
    features['hasdot'] = multi_char and '.' in word
    features['endsinns'] = multi_char and word_lc.endswith('ns')

    # features from previous 2 words ('<B>' marks positions before the sentence start)
    features['t-2 t-1'] = neighbour(pos, -2, '<B>') + ' ' + neighbour(pos, -1, '<B>')
    features['t-1'] = neighbour(pos, -1, '<B>')
    features['w-2'] = neighbour(sentence, -2, '<B>')
    features['w-1'] = neighbour(sentence, -1, '<B>')
    features['c-2 c-1'] = neighbour(chunk, -2, '<B>') + ' ' + neighbour(chunk, -1, '<B>')
    features['c-1'] = neighbour(chunk, -1, '<B>')

    # features from posterior 2 words ('<E>' marks positions past the sentence end)
    features['t+1 t+2'] = neighbour(pos, 1, '<E>') + ' ' + neighbour(pos, 2, '<E>')
    features['t+1'] = neighbour(pos, 1, '<E>')
    features['w+2'] = neighbour(sentence, 2, '<E>')
    features['w+1'] = neighbour(sentence, 1, '<E>')

    return features

### Get CoNLL training data from file and extract sentences with corresponding tags

In [3]:
# Load the CoNLL 2003 training split. conll_sentences returns four values that
# the next cell consumes as parallel per-sentence sequences: tokens, POS tags,
# chunk tags and entity labels.
c_train_file = './datasets/CoNLL2003/eng.train'
c_train_sent, c_train_pos, c_train_chunk, c_train_entity = conll_sentences(c_train_file)

### Build the training feature set: one (features, label) pair per token of every sentence

In [4]:
# Flatten every training sentence into per-token (feature dict, entity label)
# pairs, the input format expected by the classifier trained below.
c_train_data = []
for sent, pos, chunk, entity in zip(c_train_sent, c_train_pos, c_train_chunk, c_train_entity):
    # The four per-sentence sequences must line up token for token.
    if not (len(sent) == len(pos) == len(chunk) == len(entity)):
        raise ValueError('error: CoNLL train length mismatch')
    c_train_data.extend(
        (get_conll_features(i, sent, pos, chunk), ent)
        for i, ent in enumerate(entity)
    )

### Train CMM using NLTK classifier and CoNLL training data

In [5]:
# Fit the maximum-entropy classifier on the (feature dict, label) pairs built above.
# NOTE(review): algorithm='MEGAM' relies on the external megam binary being
# discoverable by NLTK (see nltk.config_megam) -- confirm it is installed.
memm = MaxentClassifier.train(c_train_data, algorithm='MEGAM')

Exception ignored in: <generator object find_file_iter at 0x110e9c360>
RuntimeError: generator ignored GeneratorExit


### Evaluate trained model on test data

In [6]:
c_testa_file = './datasets/CoNLL2003/eng.testa'
c_testb_file = './datasets/CoNLL2003/eng.testb'
c_testc_file = './datasets/CoNLL2003/eng.testc'

c_testa_sent, c_testa_pos, c_testa_chunk, c_testa_entity = conll_sentences(c_testa_file)
c_testb_sent, c_testb_pos, c_testb_chunk, c_testb_entity = conll_sentences(c_testb_file)
c_testc_sent, c_testc_pos, c_testc_chunk, c_testc_entity = conll_sentences(c_testc_file)


def _classify_conll_split(classifier, split_name, sentences, pos_tags, chunk_tags, entity_tags):
    """Classify every token of one CoNLL split.

    Args:
        classifier: trained model exposing ``classify(feature_dict)``.
        split_name: split label used in the error message (e.g. 'testa').
        sentences, pos_tags, chunk_tags, entity_tags: parallel per-sentence
            sequences as returned by ``conll_sentences``.

    Returns:
        (truth, predicted): two flat, token-aligned lists of labels.

    Raises:
        ValueError: if the four sequences of a sentence differ in length.
    """
    truth, predicted = [], []
    for sent, pos, chunk, entity in zip(sentences, pos_tags, chunk_tags, entity_tags):
        if not (len(sent) == len(pos) == len(chunk) == len(entity)):
            raise ValueError('error: CoNLL {} length mismatch'.format(split_name))
        for i, ent in enumerate(entity):
            truth.append(ent)
            predicted.append(classifier.classify(get_conll_features(i, sent, pos, chunk)))
    return truth, predicted


# NOTE: the '*_truth' names keep their 'teat' spelling because the evaluation
# cells further down refer to them under exactly these names.
c_teata_truth, c_testa_pred = _classify_conll_split(
    memm, 'testa', c_testa_sent, c_testa_pos, c_testa_chunk, c_testa_entity)
c_teatb_truth, c_testb_pred = _classify_conll_split(
    memm, 'testb', c_testb_sent, c_testb_pos, c_testb_chunk, c_testb_entity)
c_teatc_truth, c_testc_pred = _classify_conll_split(
    memm, 'testc', c_testc_sent, c_testc_pos, c_testc_chunk, c_testc_entity)

### CoNLL Evaluation
#### testa

In [8]:
# Token-level accuracy on testa: gold labels vs. predicted labels.
accuracy(c_teata_truth, c_testa_pred)

accuracy = 48302 / 51362 = 0.940423


In [9]:
# Per-class (precision, recall, F-score, support) arrays for testa.
precision_recall_fscore_support(c_teata_truth, c_testa_pred)

(array([ 0.70308672,  0.76169265,  0.98155966,  0.64755392,  0.77272727]),
 array([ 0.68529131,  0.53943218,  0.98966299,  0.58843212,  0.83677358]),
 array([ 0.69407497,  0.63157895,  0.98559467,  0.61657901,  0.80347614]),
 array([ 2094,  1268, 42759,  2092,  3149]))

#### testb

In [10]:
# Token-level accuracy on testb: gold labels vs. predicted labels.
accuracy(c_teatb_truth, c_testb_pred)

accuracy = 43173 / 46435 = 0.929751


In [11]:
# Per-class (precision, recall, F-score, support) arrays for testb.
precision_recall_fscore_support(c_teatb_truth, c_testb_pred)

(array([ 0.68520408,  0.66752911,  0.98136776,  0.65479332,  0.71844353]),
 array([ 0.69766234,  0.5620915 ,  0.97993372,  0.59655449,  0.81896863]),
 array([ 0.69137709,  0.61028977,  0.98065022,  0.62431866,  0.76541962]),
 array([ 1925,   918, 38323,  2496,  2773]))

#### testc

In [12]:
# Token-level accuracy on testc: gold labels vs. predicted labels.
accuracy(c_teatc_truth, c_testc_pred)

accuracy = 33 / 35 = 0.942857


In [13]:
# Per-class (precision, recall, F-score, support) arrays for testc.
# NOTE(review): the stored output reports only 35 tokens for this split and a
# class with support 0, which is presumably why sklearn emits
# undefined-metric warnings here -- confirm on re-run.
precision_recall_fscore_support(c_teatc_truth, c_testc_pred)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(array([ 0.        ,  0.        ,  1.        ,  0.66666667,  1.        ]),
 array([ 0.,  0.,  1.,  1.,  1.]),
 array([ 0. ,  0. ,  1. ,  0.8,  1. ]),
 array([ 0,  2, 29,  2,  2]))

## Using NEEL 2006 Dataset

In [15]:
def get_neel_features(index, sentence):
    """Build the feature dictionary for one token of a NEEL sentence.

    Feature-name conventions:
    'w' represents word feature
    '-n' represents the feature of the n-th previous token
    '+n' represents the feature of the n-th following token

    Positions before the start of the sentence are padded with '<B>' and
    positions past the end of the sentence are padded with '<E>'.

    Args:
        index: position of the token inside ``sentence``; expected to lie in
            ``range(len(sentence))``.
        sentence: sequence of token strings.

    Returns:
        dict mapping each feature name to its value (str, int or bool).

    Raises:
        IndexError: if ``index`` is past the end of ``sentence``.
    """

    last_index = len(sentence) - 1
    word = sentence[index]
    word_lc = word.lower()
    # Several surface features are only meaningful for multi-character tokens.
    multi_char = len(word) > 1

    def neighbour(offset, pad):
        """Return sentence[index + offset], or ``pad`` when outside the sentence."""
        position = index + offset
        return sentence[position] if 0 <= position <= last_index else pad

    features = {}

    # features from current word:
    features['w'] = word
    features['length'] = len(word)
    features['uppercase'] = any(ch.isupper() for ch in word)
    # The length guard comes first so that an empty token yields False
    # instead of raising IndexError on word[0].
    features['firstletter'] = multi_char and word[0].isupper()
    features['hasdigits'] = any(ch.isdigit() for ch in word)
    features['loc_flag'] = any(part in word_lc for part in ('field', 'land', 'burgh', 'shire'))
    features['hasdot'] = multi_char and '.' in word
    features['endsinns'] = multi_char and word_lc.endswith('ns')

    # features from previous 2 words ('<B>' marks positions before the sentence start)
    features['w-2'] = neighbour(-2, '<B>')
    features['w-1'] = neighbour(-1, '<B>')

    # features from posterior 2 words ('<E>' marks positions past the sentence end)
    features['w+2'] = neighbour(2, '<E>')
    features['w+1'] = neighbour(1, '<E>')

    return features

### Get NEEL training data from files and extract sentences with corresponding tags

In [18]:
# Load the NEEL training split from its gold-standard and tweet files.
n_train_gs_file = './datasets/NEEL2006/training_neel.gs'
n_train_tsv_file = './datasets/NEEL2006/training.tsv'
n_train_sent, n_train_ent, n_train_err = neel_sentences(n_train_gs_file, n_train_tsv_file)

# Flatten every sentence into per-token (feature dict, entity label) pairs.
n_train_data = []
for sent, entity in zip(n_train_sent, n_train_ent):
    # Tokens and labels must line up one to one.
    if not len(sent) == len(entity):
        raise ValueError('error: NEEL train length mismatch')
    n_train_data.extend(
        (get_neel_features(i, sent), ent)
        for i, ent in enumerate(entity)
    )

### Train CMM on NEEL training data

In [19]:
# Retrain on the NEEL features. This rebinds `memm`, so the CoNLL-trained model
# from the earlier training cell is no longer reachable after this cell runs.
memm = MaxentClassifier.train(n_train_data, algorithm='MEGAM')

### Evaluate trained model on NEEL test data

In [21]:
# Load the NEEL test split from its gold-standard and tweet files.
n_test_gs_file = './datasets/NEEL2006/test_neel.gs'
n_test_tsv_file = './datasets/NEEL2006/test.tsv'
n_test_sent, n_test_ent, n_test_err = neel_sentences(n_test_gs_file, n_test_tsv_file)

# Classify every test token and collect flat, token-aligned label lists.
n_test_truth = []
n_test_pred = []
for sent, entity in zip(n_test_sent, n_test_ent):
    if len(sent) != len(entity):
        raise ValueError('error: NEEL test length mismatch')
    for i, ent in enumerate(entity):
        n_test_truth.append(ent)
        # The model was trained on get_neel_features output, so prediction must
        # use the same extractor. The CoNLL extractor would also read `pos` and
        # `chunk` left over from an unrelated earlier loop.
        pred = memm.classify(get_neel_features(i, sent))
        n_test_pred.append(pred)

In [22]:
# Token-level accuracy on the NEEL test split: gold labels vs. predicted labels.
accuracy(n_test_truth, n_test_pred)

accuracy = 4665 / 5408 = 0.862611


In [23]:
# Per-class (precision, recall, F-score, support) arrays for the NEEL test split.
# NOTE(review): matching the stored support counts against the entity_count
# output below suggests the class order is LOC, MISC, O, ORG, PER -- confirm.
precision_recall_fscore_support(n_test_truth, n_test_pred)

(array([ 0.66666667,  0.49094567,  0.913339  ,  0.36363636,  0.60555556]),
 array([ 0.21621622,  0.50413223,  0.98601238,  0.02739726,  0.28684211]),
 array([ 0.32653061,  0.49745158,  0.94828537,  0.05095541,  0.38928571]),
 array([  37,  484, 4361,  146,  380]))

In [25]:
# Per-label token counts in the NEEL gold labels.
entity_count(n_test_truth)

ORG: 146
PER: 380
LOC: 37
MISC: 484
O: 4361
