# CRF + expert.ai Edge NL API for named entity recognition


---

## Data Preparation
The CoNLL-2003 corpus is downloaded and prepared for the training phase.

### Methods for processing CoNLL corpus

In [1]:
# Base URL of the raw CoNLL-2003 files; file names such as 'train.txt' are
# appended to it. The path embeds a fixed commit hash of the repository.
CONLL_URL_ROOT = "https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/"

In [2]:

import os
import re
import urllib
import pandas as pd
from math import nan
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report


In [3]:
def load_conll_data(filename, url_root=CONLL_URL_ROOT,
                    only_tokens=False):
    """Download a CoNLL-2003 file and split it into sentences and tag sequences.

    Each non-blank line holds whitespace-separated columns (word, POS,
    syntactic tag, entity tag); the last column is taken as the label.
    Sentences are separated by blank lines. Rows recognised as document
    separators by ``is_real_sentence`` are discarded.

    Params:
        filename: name of the file, appended to ``url_root``;
        url_root: base URL of the raw .txt files;
        only_tokens: if True each sentence is a list of words (first
            column); otherwise a list of tuples holding every column
            except the label.
    Return:
        X: list of sentences;
        Y: list of label lists, aligned with ``X``;
        output_labels: set of every label met in the file (labels of
            discarded separator rows included).
    """
    lines = list(read_raw_conll(url_root, filename))
    # A trailing blank line guarantees that the last sentence is stored even
    # when the file does not end with a separator line.
    lines.append("\n")

    X = []
    Y = []
    sentence = []
    labels = []
    output_labels = set()
    for line in lines:
        features = line.split()
        if not features:
            # Blank line ("\n", "\r\n" or whitespace only): sentence boundary.
            if len(sentence) != len(labels):
                print(f"Error: we have {len(sentence)} words but {len(labels)} labels")
            if sentence and is_real_sentence(only_tokens, sentence):
                X.append(sentence)
                Y.append(labels)
            sentence = []
            labels = []
            continue

        # The entity tag is the last column of the row.
        tag = features.pop()
        labels.append(tag)
        output_labels.add(tag)
        if only_tokens:
            sentence.append(features.pop(0))
        else:
            sentence.append(tuple(features))

    print(f"Read {len(X)} sentences")
    if len(X) != len(Y):
        print("ERROR in reading data.")
    return X, Y, output_labels

In [4]:
def read_raw_conll(url_root, filename):
    """Fetch a CoNLL-03 file and return its lines minus the first two.

    The address is the plain concatenation of ``url_root`` and ``filename``.
    """
    every_line = open_read_from_url(url_root + filename)
    # The first two lines are always skipped (presumably the file header --
    # verify against the data files).
    return every_line[2:]

In [5]:
def open_read_from_url(url):
    """Download a text file and return its lines.

    Params:
        url: address of a UTF-8 encoded .txt file;
    Return:
        list of the file rows as strings, line terminators included.
    """
    # `import urllib` alone does not load the `request` submodule, so it is
    # imported explicitly here.
    import urllib.request

    print(f"Read file from {url}")
    # The context manager closes the connection even if decoding fails.
    with urllib.request.urlopen(url) as response:
        return [raw_line.decode("utf-8") for raw_line in response]

In [6]:
def is_real_sentence(only_token, sentence):
    """Check whether a sentence is a real sentence or a document separator.

    Params:
        only_token: True when ``sentence`` is a list of words, False when
            it is a list of tuples whose first item is the word;
        sentence: non-empty sentence, in one of the two layouts above;
    Return:
        False if the first word is '-DOCSTART-' or contains a long run of
        dashes, True otherwise.
    """
    head = sentence[0]
    if not only_token:
        # Tuple layout: the word is the first column of the row.
        head = head[0]

    is_separator = '---------------------' in head or head == '-DOCSTART-'
    return not is_separator

### Data loading

In [7]:
# Load the three splits as plain token lists (only_tokens=True) with their
# entity tags; the label set is kept from the training split only.
raw_train, y_train, output_labels = load_conll_data('train.txt', only_tokens=True)
raw_valid, y_valid, _ = load_conll_data('valid.txt', only_tokens=True)
raw_test, y_test, _ = load_conll_data('test.txt', only_tokens=True)

Read file from https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/train.txt
Read 14028 sentences
Read file from https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/valid.txt
Read 3250 sentences
Read file from https://raw.githubusercontent.com/nluninja/nlp_datasets/be9fd23409f1443790f6e1eab91d28b105769368/conll2003/data/test.txt
Read 3453 sentences


In [8]:
# Show the first training sentence next to its entity tags.
print(raw_train[0])
print(y_train[0])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


---

## Feature generation with edge NL API

In [9]:
import os
from getpass import getpass

# Credentials must never be stored in the notebook: use the values already
# present in the environment, otherwise ask for them interactively (the
# password is not echoed). Any password previously committed with this
# notebook should be considered compromised and rotated.
if not os.environ.get("EAI_USERNAME"):
    os.environ["EAI_USERNAME"] = input("expert.ai username: ")
if not os.environ.get("EAI_PASSWORD"):
    os.environ["EAI_PASSWORD"] = getpass("expert.ai password: ")

In [10]:
from expertai.nlapi.edge.client import ExpertAiClient
# NOTE(review): the client is created right after EAI_USERNAME / EAI_PASSWORD
# are set, so it presumably reads its credentials from them -- confirm.
client = ExpertAiClient()

### Methods for performing tokenization and feature generation

In [11]:
from tqdm import tqdm, trange

In [12]:
def tokens_to_docs(raw, eai):
    """Analyze tokenized sentences with expertai

    Every sentence is rebuilt as a single string, joining its tokens with
    one space, and then passed to ``eai.full_analysis``. A progress bar is
    shown while the sentences are processed.

    Params:
        raw: list of sentences, each one a list of tokens
        eai: object exposing a ``full_analysis(text)`` method
    Return:
        docs: list with the analysis result of each sentence, in order
    """
    return [eai.full_analysis(' '.join(tokens)) for tokens in tqdm(raw)]

In [13]:
def _get_label(doc, syncon):
    """Extract the knowledge label of a syncon in a document, if any"""
    label = ''
    if hasattr(doc, 'knowledge'):
        for element in doc.knowledge:
            if element.syncon == syncon:
                label = element.label
                break
        if label and '.' in label:
            label = label.split('.')[-1]
    return label

In [14]:
def features_from_docs(sentences, docs):
    """Extract token features from expertai docs

    Given a list of tokenized sentences and the related expertai docs,
    create a dictionary for each original token with these entries:
        * word: the original token text;
        * pos: PoS tag;
        * dep: dependency label;
        * syncon;
        * ancestor: always -1 here;
        * label: knowledge label of the syncon (see ``_get_label``);
        * typeclass: the expertai token type split on '.';
    Tokens that cannot be aligned with any expertai token get the
    placeholder returned by ``_voidtoken``.

    Params:
        sentences: list of sentences, that are lists of strings;
        docs: list of expertai Document, one per sentence;
    Returns:
        eai_sents: list of sentences features, that are lists of dictionaries;
    """
    eai_sents = []
    for sent_idx in trange(len(sentences)):
        doc = docs[sent_idx]
        content = doc.content
        seek = 0    # Index of the part of the sentence string already read
        eai_tokenlist = []
        for token in sentences[sent_idx]:
            # Token text and boundary indexes in doc.content
            index_start = content.find(token, seek)
            if index_start == -1:
                # The text is not in the analysed content: there are no
                # offsets to align with, and `seek` must not be rewound.
                print('ERROR: expertai tokenization not found for token', token)
                eai_tokenlist.append(_voidtoken())
                continue
            index_end = index_start + len(token)

            # An eai Token that contains (part of) the chunk of text is a
            # candidate for the corresponding Token.
            possible_tokens = [
                t for t in doc.tokens
                if (t.start <= index_start and t.end >= index_end)
                or (index_start <= t.start <= index_end)
                or (index_start <= t.end <= index_end)
            ]
            if not possible_tokens:
                print('ERROR: expertai tokenization not found for token', token)
                eai_tokenlist.append(_voidtoken())
            else:
                # Among the candidates keep the one with the highest syncon
                # (the first such candidate when there are ties).
                best = max(possible_tokens, key=lambda t: t.syncon)
                eai_tokenlist.append({
                    'word': token,
                    'pos': best.pos,
                    'syncon': best.syncon,
                    'ancestor': -1,
                    'label': _get_label(doc, best.syncon),
                    'dep': best.dependency.label,
                    'typeclass': best.type_.split('.')
                })

            # Move past the token and the spaces that follow it. The bound
            # check comes first so the index is never read out of range.
            seek = index_end
            while seek < len(content) and content[seek] == ' ':
                seek += 1
        eai_sents.append(eai_tokenlist)
    return eai_sents


In [15]:
def _context_features(token, prefix):
    """Build the features of a neighbouring token, keys starting with `prefix`."""
    word = token['word']
    return {
        f'{prefix}word.lower()': word.lower(),
        f'{prefix}word.istitle()': word.istitle(),
        f'{prefix}word.isupper()': word.isupper(),
        f'{prefix}eai.postag': token['pos'],
        f'{prefix}eai.deptag': token['dep'],
        f'{prefix}eai.labels': token['label'],
        f'{prefix}eai.typeclass': token['typeclass'],
    }


def features_from_word(sentence, idx):
    """Extract features related to a word and its neighbours

    Params:
        sentence: list of token dictionaries with the keys 'word', 'pos',
            'dep', 'syncon', 'ancestor', 'label' and 'typeclass';
        idx: position in ``sentence`` of the token to describe;
    Returns:
        dictionary with the features of the token itself, those of the
        previous token under '-1:' keys (or 'BOS' for the first token) and
        those of the next token under '+1:' keys (or 'EOS' for the last).
    """
    token = sentence[idx]
    word = token['word']

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'eai.postag': token['pos'],
        'eai.postag[:2]': token['pos'][:2],
        'eai.deptag': token['dep'],
        'eai.deptag[-2:]': token['dep'][-2:],
        # -1 marks a missing id; real ids are scaled down by 10000.
        'eai.syncon': -1 if token['syncon'] == -1 else token['syncon'] / 10000.,
        'eai.ancestor': -1 if token['ancestor'] == -1 else token['ancestor'] / 10000.,
        'eai.labels': token['label'],
        'eai.typeclass': token['typeclass'],
    }

    if idx > 0:
        features.update(_context_features(sentence[idx - 1], '-1:'))
    else:
        features['BOS'] = True

    if idx < len(sentence) - 1:
        # The '+1:' features describe the FOLLOWING token (idx + 1).
        features.update(_context_features(sentence[idx + 1], '+1:'))
    else:
        features['EOS'] = True

    return features

In [16]:
def features_from_sentence(sentence):
    """Build the feature dictionaries of a whole sentence.

    Returns a tuple with one ``features_from_word`` result per token, in
    sentence order.
    """
    per_word = []
    for position in range(len(sentence)):
        per_word.append(features_from_word(sentence, position))
    return tuple(per_word)

In [17]:
def _voidtoken():
    """Generate an empty token"""
    t = {
        'word': '',
        'pos': '',
        'syncon': -1,
        'ancestor': -1,
        'dep': '',
        'label': ''
    }
    return t

### Generate tokens and features

In [None]:
# Run the expert.ai analysis on every sentence of each split. This is slow:
# the saved progress bar shows about 5.5 sentences/s on the training split.
train_docs = tokens_to_docs(raw_train, client)
test_docs = tokens_to_docs(raw_test, client)
valid_docs = tokens_to_docs(raw_valid, client)

 40%|█████████████████████████████▉                                            | 5673/14028 [19:29<25:19,  5.50it/s]

In [None]:
# Align the original CoNLL tokens with the expert.ai tokens and collect the
# per-token attributes for each split.
train = features_from_docs(raw_train, train_docs)
test = features_from_docs(raw_test, test_docs)
valid = features_from_docs(raw_valid, valid_docs)

In [None]:
import pprint
# Inspect one training example: raw tokens, gold labels, the attributes
# extracted for each token and the raw attributes of the expert.ai tokens.
p_idx=2
print(raw_train[p_idx])
print(y_train[p_idx])
print('')
pprint.pprint(train[p_idx])
print('')
pprint.pprint([tk.__dict__ for tk in train_docs[p_idx].tokens])

#### Features Function

In [None]:
# Turn the token dictionaries of every sentence into CRF feature
# dictionaries, then show the features of the second training sentence.
X_train = [features_from_sentence(sentence) for sentence in train]
X_test = [features_from_sentence(sentence) for sentence in test]
X_valid = [features_from_sentence(sentence) for sentence in valid]
pprint.pprint(X_train[1])

---

## Training the model

In [None]:
%%time
# Reset the names in case the cell is re-run; `gs` is not used anywhere else
# in the visible notebook.
crf = None
gs = None


# c1 / c2 are the L1 / L2 regularisation coefficients of the CRF (see the
# sklearn-crfsuite CRF API reference).
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.1,
    c2 = 0.5,
    max_iterations = 800,
    all_possible_transitions = True,
    verbose = True
    )
# The validation split is passed as the dev set of the training run.
crf.fit(X_train, y_train, X_dev=X_valid, y_dev=y_valid)

---

## Model Evaluation

In [None]:
import time

def compute_prediction_latency(dataset, model, n_instances=-1):
    """Measure the average prediction time per instance.

    ``model.predict`` is called once on the whole ``dataset``; the elapsed
    ``time.process_time`` is then divided by the number of instances.

    Params:
        dataset: input passed as-is to ``model.predict``;
        model: object with a ``predict`` method;
        n_instances: divisor of the total time; with the default -1 the
            length of ``dataset`` is used;
    Return:
        total prediction time divided by the number of instances.
    """
    divisor = len(dataset) if n_instances == -1 else n_instances
    began_at = time.process_time()
    model.predict(dataset)
    elapsed = time.process_time() - began_at
    return elapsed / divisor

In [None]:
# `crf.size_` divided by one million, shown with two decimals (presumably
# the size is in bytes, i.e. the value printed is in MB -- confirm).
print('Model size: {:0.2f}M'.format(crf.size_ / 1000000))

In [None]:
# Average prediction time per test sentence, with three significant digits.
print(f'Model latency in prediction: {compute_prediction_latency(X_test, crf):.3} s')

In [None]:
datasets = [('Test Set', X_test, y_test), ('Validation Set', X_valid, y_valid)]

for title, X, Y in datasets:
    Y_pred = crf.predict(X)
    # scikit-learn's classification_report works on flat label lists and
    # rejects lists of label sequences, so both the gold and the predicted
    # tags are flattened to token level first (the 'O' tag is included).
    flat_true = [tag for sentence_tags in Y for tag in sentence_tags]
    flat_pred = [tag for sentence_tags in Y_pred for tag in sentence_tags]
    print(title)
    print(classification_report(flat_true, flat_pred, digits=3))
    print('\n')