# Named Entity Recognition (NER) - ISSUES/PROBLEMS

Building 3 NER classifiers to detect locations, organisations, and people respectively from short sentences using Vowpal Wabbit's Python API (pyvw).

Results -
 - The NER classifiers do not appear to generalise or to learn sentence structure well.
 - They overfit to the training data.
 - They do poorly on examples with the same sentence structure but different named entities.

In [67]:
import pyvw
import joblib
from pprint import pprint

# Vowpal Wabbit class instantiation

In [3]:
class SequenceLabeler(pyvw.SearchTask):
    '''
    Vowpal Wabbit search task that labels a sentence one token at a time.

    A sentence is a sequence of (label, word) pairs. The only feature built
    for a token is the word itself, placed in namespace 'w'.
    '''

    def __init__(self, vw, sch, num_actions):
        # The parent constructor MUST be called; it stores self.vw <- vw and
        # self.sch <- sch for later use in _run.
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)

        # Enable the automatic Hamming-loss and automatic condition-feature options.
        search_flags = sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES
        sch.set_options(search_flags)

    def _run(self, sentence):
        '''
        Predict one label per token and return the predictions as a list.

        The leading underscore is a reminder that this method is not meant to
        be called directly.
        '''
        predictions = []
        for idx, (gold_label, token) in enumerate(sentence):
            # The "with" block guarantees the example is finished properly.
            with self.vw.example({'w': [token]}) as ex:
                # Tags are 1-based; each prediction is conditioned on the two
                # preceding tags (idx and idx - 1).
                predictions.append(
                    self.sch.predict(
                        examples=ex,
                        my_tag=idx + 1,
                        oracle=gold_label,
                        condition=[(idx, 'p'), (idx - 1, 'q')],
                    )
                )
        return predictions

# Utility Functions
Functions to train models, extract entities, and evaluate test sentences

In [84]:
def train_model(training_set, num_labels=3):
    '''
    Train a single NER model on one training set and return it.

    training_set: the labelled sentences handed to the search task's learn().
    num_labels: number of distinct labels; 3 by default for the BIO encoding scheme.

    Call this once per category (location, organisation, person) to obtain
    one model for each.
    '''
    # Fresh Vowpal Wabbit instance configured for a Python-hooked search task.
    workspace = pyvw.vw(search=num_labels, search_task='hook', ring_size=1024)

    labeler = workspace.init_search_task(SequenceLabeler)
    labeler.learn(training_set)
    return labeler

def extract_entities(sentence, label_list):
    '''
    Collect the named entities marked by a label sequence.

    sentence: the tokens of the sentence, indexable by position.
    label_list: one label per token, as produced by the Vowpal Wabbit NER
        classifier. Label 1 opens an entity and each directly following
        label 2 extends it; any other label closes it. A label 2 that does
        not follow an open entity is ignored.

    Returns the entities as a list of strings, tokens joined by single spaces.
    '''
    begin_tag, inside_tag = 1, 2
    found = []
    for start, tag in enumerate(label_list):
        if tag != begin_tag:
            continue
        phrase = sentence[start]
        cursor = start + 1
        # Absorb the run of continuation labels that follows the opening token.
        while True:
            if cursor >= len(label_list) or label_list[cursor] != inside_tag:
                break
            phrase = phrase + ' ' + sentence[cursor]
            cursor += 1
        found.append(phrase)
    return found

def eval_sentences(sentences):
    '''
    Print the entities that each NER model finds in every sentence.

    sentences: iterable of strings; each one is split on whitespace into tokens.

    For each sentence the sentence itself is printed, then one line per model
    of the form "<category> - <list of entities>", then a blank line.
    Nothing is returned.

    Uses the module-level models loc_model, org_model and per_model, which
    must exist before this function is called.
    '''
    models = {'locations': loc_model, 'organisations': org_model, 'people': per_model}
    for sentence in sentences:
        print(sentence)
        # Tokenise and build the model input once per sentence: it is the same
        # for every model.
        split_sentence = sentence.split()
        # The true labels are unknown at prediction time, so 0 is a placeholder.
        vw_sentence = [(0, word) for word in split_sentence]
        for key, model in models.items():
            predicted_labels = model.predict(vw_sentence)
            predicted_entities = extract_entities(sentence=split_sentence, label_list=predicted_labels)
            # %-formatting prints the same text under Python 2 and Python 3.
            print('%s - %s' % (key, predicted_entities))
        print('')

# Load data and train models
Each dataset contains only one type of entity - one of locations, persons, or organisations

## Location data and model

In [None]:
# Load the location training set and fit the location tagger on it.
# NOTE(review): joblib.load unpickles the file, so only load .pkl files from a trusted source.
loc_train = joblib.load('conll_atis_merged_loc_train.pkl') # locations from CoNLL 2003 and ATIS
loc_model = train_model(loc_train)

In [66]:
# Show a slice of the short location sentences (at most 8 tokens) that contain a token labelled 2.
# NOTE: the comprehension emits a sentence once per token labelled 2, so a sentence
# with several such tokens appears more than once in the sample.
pprint([sentence for sentence in loc_train for label, word in sentence if label == 2 and len(sentence) <= 8][8:12])
# Label encoding: {'B-LOC': 1, 'I-LOC': 2, 'Other': 3}

[[(1, 'new'),
  (2, 'york'),
  (3, 'to'),
  (1, 'las'),
  (2, 'vegas'),
  (3, 'on'),
  (3, 'sunday'),
  (3, 'afternoon')],
 [(1, 'new'),
  (2, 'york'),
  (3, 'to'),
  (1, 'las'),
  (2, 'vegas'),
  (3, 'on'),
  (3, 'sunday'),
  (3, 'afternoon')],
 [(1, 'st'), (2, 'helens'), (3, ','), (1, 'england'), (3, '1996-08-26')],
 [(3, 'what'),
  (3, 'flights'),
  (3, 'from'),
  (1, 'denver'),
  (3, 'to'),
  (1, 'salt'),
  (2, 'lake'),
  (2, 'city')]]


## Organisation data and model

In [None]:
# Load the organisation training set and fit the organisation tagger on it.
# NOTE(review): joblib.load unpickles the file, so only load .pkl files from a trusted source.
org_train = joblib.load('conll_org_train.pkl') # organisations from CoNLL 2003
org_model = train_model(org_train)

In [99]:
# Show a slice of the short organisation sentences (at most 6 tokens) that contain a token labelled 2.
# NOTE: the comprehension emits a sentence once per token labelled 2, so a sentence
# with several such tokens appears more than once in the sample.
pprint([sentence for sentence in org_train for label, word in sentence if label == 2 and len(sentence) <= 6][12:16])
# Label encoding: {'B-ORG': 1, 'I-ORG': 2, 'Other': 3}

[[(1, 'chernomorets'),
  (2, 'novorossiisk'),
  (3, '2'),
  (1, 'rostselmash'),
  (2, 'rostov'),
  (3, '1')],
 [(3, '--'),
  (1, 'sydney'),
  (2, 'newsroom'),
  (3, '61-2'),
  (3, '9373'),
  (3, '1800')],
 [(1, 'financial'), (2, 'kathimerini')],
 [(1, 'nec'),
  (2, 'nijmegen'),
  (3, '1'),
  (1, 'psv'),
  (2, 'eindhoven'),
  (3, '4')]]


## People data and model

In [None]:
# Load the person training set and fit the person tagger on it.
# NOTE(review): joblib.load unpickles the file, so only load .pkl files from a trusted source.
per_train = joblib.load('conll_per_train.pkl') # persons from CoNLL 2003
per_model = train_model(per_train)

In [74]:
# Show a slice of the short person sentences (at most 8 tokens) that contain a token labelled 2.
# NOTE: the comprehension emits a sentence once per token labelled 2, so a sentence
# with several such tokens appears more than once in the sample.
pprint([sentence for sentence in per_train for label, word in sentence if label == 2 and len(sentence) <= 8][10:14])
# Label encoding: {'B-PER': 1, 'I-PER': 2, 'Other': 3}

[[(3, '1.'),
  (1, 'tom'),
  (2, 'pukstys'),
  (3, '('),
  (3, 'u.s.'),
  (3, ')'),
  (3, '86.82')],
 [(3, 'shelbourne'),
  (3, '-'),
  (1, 'mark'),
  (2, 'rutherford'),
  (3, '('),
  (3, '5th'),
  (3, ')')],
 [(1, 'fred'),
  (2, 'trueman'),
  (3, '('),
  (3, 'england'),
  (3, ')'),
  (3, '307'),
  (3, ','),
  (3, '67')],
 [(3, '5.'),
  (1, 'jamie'),
  (2, 'baulch'),
  (3, '('),
  (3, 'britain'),
  (3, ')'),
  (3, '45.08')]]


# Evaluate models on test sentences

 - We test the NER models on sentences that are based on training data. 
 - In each set of sentences, we modify only the named entities while keeping the sentence structure the same. 
 - Results indicate that the model is unable to generalise to new locations, people, and organisations with ease.

## Locations
In the examples below, only the location names change, but the `location` NER classifier is unable to recognise them.

In [102]:
# The first sentence comes directly from the training data; the other two swap in different locations.
# NOTE: the third sentence also inserts an extra token ('a'), so its structure is not strictly identical.
sample_sentences = ['new york to las vegas on sunday afternoon', # flight tickets
                    'chennai to mumbai on sunday afternoon',
                    'lima to ascuncion on a sunday afternoon']
eval_sentences(sample_sentences)

new york to las vegas on sunday afternoon
organisations - []
locations - ['new york', 'las vegas']
people - []

chennai to mumbai on sunday afternoon
organisations - []
locations - []
people - []

lima to ascuncion on a sunday afternoon
organisations - []
locations - []
people - []



## Organisations
In the examples below, only the organisation names change, but the `organisation` NER classifier is unable to recognise them.

In [103]:
# The first sentence comes directly from the training data; the other two use the same
# "<team> <score> <team> <score>" structure with different organisations.
sample_sentences2 = ['neuchatel 3 st gallen 0', # football clubs
                     'universidad 3 river plate 0',
                     'osasuna 3 real zaragoza 0']
eval_sentences(sample_sentences2)

neuchatel 3 st gallen 0
organisations - ['neuchatel', 'st gallen']
locations - []
people - []

universidad 3 river plate 0
organisations - []
locations - []
people - []

osasuna 3 real zaragoza 0
organisations - []
locations - []
people - []



## People
In the examples below, the person names change, but the `person` NER classifier is unable to recognise them.

In [104]:
# The first sentence is based on training data; the other three use the same structure with different persons.
sample_sentences3 = ['bill gates is the ceo', 
                     'ma huateng is the ceo', #CEO of Tencent
                     'narayana moorthy is the ceo', #CEO of Infosys
                     'elon musk is the ceo'] #CEO of Tesla
eval_sentences(sample_sentences3)

bill gates is the ceo
organisations - []
locations - []
people - ['bill gates']

ma huateng is the ceo
organisations - []
locations - []
people - []

narayana moorthy is the ceo
organisations - []
locations - []
people - []

elon musk is the ceo
organisations - []
locations - []
people - []

