# Named Entity Recognition - Baseline

In [1]:
"""
(Practical tip) Table of contents can be compiled directly in jupyter notebooks using the following code:
I set an exception: if the package is in your installation you can import it otherwise you download it 
then import it.
"""
try:
    from jyquickhelper import add_notebook_menu 
except:
    !pip install jyquickhelper
    from jyquickhelper import add_notebook_menu

In [2]:
"""
Output Table of contents to navigate easily in the notebook. 
For interested readers, the package also includes Ipython magic commands to go back to this cell
wherever you are in the notebook to look for cells faster
"""
add_notebook_menu()

## Imports

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:
import sklearn

In [5]:
#!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite
from sklearn_crfsuite import CRF, metrics

## The Dataset

A simple sentence NER example:

[**ORG** U.N. ] official [**PER** Ekeus ] heads for [**LOC** Baghdad ] 

We will concentrate on four types of named entities:
 * persons (**PER**), 
 * locations (**LOC**) 
 * organizations (**ORG**)
 * Others (**O**)

In [6]:
def _generate_examples(filepath):
        with open(filepath, encoding="utf-8") as f:
            sent = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if sent:
                        yield sent
                        sent = []
                else:
                    splits = line.split(" ")
                    token = splits[0]
                    pos_tag = splits[1]
                    ner_tag = splits[3].rstrip()
                    if 'MISC' in ner_tag:
                        ner_tag = 'O'
                    
                    sent.append((token, pos_tag, ner_tag))

In [7]:
%%time
train_sents = list(_generate_examples('NER Dataset/train.txt'))
test_sents = list(_generate_examples('NER Dataset/test.txt'))

CPU times: user 406 ms, sys: 38.2 ms, total: 445 ms
Wall time: 825 ms


In [8]:
test_sents[2]

[('United', 'NNP', 'B-LOC'),
 ('Arab', 'NNP', 'I-LOC'),
 ('Emirates', 'NNPS', 'I-LOC'),
 ('1996-12-06', 'CD', 'O')]

In [9]:
# reduced features
def reduced_word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'w':word.lower(),
    }
    
    return sent[i][0].lower()

In [10]:
# sophisticated features
def sophisticated_word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features



In [11]:
def sent2features(sent):
    return [reduced_word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [12]:
sent2features(test_sents[2])

['united', 'arab', 'emirates', '1996-12-06']

In [13]:
sent2labels(test_sents[2])

['B-LOC', 'I-LOC', 'I-LOC', 'O']

In [14]:
sent2tokens(test_sents[2])

['United', 'Arab', 'Emirates', '1996-12-06']

In [15]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 262 ms, sys: 17.5 ms, total: 279 ms
Wall time: 601 ms


In [16]:
X_test[2]

['united', 'arab', 'emirates', '1996-12-06']

In [17]:
y_test[2]

['B-LOC', 'I-LOC', 'I-LOC', 'O']

## Baseline model

In [18]:
%%time
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

try: # https://stackoverflow.com/questions/66059532/attributeerror-crf-object-has-no-attribute-keep-tempfiles
    crf.fit(X_train, y_train)
except AttributeError:
    pass

In [19]:
y_pred = crf.predict(X_test)

In [20]:
labels = list(crf.classes_)
labels

['B-ORG', 'O', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-LOC']

In [21]:
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.8055111833833511

In [22]:
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted(labels), digits=3))

              precision    recall  f1-score   support

       B-LOC      0.296     0.121     0.172      1667
       B-ORG      0.455     0.101     0.165      1660
       B-PER      0.410     0.131     0.198      1615
       I-LOC      0.163     0.027     0.047       257
       I-ORG      0.102     0.032     0.049       834
       I-PER      0.404     0.176     0.246      1156
           O      0.872     0.979     0.922     39203

    accuracy                          0.845     46392
   macro avg      0.386     0.224     0.257     46392
weighted avg      0.791     0.845     0.806     46392



In [23]:
# Generally we want the prediction without the O class. Just remove it.
labels.remove('O')
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted(labels), digits=3))

              precision    recall  f1-score   support

       B-LOC      0.296     0.121     0.172      1667
       B-ORG      0.455     0.101     0.165      1660
       B-PER      0.410     0.131     0.198      1615
       I-LOC      0.163     0.027     0.047       257
       I-ORG      0.102     0.032     0.049       834
       I-PER      0.404     0.176     0.246      1156

   micro avg      0.344     0.114     0.171      7189
   macro avg      0.305     0.098     0.146      7189
weighted avg      0.348     0.114     0.169      7189



In [24]:
# Group B and I label
y_test2 = [item if len(item)==1 else item[2:] for sublist in y_test for item in sublist]
y_pred2 = [item if len(item)==1 else item[2:] for sublist in y_pred for item in sublist]

labels2 =list(np.unique(y_test2))
labels2.remove('O')

print(sklearn.metrics.classification_report(y_test2, y_pred2, labels=labels2, digits=3))

              precision    recall  f1-score   support

         LOC      0.300     0.113     0.165      1924
         ORG      0.323     0.082     0.131      2494
         PER      0.425     0.157     0.229      2771

   micro avg      0.360     0.119     0.179      7189
   macro avg      0.350     0.117     0.175      7189
weighted avg      0.357     0.119     0.178      7189



## The CRF transition model

In [25]:
len(crf.transition_features_)

49

In [26]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-PER  -> I-PER   5.007822
B-LOC  -> I-LOC   4.390106
B-ORG  -> I-ORG   4.317037
O      -> O       3.892950
I-ORG  -> I-ORG   2.227305
B-LOC  -> O       1.825590
I-LOC  -> I-LOC   1.591971
O      -> B-PER   1.553625
B-ORG  -> O       1.524175
O      -> B-LOC   1.326683
B-PER  -> O       1.070718
O      -> B-ORG   0.819926
I-PER  -> I-PER   0.111627
I-ORG  -> O       -0.131032
I-PER  -> O       -0.406888
I-LOC  -> O       -0.462165
B-ORG  -> I-LOC   -2.774514
B-PER  -> I-LOC   -2.789020
B-ORG  -> B-ORG   -2.973535
B-ORG  -> I-PER   -3.545578

Top unlikely transitions:
I-LOC  -> I-ORG   -4.324495
I-PER  -> I-LOC   -4.382247
B-PER  -> B-LOC   -4.457050
I-ORG  -> I-PER   -4.697409
O      -> I-PER   -4.916649
B-LOC  -> B-PER   -4.993257
O      -> I-ORG   -4.995963
B-ORG  -> B-LOC   -5.014555
I-LOC  -> B-LOC   -5.167761
I-ORG  -> B-ORG   -5.280133
B-PER  -> B-PER   -5.396340
I-PER  -> I-ORG   -5.407771
I-ORG  -> B-PER   -5.440242
B-PER  -> B-ORG   -5.748373
I-LOC  -> 

In [27]:
len(crf.state_features_)

263

In [28]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
6.536070 O        :
6.103176 O        "
5.685340 O        2
5.031274 O        5
4.625815 O        ,
4.472865 O        1
4.445613 O        9
4.321843 O        4
4.297237 O        )
4.290845 O        3
4.063955 I-ORG    /
3.952660 O        0
3.901412 O        (
3.774479 O        6
3.712470 I-ORG    &
3.610679 B-ORG    0
3.593618 O        *
3.583397 O        7
3.571319 B-ORG    &
3.525086 O        8
3.412688 O        $
3.090865 O        =
2.930699 B-ORG    8
2.730002 O        /
2.664872 B-ORG    6
2.566389 B-ORG    1
2.523711 O        ?
2.305855 O        ;
2.158848 I-ORG    6
1.936472 I-ORG    (

Top negative:
-0.410378 I-ORG    h
-0.417052 I-LOC    1
-0.421024 B-LOC    v
-0.438873 O        r
-0.464333 B-LOC    x
-0.498874 O        b
-0.501938 I-LOC    d
-0.574837 O        a
-0.588805 I-LOC    )
-0.674498 I-LOC    h
-0.683466 B-LOC    -
-0.734525 I-PER    )
-0.758843 I-LOC    v
-0.784627 O        x
-1.012517 I-LOC    '
-1.155841 B-ORG    '
-1.157525 I-LOC    q
-1.216810 I-OR

In [29]:
# Another method to see the transition

#!pip install eli5

# Transition features make sense: at least model learned that I-ENITITY must follow B-ENTITY.
# It also learned that some transitions are unlikely,
# e.g. it is not common in this dataset to have a location right after an organization name
# (I-ORG -> B-LOC has a large negative weight).
import eli5

eli5.show_weights(crf, top=10)

From \ To,O,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER
O,3.893,1.327,-4.07,0.82,-4.996,1.554,-4.917
B-LOC,1.826,-3.839,4.39,-3.758,-3.876,-4.993,-3.716
I-LOC,-0.462,-5.168,1.592,-5.89,-4.324,-6.041,-4.21
B-ORG,1.524,-5.015,-2.775,-2.974,4.317,-4.118,-3.546
I-ORG,-0.131,-6.286,-3.853,-5.28,2.227,-5.44,-4.697
B-PER,1.071,-4.457,-2.789,-5.748,-3.703,-5.396,5.008
I-PER,-0.407,-7.39,-4.382,-7.491,-5.408,-7.151,0.112

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+6.536,:,,,,,
+6.103,"""",,,,,
+5.685,2,,,,,
+5.031,5,,,,,
+4.626,",",,,,,
+4.473,1,,,,,
+4.446,9,,,,,
+4.322,4,,,,,
+4.297,),,,,,
+4.291,3,,,,,

Weight?,Feature
+6.536,:
+6.103,""""
+5.685,2
+5.031,5
+4.626,","
+4.473,1
+4.446,9
+4.322,4
+4.297,)
+4.291,3

Weight?,Feature
+1.129,3
+1.111,.
+0.600,u
+0.518,a
+0.456,z
+0.383,n
… 10 more positive …,… 10 more positive …
… 10 more negative …,… 10 more negative …
-0.421,v
-0.464,x

Weight?,Feature
+1.165,&
… 8 more positive …,… 8 more positive …
… 15 more negative …,… 15 more negative …
-0.674,h
-0.759,v
-1.013,'
-1.158,q
-1.263,m
-1.471,w
-1.483,x

Weight?,Feature
+3.611,0
+3.571,&
+2.931,8
+2.665,6
+2.566,1
+1.781,4
+1.616,2
+1.434,3
+1.349,7
… 27 more positive …,… 27 more positive …

Weight?,Feature
+4.064,/
+3.712,&
+2.159,6
+1.936,(
+1.736,)
+1.728,","
+1.016,4
+0.836,1
… 19 more positive …,… 19 more positive …
… 10 more negative …,… 10 more negative …

Weight?,Feature
+1.125,j
+0.826,q
+0.402,x
+0.387,v
+0.380,1
+0.372,m
+0.280,h
… 9 more positive …,… 9 more positive …
… 11 more negative …,… 11 more negative …
-0.253,'

Weight?,Feature
+0.723,z
+0.604,h
+0.464,j
+0.429,v
+0.400,k
+0.250,l
… 12 more positive …,… 12 more positive …
… 9 more negative …,… 9 more negative …
-0.280,-
-0.298,p
