# Named Entity Recognition (NER) using CRF

In [1]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn_crfsuite-0.5.0


In [2]:
# Download the dataset from Google Drive by file id; per the cell output it is
# saved as ner_dataset.csv in the working directory.
# NOTE(review): assumes the `gdown` CLI is already available in this environment.
!gdown "1yd4PdpjD2mgDPm1B5mLLXOSjiXme1Gxb"

Downloading...
From: https://drive.google.com/uc?id=1yd4PdpjD2mgDPm1B5mLLXOSjiXme1Gxb
To: /content/ner_dataset.csv
100% 15.2M/15.2M [00:00<00:00, 32.3MB/s]


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [4]:
# Load the token-level dataset (one row per word) with an explicit encoding,
# then preview the first rows.
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
Sentence #,1000616
Word,10
POS,0
Tag,0


In [6]:
# 'Sentence #' appears to be set only on the first token of each sentence
# (see df.head() above), so forward-filling propagates the sentence id to the
# remaining tokens. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
# NOTE(review): this also forward-fills the few missing 'Word' values with the
# preceding token - confirm that is intended.
df = df.ffill()

  df = df.fillna(method='ffill')


In [7]:
# Distinct sentences, distinct words and distinct tags, in that order.
tuple(df[col].nunique() for col in ('Sentence #', 'Word', 'Tag'))

(47959, 35177, 17)

In [8]:
# Preview the frame again after the forward fill.
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [9]:
# Number of tokens carrying each tag.
(
    df.groupby('Tag')
      .size()
      .reset_index(name='counts')
)

Unnamed: 0,Tag,counts
0,B-art,402
1,B-eve,308
2,B-geo,37644
3,B-gpe,15870
4,B-nat,201
5,B-org,20143
6,B-per,16990
7,B-tim,20333
8,I-art,297
9,I-eve,253


In [10]:
# Everything except the target column. Note that `X` is rebound to the CRF
# feature lists further down in this notebook.
X = df.drop(columns='Tag')
X.head()

Unnamed: 0,Sentence #,Word,POS
0,Sentence: 1,Thousands,NNS
1,Sentence: 1,of,IN
2,Sentence: 1,demonstrators,NNS
3,Sentence: 1,have,VBP
4,Sentence: 1,marched,VBN


In [11]:
# Sorted array of the distinct tag values.
classes = np.unique(df['Tag'].values)

In [12]:
# Convert the tag array to a plain Python list. Going through np.asarray keeps
# this cell idempotent: re-running it when `classes` is already a list no
# longer fails with "list has no attribute tolist".
classes = np.asarray(classes).tolist()
classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [13]:
# Number of distinct tags, including 'O'.
len(classes)

17

In [14]:
# Entity tags only: drop the 'O' (outside) tag so that the evaluation below
# covers named-entity labels. Filtering by value instead of popping the last
# element does not rely on 'O' happening to sort last.
new_classes = [tag for tag in classes if tag != 'O']
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

### Conditional Random Fields (CRFs)

#### Get sentences

In [15]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of (word, POS, tag) tuples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Sentence #', 'Word', 'POS' and 'Tag'.

    Attributes
    ----------
    grouped : pandas.Series
        Indexed by the distinct 'Sentence #' values; each value is that
        sentence's list of (word, POS, tag) tuples.
    sentences : list
        The values of ``grouped`` in groupby order. groupby sorts its keys by
        default, so this is the string order of 'Sentence #', not numeric order.
    n_sent : int
        1-based number of the sentence the next ``get_next`` call looks up.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False  # kept for backward compatibility; not read in this class
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                           s['POS'].values.tolist(),
                                                           s['Tag'].values.tolist())]
        # Select the payload columns explicitly so the grouping column is not
        # handed to apply(); this avoids the pandas deprecation warning about
        # apply operating on grouping columns and works on older pandas too.
        self.grouped = self.data.groupby('Sentence #')[['Word', 'POS', 'Tag']].apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled 'Sentence: <n_sent>' and advance the counter.

        Returns None (without advancing) when no sentence has that label.
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [16]:
# Group the whole token table into sentences.
getter = SentenceGetter(df)

  self.grouped = self.data.groupby('Sentence #').apply(agg_func)


In [17]:
# Fetch the first sentence ('Sentence: 1') as a list of (word, POS, tag) tuples.
sent = getter.get_next()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [18]:
# All sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences

#### Features extraction

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to the sklearn-crfsuite format: each sentence is converted to a list of feature dicts, one per token.

In [19]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first two items are the word and its
    POS tag. The result describes the token itself (lower-cased form, 2- and
    3-character suffixes, case/digit flags, POS tag and its 2-character
    prefix) plus word/POS features of the previous and next tokens. When a
    neighbour does not exist, 'BOS' / 'EOS' is set to True instead.
    """
    token = sent[i][0]
    pos = sent[i][1]

    # Features of the token itself.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos
    feats['postag[:2]'] = pos[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        prev_pos = sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        next_pos = sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the tag (third item) of every (word, POS, tag) triple in ``sent``."""
    tags = []
    for _word, _pos, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the word (first item) of every (word, POS, tag) triple in ``sent``."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

Build the feature and label sequences for every sentence, then split them into train and test sets.

In [20]:
# One list of feature dicts and one list of tags per sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [21]:
# Hold out 33% of the sentences for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [22]:
# Configure the CRF and train it on the training sentences.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    c1=0.1,
    c2=0.1,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

In [23]:
# Predict tags for the held-out sentences and score them with a weighted F1
# restricted to the labels in `new_classes`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=new_classes,
    average='weighted',
)

0.8514704483847342

In [24]:
# Per-tag precision / recall / F1 on the held-out sentences, limited to the
# labels in `new_classes`.
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

              precision    recall  f1-score   support

       B-art       0.46      0.13      0.20       143
       B-eve       0.53      0.38      0.44       106
       B-geo       0.86      0.91      0.88     12447
       B-gpe       0.97      0.94      0.95      5284
       B-nat       0.80      0.42      0.55        78
       B-org       0.80      0.73      0.77      6615
       B-per       0.85      0.83      0.84      5652
       B-tim       0.93      0.88      0.90      6856
       I-art       0.11      0.03      0.05       105
       I-eve       0.36      0.26      0.30        93
       I-geo       0.82      0.80      0.81      2520
       I-gpe       0.91      0.62      0.74        69
       I-nat       1.00      0.43      0.61        23
       I-org       0.81      0.80      0.81      5597
       I-per       0.85      0.90      0.87      5674
       I-tim       0.84      0.75      0.79      2207

   micro avg       0.86      0.85      0.85     53469
   macro avg       0.74   

In [None]:
# Show the first sentence as (word, POS, tag) triples.
print(sentences[0])

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [25]:
# Tag a single sentence with predict_single and compare against its gold tags.
# NOTE(review): sentences[0] is drawn from the full corpus that was split into
# train/test above, so it may have been seen during training - this is a
# sanity check, not a held-out evaluation.
sample = sentences[0]

result = crf.predict_single(sent2features(sample))

print("Test Sample:", sample)
print("Prediction Result:", result)
print("Actual Result:", sent2labels(sample))

Test Sample: [('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]
Prediction Result: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
Actual Result: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [27]:
# Same check through the batch API: predict() takes a list of sentences, so the
# single sentence is wrapped in a list and the result is nested one level deep
# (see the printed output).
result = crf.predict([sent2features(sample)])

print("Test Sample:", sample)
print("Prediction Result:", result)
print("Actual Result:", sent2labels(sample))

Test Sample: [('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]
Prediction Result: [['O' 'O' 'O' 'O' 'O' 'O' 'B-geo' 'O' 'O' 'O' 'O' 'O' 'B-geo' 'O' 'O' 'O'
  'O' 'O' 'B-gpe' 'O' 'O' 'O' 'O' 'O']]
Actual Result: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [29]:
# A hand-written sentence as (word, POS, tag) triples, plus its plain-text form
# for display. The tags serve as the expected result to compare against.
test_sample = [('The', 'DT', 'O'), ('president', 'NN', 'O'), ('of', 'IN', 'O'), ('France', 'NNP', 'B-geo'), ('visited', 'VBD', 'O'), ('the', 'DT', 'O'), ('Eiffel', 'NNP', 'B-geo'), ('Tower', 'NNP', 'I-geo'), ('on', 'IN', 'O'), ('Saturday', 'NNP', 'B-tim'), ('to', 'TO', 'O'), ('attend', 'VB', 'O'), ('a', 'DT', 'O'), ('ceremony', 'NN', 'O'), ('celebrating', 'VBG', 'O'), ('the', 'DT', 'O'), ('city’s', 'NN', 'O'), ('cultural', 'JJ', 'O'), ('heritage', 'NN', 'O'), ('.', '.', 'O')]
test_sentences = "The president of France visited the Eiffel Tower on Saturday to attend a ceremony celebrating the city’s cultural heritage."

# sent2features only reads the word and POS tag, so the tags above do not
# influence the prediction.
result = crf.predict([sent2features(test_sample)])
print("Test Sample:", test_sample)
print("Test Sentences: ", test_sentences)
print("Prediction Result:", result)
print("Actual Result:", sent2labels(test_sample))

Test Sample: [('The', 'DT', 'O'), ('president', 'NN', 'O'), ('of', 'IN', 'O'), ('France', 'NNP', 'B-geo'), ('visited', 'VBD', 'O'), ('the', 'DT', 'O'), ('Eiffel', 'NNP', 'B-geo'), ('Tower', 'NNP', 'I-geo'), ('on', 'IN', 'O'), ('Saturday', 'NNP', 'B-tim'), ('to', 'TO', 'O'), ('attend', 'VB', 'O'), ('a', 'DT', 'O'), ('ceremony', 'NN', 'O'), ('celebrating', 'VBG', 'O'), ('the', 'DT', 'O'), ('city’s', 'NN', 'O'), ('cultural', 'JJ', 'O'), ('heritage', 'NN', 'O'), ('.', '.', 'O')]
Test Sentences:  The president of France visited the Eiffel Tower on Saturday to attend a ceremony celebrating the city’s cultural heritage.
Prediction Result: [['O' 'O' 'O' 'B-geo' 'O' 'O' 'B-org' 'I-org' 'O' 'B-tim' 'O' 'O' 'O' 'O'
  'O' 'O' 'O' 'O' 'O' 'O']]
Actual Result: ['O', 'O', 'O', 'B-geo', 'O', 'O', 'B-geo', 'I-geo', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [33]:
# Same sentence with the tag slot blanked out, to confirm that the prediction
# does not depend on the tags in the input triples.
test_sample2 = [(word, pos, '') for word, pos, _label in test_sample]

test_sentences = "The president of France visited the Eiffel Tower on Saturday to attend a ceremony celebrating the city’s cultural heritage."

result = crf.predict([sent2features(test_sample2)])
print("Test Sample:", test_sample2)
print("Test Sentences: ", test_sentences)
print("Prediction Result:", result)
# The expected tags still come from the original, labelled sample.
print("Actual Result:", sent2labels(test_sample))

Test Sample: [('The', 'DT', ''), ('president', 'NN', ''), ('of', 'IN', ''), ('France', 'NNP', ''), ('visited', 'VBD', ''), ('the', 'DT', ''), ('Eiffel', 'NNP', ''), ('Tower', 'NNP', ''), ('on', 'IN', ''), ('Saturday', 'NNP', ''), ('to', 'TO', ''), ('attend', 'VB', ''), ('a', 'DT', ''), ('ceremony', 'NN', ''), ('celebrating', 'VBG', ''), ('the', 'DT', ''), ('city’s', 'NN', ''), ('cultural', 'JJ', ''), ('heritage', 'NN', ''), ('.', '.', '')]
Test Sentences:  The president of France visited the Eiffel Tower on Saturday to attend a ceremony celebrating the city’s cultural heritage.
Prediction Result: [['O' 'O' 'O' 'B-geo' 'O' 'O' 'B-org' 'I-org' 'O' 'B-tim' 'O' 'O' 'O' 'O'
  'O' 'O' 'O' 'O' 'O' 'O']]
Actual Result: ['O', 'O', 'O', 'B-geo', 'O', 'O', 'B-geo', 'I-geo', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [34]:
# Same sentence with both the POS tag and the tag slot blanked out, to see how
# much the prediction relies on the POS features.
test_sample2 = [(word, '', '') for word, _pos, _label in test_sample]

test_sentences = "The president of France visited the Eiffel Tower on Saturday to attend a ceremony celebrating the city’s cultural heritage."

result = crf.predict([sent2features(test_sample2)])
print("Test Sample:", test_sample2)
print("Test Sentences: ", test_sentences)
print("Prediction Result:", result)
# The expected tags still come from the original, labelled sample.
print("Actual Result:", sent2labels(test_sample))

Test Sample: [('The', '', ''), ('president', '', ''), ('of', '', ''), ('France', '', ''), ('visited', '', ''), ('the', '', ''), ('Eiffel', '', ''), ('Tower', '', ''), ('on', '', ''), ('Saturday', '', ''), ('to', '', ''), ('attend', '', ''), ('a', '', ''), ('ceremony', '', ''), ('celebrating', '', ''), ('the', '', ''), ('city’s', '', ''), ('cultural', '', ''), ('heritage', '', ''), ('.', '', '')]
Test Sentences:  The president of France visited the Eiffel Tower on Saturday to attend a ceremony celebrating the city’s cultural heritage.
Prediction Result: [['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B-tim' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
  'O' 'O' 'O']]
Actual Result: ['O', 'O', 'O', 'B-geo', 'O', 'O', 'B-geo', 'I-geo', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
