# Lab 03 - NLP - Named Entity Recognition

In [14]:
!pip install python-crfsuite
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from nltk.corpus.reader import ConllChunkCorpusReader
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.scheme import IOB1
import codecs

You should consider upgrading via the '/home/diptesh/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


## Reading Train/Test Data

In [15]:
mycorpus = ConllChunkCorpusReader(r"Twitter-NER/", r".*\.train",chunk_types="pos")
train_corpus = []
for tree in mycorpus.tagged_sents():
    train_corpus.append(tree)
    
print(len(train_corpus))
# print(train_corpus[1])

2394


In [16]:
mycorpus = ConllChunkCorpusReader(r"Twitter-NER/", r".*\.test",chunk_types="pos")
test_corpus = []
for tree in mycorpus.tagged_sents():
    test_corpus.append(tree)
print(len(test_corpus))
# print(test_corpus[1])

3850


## Features
Next, define some features. In this example we use word identity, word suffix, word shape; also, some information from nearby words is used.

In [17]:
#Every word is represented by a set of features. CRF allows us to give any arbitrary set of features
def word2features(sent, i):
    word = sent[i][0]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        word1 = sent[i-1][0]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('EOS')
                
    return features

def word2featuresTest(sent, i):
    word = sent[i]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        word1 = sent[i-1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('EOS')
                
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2featuresTest(sent):
    return [word2featuresTest(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [18]:
X_train = [sent2features(s) for s in train_corpus]
y_train = [sent2labels(s) for s in train_corpus]

X_test = [sent2features(s) for s in test_corpus]
y_test = [sent2labels(s) for s in test_corpus]

print(train_corpus[5][0])
print(X_train[5][0])

('RT', 'O')
['bias', 'word.lower=rt', 'word[-3:]=RT', 'word[-2:]=RT', 'word.isupper=True', 'word.istitle=False', 'word.isdigit=False', 'BOS', '+1:word.lower=@liltwist', '+1:word.istitle=False', '+1:word.isupper=False']


In [19]:
# 1%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

## Hyperparameters

In [30]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-4,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

## Training the Model

In [31]:
%%time
trainer.train('twitter-ner.crfsuite')

CPU times: user 6.83 s, sys: 37 µs, total: 6.83 s
Wall time: 6.83 s


In [32]:
tagger = pycrfsuite.Tagger()
tagger.open('twitter-ner.crfsuite')

<contextlib.closing at 0x7f7583d75be0>

## Obtaining Predictions

In [33]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 327 ms, sys: 0 ns, total: 327 ms
Wall time: 326 ms


## Dumping Predictions to Disk

In [34]:
with codecs.open('crf.out.txt', 'w') as out_file:
    for every_sent,pred_label in zip(test_corpus, y_pred):
        for every_word_correct_label,predicted_label in zip(every_sent, pred_label):
            out_file.write(every_word_correct_label[0] + "\t" + every_word_correct_label[1] + "\t" + predicted_label)
            out_file.write("\n")
        out_file.write("\n")

In [35]:
def conllReader(filename, word_field=0, label_field=1, prediction_field=2):
    sentences_true_labels = []
    sentences_pred_labels = []
    true_list = []
    pred_list = []
    label_list = []
    
    with codecs.open(filename, 'r', errors='ignore', encoding='utf8') as f_in:
        for line in f_in:
            line = line.strip()
            if line:
                if line.startswith('#'):
                    continue
                label = line.split('\t')[label_field]
                pred = line.split('\t')[prediction_field]
                true_list.append( label )
                pred_list.append( pred )
                
            else:
                if len(true_list) > 0:
                    sentences_true_labels.append( true_list )
                    sentences_pred_labels.append( pred_list )
                true_list = []
                pred_list = []
        f_in.close()
        
    return sentences_true_labels, sentences_pred_labels

In [36]:
true_labels, predicted_labels = conllReader("crf.out.txt")
# predicted_labels

In [37]:
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

F1 Score is
0.25871559633027524
Classification report
              precision    recall  f1-score   support

     company       0.72      0.10      0.18       586
    facility       0.49      0.29      0.36       244
     geo-loc       0.63      0.38      0.47       768
       movie       0.33      0.04      0.06        28
 musicartist       0.11      0.01      0.01       180
       other       0.23      0.07      0.10       535
      person       0.33      0.22      0.26       466
     product       0.33      0.02      0.04       216
  sportsteam       0.00      0.00      0.00       128
      tvshow       0.00      0.00      0.00        24

   micro avg       0.48      0.18      0.26      3175
   macro avg       0.32      0.11      0.15      3175
weighted avg       0.44      0.18      0.23      3175



## Attaching PoS Tagger-based features

In [38]:
postagger = pycrfsuite.Tagger()
postagger.open('twitter-ner.crfsuite')

<contextlib.closing at 0x7f7583d751c0>

In [39]:
#Every word is represented by a set of features. CRF allows us to give any arbitrary set of features
def word2featurespos(sent, i):
    word = sent[i][0]
    postag = sent[i][2]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][2]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][2]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features

def sent2featurespos(sent, postagger):
    tagged = postagger.tag(sent2features(sent))
    sentNew = [word_nelabel+(pos, ) for word_nelabel,pos in zip(sent, tagged)]
    return [word2featurespos(sentNew, i) for i in range(len(sentNew))]

def sent2labelspos(sent):
    return [label for token, label in sent]

def sent2tokenspos(sent):
    return [token for token, label in sent]

In [41]:
for sent in train_corpus:
    print(sent)
    print("\n")
    s1 = postagger.tag(sent2features(sent))
    sentNew = [word_nelabel+(pos, ) for word_nelabel,pos in zip(sent, s1)]
    print(sentNew)
    break

[('@SammieLynnsMom', 'O'), ('@tg10781', 'O'), ('they', 'O'), ('will', 'O'), ('be', 'O'), ('all', 'O'), ('done', 'O'), ('by', 'O'), ('Sunday', 'O'), ('trust', 'O'), ('me', 'O'), ('*wink*', 'O')]


[('@SammieLynnsMom', 'O', 'O'), ('@tg10781', 'O', 'O'), ('they', 'O', 'O'), ('will', 'O', 'O'), ('be', 'O', 'O'), ('all', 'O', 'O'), ('done', 'O', 'O'), ('by', 'O', 'O'), ('Sunday', 'O', 'O'), ('trust', 'O', 'O'), ('me', 'O', 'O'), ('*wink*', 'O', 'O')]


In [46]:
X_train_pos = [sent2featurespos(s, postagger) for s in train_corpus]
y_train_pos = [sent2labelspos(s) for s in train_corpus]

X_test_pos = [sent2featurespos(s, postagger) for s in test_corpus]
y_test_pos = [sent2labelspos(s) for s in test_corpus]

# print(X_train_pos[0][0])

In [47]:
trainerpos = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train_pos, y_train_pos):
    trainerpos.append(xseq, yseq)

In [49]:
trainerpos.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [50]:
%%time
trainerpos.train('twitter-ner-pos.crfsuite')

CPU times: user 7.84 s, sys: 3.91 ms, total: 7.85 s
Wall time: 7.85 s


In [51]:
taggerpos = pycrfsuite.Tagger()
taggerpos.open('twitter-ner-pos.crfsuite')

<contextlib.closing at 0x7f758389f0d0>

In [52]:
%%time
y_pred_pos = [taggerpos.tag(xseq) for xseq in X_test_pos]

CPU times: user 490 ms, sys: 19 µs, total: 490 ms
Wall time: 489 ms


In [54]:
with codecs.open('crf.out.pos.txt', 'w') as out_file:
    for every_sent,pred_label in zip(test_corpus, y_pred_pos):
        for every_word_correct_label,predicted_label in zip(every_sent, pred_label):
            out_file.write(every_word_correct_label[0] + "\t" + every_word_correct_label[1] + "\t" + predicted_label)
            out_file.write("\n")
        out_file.write("\n")

In [60]:
true_labels, predicted_labels = conllReader("crf.out.pos.txt")

In [61]:
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

F1 Score is
0.25929325378614043
Classification report
              precision    recall  f1-score   support

     company       0.72      0.10      0.18       586
    facility       0.48      0.29      0.36       244
     geo-loc       0.63      0.38      0.47       768
       movie       0.33      0.04      0.06        28
 musicartist       0.11      0.01      0.01       180
       other       0.23      0.07      0.10       535
      person       0.33      0.22      0.27       466
     product       0.33      0.02      0.04       216
  sportsteam       0.00      0.00      0.00       128
      tvshow       0.00      0.00      0.00        24

   micro avg       0.48      0.18      0.26      3175
   macro avg       0.32      0.11      0.15      3175
weighted avg       0.44      0.18      0.24      3175



## Adding Gazatteer features

In [62]:
from nltk.corpus.reader import plaintext

In [82]:
personGazetteer = []

mycorpus = plaintext.PlaintextCorpusReader("./Twitter-NER/", "firstname.5000")
person = []
for sent in mycorpus.words():
    person.append(sent)
    
mycorpus = plaintext.PlaintextCorpusReader("./Twitter-NER/", "lastname.5000")
for sent in mycorpus.words():
    person.append(sent)

person = [name.lower() for name in person]
personGazetteer = set(person)

In [83]:
#Every word is represented by a set of features. CRF allows us to give any arbitrary set of features
def word2featuresGaz(sent, i, personGaz):
    word = sent[i][0]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'word.ispersongaz=%s' % (word.lower() in personGaz),
    ]
    if i > 0:
        word1 = sent[i-1][0]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('EOS')
                
    return features

def word2featuresTest(sent, i):
    word = sent[i]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        word1 = sent[i-1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
        ])
    else:
        features.append('EOS')
                
    return features

def sent2featuresGaz(sent, personGaz):
    return [word2featuresGaz(sent, i, personGaz) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [84]:
X_train_gaz = [sent2featuresGaz(s, personGazetteer) for s in train_corpus]
y_train_gaz = [sent2labels(s) for s in train_corpus]

X_test_gaz = [sent2featuresGaz(s, personGazetteer) for s in test_corpus]
y_test_gaz = [sent2labels(s) for s in test_corpus]

In [85]:
trainergaz = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train_gaz, y_train_gaz):
    trainergaz.append(xseq, yseq)

In [86]:
trainergaz.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 150,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [87]:
%%time
trainergaz.train('twitter-ner-gaz.crfsuite')

CPU times: user 9.7 s, sys: 3.27 ms, total: 9.7 s
Wall time: 9.7 s


In [88]:
taggergaz = pycrfsuite.Tagger()
taggergaz.open('twitter-ner-gaz.crfsuite')

<contextlib.closing at 0x7f758389fca0>

In [89]:
%%time
y_pred_gaz = [taggergaz.tag(xseq) for xseq in X_test_gaz]

CPU times: user 352 ms, sys: 0 ns, total: 352 ms
Wall time: 352 ms


In [90]:
with codecs.open('crf.out.gaz.txt', 'w') as out_file:
    for every_sent,pred_label in zip(test_corpus, y_pred_gaz):
        for every_word_correct_label,predicted_label in zip(every_sent, pred_label):
            out_file.write(every_word_correct_label[0] + "\t" + every_word_correct_label[1] + "\t" + predicted_label)
            out_file.write("\n")
        out_file.write("\n")

In [91]:
true_labels, predicted_labels = conllReader("crf.out.gaz.txt")

In [93]:
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

F1 Score is
0.2762382718743181
Classification report
              precision    recall  f1-score   support

     company       0.75      0.10      0.18       586
    facility       0.48      0.28      0.35       244
     geo-loc       0.62      0.34      0.44       768
       movie       0.14      0.04      0.06        28
 musicartist       0.12      0.01      0.01       180
       other       0.27      0.08      0.12       535
      person       0.34      0.41      0.38       466
     product       0.29      0.02      0.04       216
  sportsteam       0.00      0.00      0.00       128
      tvshow       0.11      0.04      0.06        24

   micro avg       0.45      0.20      0.28      3175
   macro avg       0.31      0.13      0.16      3175
weighted avg       0.45      0.20      0.25      3175



## Adding PoS Tag feature with the Gazetteer Feature

In [95]:
#Every word is represented by a set of features. CRF allows us to give any arbitrary set of features
def word2featuresposGaz(sent, i, personGaz):
    word = sent[i][0]
    postag = sent[i][2]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
        'word.ispersongaz=%s' % (word.lower() in personGaz),
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][2]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][2]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features

def sent2featuresposGaz(sent, personGaz, postagger):
    tagged = postagger.tag(sent2features(sent))
    sentNew = [word_nelabel+(pos, ) for word_nelabel,pos in zip(sent, tagged)]
    return [word2featuresposGaz(sentNew, i, personGaz) for i in range(len(sentNew))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [96]:
X_train_pos_gaz = [sent2featuresposGaz(s, personGazetteer, postagger) for s in train_corpus]
y_train_pos_gaz = [sent2labels(s) for s in train_corpus]

X_test_pos_gaz = [sent2featuresposGaz(s, personGazetteer,postagger) for s in test_corpus]
y_test_pos_gaz = [sent2labels(s) for s in test_corpus]

In [97]:
trainerposgaz = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train_pos_gaz, y_train_pos_gaz):
    trainerposgaz.append(xseq, yseq)

In [98]:
trainerposgaz.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 150,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [99]:
%%time
trainerposgaz.train('twitter-ner-pos-gaz.crfsuite')

CPU times: user 11.4 s, sys: 7.66 ms, total: 11.5 s
Wall time: 11.5 s


In [100]:
taggerposgaz = pycrfsuite.Tagger()
taggerposgaz.open('twitter-ner-pos-gaz.crfsuite')

<contextlib.closing at 0x7f7658067760>

In [101]:
%%time
y_pred_pos_gaz = [taggerposgaz.tag(xseq) for xseq in X_test_pos_gaz]

CPU times: user 478 ms, sys: 11.3 ms, total: 489 ms
Wall time: 488 ms


In [102]:
with codecs.open('crf.out.pos.gaz.txt', 'w') as out_file:
    for every_sent,pred_label in zip(test_corpus, y_pred_gaz):
        for every_word_correct_label,predicted_label in zip(every_sent, pred_label):
            out_file.write(every_word_correct_label[0] + "\t" + every_word_correct_label[1] + "\t" + predicted_label)
            out_file.write("\n")
        out_file.write("\n")

In [105]:
true_labels, predicted_labels = conllReader("crf.out.pos.gaz.txt")

In [106]:
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

F1 Score is
0.2762382718743181
Classification report
              precision    recall  f1-score   support

     company       0.75      0.10      0.18       586
    facility       0.48      0.28      0.35       244
     geo-loc       0.62      0.34      0.44       768
       movie       0.14      0.04      0.06        28
 musicartist       0.12      0.01      0.01       180
       other       0.27      0.08      0.12       535
      person       0.34      0.41      0.38       466
     product       0.29      0.02      0.04       216
  sportsteam       0.00      0.00      0.00       128
      tvshow       0.11      0.04      0.06        24

   micro avg       0.45      0.20      0.28      3175
   macro avg       0.31      0.13      0.16      3175
weighted avg       0.45      0.20      0.25      3175



### This finishes the demonstration for the lab. Please go through the problem statement to complete your lab assignment.
#### You are requested to submit it within the deadline for this assignment. 