# Lab 03 - NLP - Named Entity Recognition

In [None]:
!pip install python-crfsuite
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from nltk.corpus.reader import ConllChunkCorpusReader
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.scheme import IOB1
import codecs

## Reading Train/Test Data

In [None]:
# Read every "*.train" file under Twitter-NER/ as CoNLL-style column data.
mycorpus = ConllChunkCorpusReader(
    r"Twitter-NER/",
    r".*\.train",
    chunk_types="pos",
)
# Alternative corpus (MultiCoNER Hindi):
# mycorpus = ConllChunkCorpusReader(r"../../MultiCoNER/test/HI-Hindi/", r"hi_train.conll", chunk_types="pos")

# Materialise the reader's tagged sentences into a plain list.
train_corpus = list(mycorpus.tagged_sents())

print(len(train_corpus))
# print(train_corpus[1])

In [None]:
# Read every "*.test" file under Twitter-NER/ as CoNLL-style column data.
mycorpus = ConllChunkCorpusReader(
    r"Twitter-NER/",
    r".*\.test",
    chunk_types="pos",
)
# Alternative corpus (MultiCoNER Hindi):
# mycorpus = ConllChunkCorpusReader(r"../../MultiCoNER/test/HI-Hindi/", r"hi_dev.conll", chunk_types="pos")

# Materialise the reader's tagged sentences into a plain list.
test_corpus = list(mycorpus.tagged_sents())
print(len(test_corpus))
# print(test_corpus[1])

## Features
Next, define the features. This example uses the word identity, word suffixes and word shape (capitalisation, digits), together with the same kind of information from the neighbouring words.

In [None]:
# Each token is described by a list of string features; a CRF accepts any arbitrary feature set.
def word2features(sent, i):
    """Return the feature strings for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items carry the token text at index 0
    (for example ``(token, label)`` pairs).  The result describes the token
    itself (lower-cased form, last three / last two characters, upper-case,
    title-case and digit flags) followed by the lower-cased form and the
    title/upper flags of the previous and next tokens.  ``'BOS'`` / ``'EOS'``
    stand in for the neighbour features when there is no previous / next
    token.
    """
    token = sent[i][0]
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
    ]

    # Left context (or beginning-of-sentence marker).
    if i <= 0:
        feats.append('BOS')
    else:
        left = sent[i - 1][0]
        feats += [
            '-1:word.lower=' + left.lower(),
            '-1:word.istitle=%s' % left.istitle(),
            '-1:word.isupper=%s' % left.isupper(),
        ]

    # Right context (or end-of-sentence marker).
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        right = sent[i + 1][0]
        feats += [
            '+1:word.lower=' + right.lower(),
            '+1:word.istitle=%s' % right.istitle(),
            '+1:word.isupper=%s' % right.isupper(),
        ]

    return feats

def word2featuresTest(sent, i):
    """Return the feature strings for position ``i`` of an unlabelled sentence.

    Same feature set as ``word2features``, but ``sent`` is a sequence of
    plain token strings rather than ``(token, label)`` pairs.
    """
    token = sent[i]
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
    ]

    # Left context (or beginning-of-sentence marker).
    if i <= 0:
        feats.append('BOS')
    else:
        left = sent[i - 1]
        feats += [
            '-1:word.lower=' + left.lower(),
            '-1:word.istitle=%s' % left.istitle(),
            '-1:word.isupper=%s' % left.isupper(),
        ]

    # Right context (or end-of-sentence marker).
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        right = sent[i + 1]
        feats += [
            '+1:word.lower=' + right.lower(),
            '+1:word.istitle=%s' % right.istitle(),
            '+1:word.isupper=%s' % right.isupper(),
        ]

    return feats

def sent2features(sent):
    """Return one ``word2features`` feature list per token of ``sent``, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2featuresTest(sent):
    """Return one ``word2featuresTest`` feature list per token of ``sent``, in order."""
    return [word2featuresTest(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    return [tag for _word, tag in sent]

def sent2tokens(sent):
    """Return the token of every ``(token, label)`` pair in ``sent``, in order."""
    return [word for word, _tag in sent]

In [None]:
# Per-sentence feature sequences (X) and label sequences (y) for both splits.
X_train = list(map(sent2features, train_corpus))
y_train = list(map(sent2labels, train_corpus))

X_test = list(map(sent2features, test_corpus))
y_test = list(map(sent2labels, test_corpus))

# Spot check: first token of the sixth training sentence and its features.
print(train_corpus[5][0])
print(X_train[5][0])

In [None]:
# 1%%time
# Hand every (feature sequence, label sequence) pair to a CRFsuite trainer.
trainer = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train, y_train):
    trainer.append(features, labels)

## Hyperparameters

In [None]:
# Training hyperparameters for the baseline CRF.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-4,  # coefficient for L2 penalty
    'max_iterations': 100,  # cap on training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

## Training the Model

In [None]:
%%time
# Train on the appended sequences; the model is stored in 'twitter-ner.crfsuite'.
trainer.train('twitter-ner.crfsuite')

In [None]:
# Load the baseline model file produced by trainer.train() for prediction.
tagger = pycrfsuite.Tagger()
tagger.open('twitter-ner.crfsuite')

## Obtaining Predictions

In [None]:
%%time
# One predicted label sequence per test sentence.
y_pred = [tagger.tag(xseq) for xseq in X_test]
# Bare expression so the notebook echoes the predictions.
y_pred

## Dumping Predictions to Disk

In [None]:
# Write one "token<TAB>gold<TAB>predicted" line per token and a blank line
# after every sentence.  The encoding is given explicitly because
# conllReader() reads this file back as UTF-8; without an encoding argument
# codecs.open() uses the platform default, which can fail on or mis-encode
# non-ASCII tweet text.
with codecs.open('crf.out.txt', 'w', encoding='utf8') as out_file:
    for gold_sent, pred_sent in zip(test_corpus, y_pred):
        for (token, gold_label), pred_label in zip(gold_sent, pred_sent):
            out_file.write(token + "\t" + gold_label + "\t" + pred_label + "\n")
        out_file.write("\n")

In [None]:
def conllReader(filename, word_field=0, label_field=1, prediction_field=2):
    """Read gold and predicted label sequences from a tab-separated file.

    Every non-blank line describes one token as tab-separated columns; blank
    lines separate sentences and lines starting with ``#`` are skipped.

    Args:
        filename: Path of the file to read.  It is decoded as UTF-8 and
            undecodable bytes are ignored.
        word_field: Column index of the token text.  Kept for interface
            compatibility; the token itself is not returned.
        label_field: Column index of the gold label.
        prediction_field: Column index of the predicted label.

    Returns:
        A pair ``(sentences_true_labels, sentences_pred_labels)`` of parallel
        lists holding one list of labels per sentence.

    Raises:
        IndexError: If a token line has fewer columns than a requested field.
    """
    sentences_true_labels = []
    sentences_pred_labels = []
    true_list = []
    pred_list = []

    # Built-in open() does the decoding itself, so the codecs wrapper is not
    # needed; the ``with`` block closes the file.
    with open(filename, 'r', errors='ignore', encoding='utf8') as f_in:
        for line in f_in:
            line = line.strip()

            if not line:
                # Sentence boundary: store the sentence collected so far.
                if true_list:
                    sentences_true_labels.append(true_list)
                    sentences_pred_labels.append(pred_list)
                true_list = []
                pred_list = []
                continue

            if line.startswith('#'):
                # NOTE(review): this also skips token lines whose token starts
                # with '#' (e.g. hashtags) -- confirm that is acceptable.
                continue

            fields = line.split('\t')
            label = fields[label_field]
            pred = fields[prediction_field]
            true_list.append(label)
            pred_list.append(pred)

    # A file that does not end with a blank line still has a pending
    # sentence; without this it would be silently dropped.
    if true_list:
        sentences_true_labels.append(true_list)
        sentences_pred_labels.append(pred_list)

    return sentences_true_labels, sentences_pred_labels

In [None]:
# Read the dumped file back as per-sentence gold and predicted label lists.
true_labels, predicted_labels = conllReader("crf.out.txt")
# predicted_labels

In [None]:
# f1_score and classification_report are the seqeval versions here (the
# seqeval import of classification_report shadows the earlier sklearn one).
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

# NOTE(review): seqeval appears to honour `scheme` only when mode='strict' is
# also passed; as written the report may use the default evaluation mode --
# confirm against the seqeval documentation.
print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

## Attaching PoS Tagger-based features

In [None]:
# NOTE(review): despite its name, this tagger loads 'twitter-ner.crfsuite',
# the model file written by the baseline NER trainer, so the "postag" values
# used below are that model's predicted labels rather than part-of-speech
# tags -- confirm this is intended.
postagger = pycrfsuite.Tagger()
postagger.open('twitter-ner.crfsuite')

In [None]:
# Each token is described by a list of string features; a CRF accepts any arbitrary feature set.
def word2featurespos(sent, i):
    """Return the feature strings for position ``i`` of a tag-augmented sentence.

    Items of ``sent`` carry the token text at index 0 and a tag string at
    index 2.  On top of the word-shape features, the tag and its first two
    characters are emitted for the token and for each existing neighbour;
    ``'BOS'`` / ``'EOS'`` mark a missing previous / next token.
    """
    token = sent[i][0]
    tag = sent[i][2]
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
    ]

    # Left context (or beginning-of-sentence marker).
    if i <= 0:
        feats.append('BOS')
    else:
        left = sent[i - 1][0]
        left_tag = sent[i - 1][2]
        feats += [
            '-1:word.lower=' + left.lower(),
            '-1:word.istitle=%s' % left.istitle(),
            '-1:word.isupper=%s' % left.isupper(),
            '-1:postag=' + left_tag,
            '-1:postag[:2]=' + left_tag[:2],
        ]

    # Right context (or end-of-sentence marker).
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        right = sent[i + 1][0]
        right_tag = sent[i + 1][2]
        feats += [
            '+1:word.lower=' + right.lower(),
            '+1:word.istitle=%s' % right.istitle(),
            '+1:word.isupper=%s' % right.isupper(),
            '+1:postag=' + right_tag,
            '+1:postag[:2]=' + right_tag[:2],
        ]

    return feats

def sent2featurespos(sent, postagger):
    """Return tag-augmented feature lists for every token of ``sent``.

    ``postagger`` tags the baseline features of the sentence; each predicted
    tag is appended to the matching tuple of ``sent`` before
    ``word2featurespos`` is applied to every position.
    """
    predicted_tags = postagger.tag(sent2features(sent))
    augmented = []
    for pair, tag in zip(sent, predicted_tags):
        augmented.append(pair + (tag,))
    return [word2featurespos(augmented, idx) for idx, _ in enumerate(augmented)]

def sent2labelspos(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    return [tag for _word, tag in sent]

def sent2tokenspos(sent):
    """Return the token of every ``(token, label)`` pair in ``sent``, in order."""
    return [word for word, _tag in sent]

In [None]:
# Sanity check on the first training sentence only: print it before and after
# the tagger output is appended as a third element of every tuple.
for sent in train_corpus[:1]:
    print(sent)
    print("\n")
    s1 = postagger.tag(sent2features(sent))
    sentNew = [pair + (tag,) for pair, tag in zip(sent, s1)]
    print(sentNew)

In [None]:
# Tag-augmented feature sequences and label sequences for both splits.
X_train_pos = [sent2featurespos(sentence, postagger) for sentence in train_corpus]
y_train_pos = list(map(sent2labelspos, train_corpus))

X_test_pos = [sent2featurespos(sentence, postagger) for sentence in test_corpus]
y_test_pos = list(map(sent2labelspos, test_corpus))

# print(X_train_pos[0][0])

In [None]:
# Hand every (feature sequence, label sequence) pair to a fresh trainer.
trainerpos = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train_pos, y_train_pos):
    trainerpos.append(features, labels)

In [None]:
# Training hyperparameters for the tag-augmented CRF.
# NOTE(review): c2 is 1e-3 here but 1e-4 for the baseline model, so score
# differences are not due to the added features alone.
trainerpos.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 100,  # cap on training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [None]:
%%time
# Train on the appended sequences; the model is stored in 'twitter-ner-pos.crfsuite'.
trainerpos.train('twitter-ner-pos.crfsuite')

In [None]:
# Load the tag-augmented model file produced by trainerpos.train().
taggerpos = pycrfsuite.Tagger()
taggerpos.open('twitter-ner-pos.crfsuite')

In [None]:
%%time
# One predicted label sequence per test sentence.
y_pred_pos = [taggerpos.tag(xseq) for xseq in X_test_pos]

In [None]:
# Write one "token<TAB>gold<TAB>predicted" line per token and a blank line
# after every sentence.  The encoding is given explicitly because
# conllReader() reads this file back as UTF-8; without an encoding argument
# codecs.open() uses the platform default, which can fail on or mis-encode
# non-ASCII tweet text.
with codecs.open('crf.out.pos.txt', 'w', encoding='utf8') as out_file:
    for gold_sent, pred_sent in zip(test_corpus, y_pred_pos):
        for (token, gold_label), pred_label in zip(gold_sent, pred_sent):
            out_file.write(token + "\t" + gold_label + "\t" + pred_label + "\n")
        out_file.write("\n")

In [None]:
# Read the dumped file back as per-sentence gold and predicted label lists.
true_labels, predicted_labels = conllReader("crf.out.pos.txt")

In [None]:
# f1_score and classification_report are the seqeval versions here (the
# seqeval import of classification_report shadows the earlier sklearn one).
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

# NOTE(review): seqeval appears to honour `scheme` only when mode='strict' is
# also passed; as written the report may use the default evaluation mode --
# confirm against the seqeval documentation.
print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

## Adding Gazetteer features

In [None]:
from nltk.corpus.reader import plaintext

In [None]:
# Build a lower-cased set of person-name tokens from the first-name and
# last-name lists in ./Twitter-NER/.  personGazetteer is deliberately not
# pre-initialised: if a file cannot be read, later cells fail loudly instead
# of silently running with an empty gazetteer.
person = []
for gazetteer_file in ("firstname.5000", "lastname.5000"):
    mycorpus = plaintext.PlaintextCorpusReader("./Twitter-NER/", gazetteer_file)
    # NOTE(review): words() returns the reader's tokenisation, so names
    # containing punctuation may be split into several entries -- confirm.
    person.extend(name.lower() for name in mycorpus.words())

# A set keeps the membership tests in the feature functions cheap.
personGazetteer = set(person)

In [None]:
# Each token is described by a list of string features; a CRF accepts any arbitrary feature set.
def word2featuresGaz(sent, i, personGaz):
    """Return the feature strings for position ``i`` of ``sent`` with a gazetteer flag.

    Items of ``sent`` carry the token text at index 0.  Besides the
    word-shape and neighbour features, ``word.ispersongaz`` records whether
    the lower-cased token is contained in ``personGaz``.  ``'BOS'`` / ``'EOS'``
    mark a missing previous / next token.
    """
    token = sent[i][0]
    lowered = token.lower()
    feats = [
        'bias',
        'word.lower=' + lowered,
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'word.ispersongaz=%s' % (lowered in personGaz),
    ]

    # Left context (or beginning-of-sentence marker).
    if i <= 0:
        feats.append('BOS')
    else:
        left = sent[i - 1][0]
        feats += [
            '-1:word.lower=' + left.lower(),
            '-1:word.istitle=%s' % left.istitle(),
            '-1:word.isupper=%s' % left.isupper(),
        ]

    # Right context (or end-of-sentence marker).
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        right = sent[i + 1][0]
        feats += [
            '+1:word.lower=' + right.lower(),
            '+1:word.istitle=%s' % right.istitle(),
            '+1:word.isupper=%s' % right.isupper(),
        ]

    return feats

def word2featuresTest(sent, i):
    """Return the feature strings for position ``i`` of an unlabelled sentence.

    ``sent`` is a sequence of plain token strings.  This re-defines the
    function of the same name from the "Features" section with the same
    feature set.
    """
    token = sent[i]
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
    ]

    # Left context (or beginning-of-sentence marker).
    if i <= 0:
        feats.append('BOS')
    else:
        left = sent[i - 1]
        feats += [
            '-1:word.lower=' + left.lower(),
            '-1:word.istitle=%s' % left.istitle(),
            '-1:word.isupper=%s' % left.isupper(),
        ]

    # Right context (or end-of-sentence marker).
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        right = sent[i + 1]
        feats += [
            '+1:word.lower=' + right.lower(),
            '+1:word.istitle=%s' % right.istitle(),
            '+1:word.isupper=%s' % right.isupper(),
        ]

    return feats

def sent2featuresGaz(sent, personGaz):
    """Return one ``word2featuresGaz`` feature list per token of ``sent``, in order."""
    return [word2featuresGaz(sent, position, personGaz) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    return [tag for _word, tag in sent]

def sent2tokens(sent):
    """Return the token of every ``(token, label)`` pair in ``sent``, in order."""
    return [word for word, _tag in sent]

In [None]:
# Gazetteer-augmented feature sequences and label sequences for both splits.
X_train_gaz = [sent2featuresGaz(sentence, personGazetteer) for sentence in train_corpus]
y_train_gaz = list(map(sent2labels, train_corpus))

X_test_gaz = [sent2featuresGaz(sentence, personGazetteer) for sentence in test_corpus]
y_test_gaz = list(map(sent2labels, test_corpus))

In [None]:
# Hand every (feature sequence, label sequence) pair to a fresh trainer.
trainergaz = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train_gaz, y_train_gaz):
    trainergaz.append(features, labels)

In [None]:
# Training hyperparameters for the gazetteer CRF.
# NOTE(review): c2 (1e-3) and max_iterations (150) differ from the baseline
# model (1e-4, 100), so score differences are not due to the added features
# alone.
trainergaz.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 150,  # cap on training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [None]:
%%time
# Train on the appended sequences; the model is stored in 'twitter-ner-gaz.crfsuite'.
trainergaz.train('twitter-ner-gaz.crfsuite')

In [None]:
# Load the gazetteer model file produced by trainergaz.train().
taggergaz = pycrfsuite.Tagger()
taggergaz.open('twitter-ner-gaz.crfsuite')

In [None]:
%%time
# One predicted label sequence per test sentence.
y_pred_gaz = [taggergaz.tag(xseq) for xseq in X_test_gaz]

In [None]:
# Write one "token<TAB>gold<TAB>predicted" line per token and a blank line
# after every sentence.  The encoding is given explicitly because
# conllReader() reads this file back as UTF-8; without an encoding argument
# codecs.open() uses the platform default, which can fail on or mis-encode
# non-ASCII tweet text.
with codecs.open('crf.out.gaz.txt', 'w', encoding='utf8') as out_file:
    for gold_sent, pred_sent in zip(test_corpus, y_pred_gaz):
        for (token, gold_label), pred_label in zip(gold_sent, pred_sent):
            out_file.write(token + "\t" + gold_label + "\t" + pred_label + "\n")
        out_file.write("\n")

In [None]:
# Read the dumped file back as per-sentence gold and predicted label lists.
true_labels, predicted_labels = conllReader("crf.out.gaz.txt")

In [None]:
# f1_score and classification_report are the seqeval versions here (the
# seqeval import of classification_report shadows the earlier sklearn one).
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

# NOTE(review): seqeval appears to honour `scheme` only when mode='strict' is
# also passed; as written the report may use the default evaluation mode --
# confirm against the seqeval documentation.
print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

## Adding PoS Tag feature with the Gazetteer Feature

In [None]:
# Each token is described by a list of string features; a CRF accepts any arbitrary feature set.
def word2featuresposGaz(sent, i, personGaz):
    """Return tag- and gazetteer-augmented feature strings for position ``i``.

    Items of ``sent`` carry the token text at index 0 and a tag string at
    index 2.  The features combine the word-shape features, the tag and its
    first two characters (for the token and each existing neighbour) and a
    ``word.ispersongaz`` flag telling whether the lower-cased token is in
    ``personGaz``.  ``'BOS'`` / ``'EOS'`` mark a missing previous / next token.
    """
    token = sent[i][0]
    tag = sent[i][2]
    lowered = token.lower()
    feats = [
        'bias',
        'word.lower=' + lowered,
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
        'word.ispersongaz=%s' % (lowered in personGaz),
    ]

    # Left context (or beginning-of-sentence marker).
    if i <= 0:
        feats.append('BOS')
    else:
        left = sent[i - 1][0]
        left_tag = sent[i - 1][2]
        feats += [
            '-1:word.lower=' + left.lower(),
            '-1:word.istitle=%s' % left.istitle(),
            '-1:word.isupper=%s' % left.isupper(),
            '-1:postag=' + left_tag,
            '-1:postag[:2]=' + left_tag[:2],
        ]

    # Right context (or end-of-sentence marker).
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        right = sent[i + 1][0]
        right_tag = sent[i + 1][2]
        feats += [
            '+1:word.lower=' + right.lower(),
            '+1:word.istitle=%s' % right.istitle(),
            '+1:word.isupper=%s' % right.isupper(),
            '+1:postag=' + right_tag,
            '+1:postag[:2]=' + right_tag[:2],
        ]

    return feats

def sent2featuresposGaz(sent, personGaz, postagger):
    """Return tag- and gazetteer-augmented feature lists for every token of ``sent``.

    ``postagger`` tags the baseline features of the sentence; each predicted
    tag is appended to the matching tuple of ``sent`` before
    ``word2featuresposGaz`` is applied to every position.
    """
    predicted_tags = postagger.tag(sent2features(sent))
    augmented = []
    for pair, tag in zip(sent, predicted_tags):
        augmented.append(pair + (tag,))
    return [word2featuresposGaz(augmented, idx, personGaz) for idx, _ in enumerate(augmented)]

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    return [tag for _word, tag in sent]

def sent2tokens(sent):
    """Return the token of every ``(token, label)`` pair in ``sent``, in order."""
    return [word for word, _tag in sent]

In [None]:
# Tag- and gazetteer-augmented feature sequences plus label sequences for both splits.
X_train_pos_gaz = [
    sent2featuresposGaz(sentence, personGazetteer, postagger)
    for sentence in train_corpus
]
y_train_pos_gaz = list(map(sent2labels, train_corpus))

X_test_pos_gaz = [
    sent2featuresposGaz(sentence, personGazetteer, postagger)
    for sentence in test_corpus
]
y_test_pos_gaz = list(map(sent2labels, test_corpus))

In [None]:
# Hand every (feature sequence, label sequence) pair to a fresh trainer.
trainerposgaz = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train_pos_gaz, y_train_pos_gaz):
    trainerposgaz.append(features, labels)

In [None]:
# Training hyperparameters for the tag + gazetteer CRF.
# NOTE(review): c2 (1e-3) and max_iterations (150) differ from the baseline
# model (1e-4, 100), so score differences are not due to the added features
# alone.
trainerposgaz.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 150,  # cap on training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [None]:
%%time
# Train on the appended sequences; the model is stored in 'twitter-ner-pos-gaz.crfsuite'.
trainerposgaz.train('twitter-ner-pos-gaz.crfsuite')

In [None]:
# Load the tag + gazetteer model file produced by trainerposgaz.train().
taggerposgaz = pycrfsuite.Tagger()
taggerposgaz.open('twitter-ner-pos-gaz.crfsuite')

In [None]:
%%time
# One predicted label sequence per test sentence.
y_pred_pos_gaz = [taggerposgaz.tag(xseq) for xseq in X_test_pos_gaz]

In [None]:
# Write one "token<TAB>gold<TAB>predicted" line per token and a blank line
# after every sentence.  The encoding is given explicitly because
# conllReader() reads this file back as UTF-8; without an encoding argument
# codecs.open() uses the platform default, which can fail on or mis-encode
# non-ASCII tweet text.
with codecs.open('crf.out.pos.gaz.txt', 'w', encoding='utf8') as out_file:
    for gold_sent, pred_sent in zip(test_corpus, y_pred_pos_gaz):
        for (token, gold_label), pred_label in zip(gold_sent, pred_sent):
            out_file.write(token + "\t" + gold_label + "\t" + pred_label + "\n")
        out_file.write("\n")

In [None]:
# Read the dumped file back as per-sentence gold and predicted label lists.
true_labels, predicted_labels = conllReader("crf.out.pos.gaz.txt")

In [None]:
# f1_score and classification_report are the seqeval versions here (the
# seqeval import of classification_report shadows the earlier sklearn one).
print('F1 Score is')
print( f1_score(true_labels, predicted_labels) )

# NOTE(review): seqeval appears to honour `scheme` only when mode='strict' is
# also passed; as written the report may use the default evaluation mode --
# confirm against the seqeval documentation.
print('Classification report')
print( classification_report(true_labels, predicted_labels, scheme=IOB1) )

### This finishes the demonstration for the lab. Please go through the problem statement to complete your lab assignment.
#### You are requested to submit it within the deadline for this assignment. 