# Assignment 3

Date: 20-12-2020 <br>
Nick Radunovic (s2072724) <br>
Cheyenne Heath (s1647865) <br>

In [2]:
#Imports
import random
import os
random.seed(30) # set random seed for reproducibility

import numpy as np
from itertools import chain
from collections import Counter
import eli5

import nltk
nltk.download('punkt')
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Stand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing

First, we define function that are used for preprocessing the data

In [13]:
def parse_data(text, annotation):    
    # parse annotation-file
    bio_tag = []
    with open(annotation, encoding='utf-8') as ann:
        for l in ann:
            l = l.split('\t')
            tag = l[1].split()[0]
            if not tag == 'AnnotatorNotes':
                string = l[2].strip()
                bio_tag.append((tag, string))
                
    # parse text-file and add POS tags
    sents = []
    with open(text, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            line = nltk.word_tokenize(line)
            pos_line = add_pos_tag(line)
            sents.append(pos_line)
    
    # add BIO tags and format data as follows -> (word, pos, biotag)
    sents = add_bio_tag(sents, bio_tag)
    
    return sents


def add_pos_tag(sent):
    return nltk.pos_tag(sent)


def add_bio_tag(sents, bio_tag):
    """ We use 'cadec/original/' as annotation to add BIO-tags """
    bio_sent = []
    for sent in sents:
        remaining = 0
        for i, word in enumerate(sent):
            BIOtag = 'O'
            for j, tag in enumerate(bio_tag):
                target = nltk.word_tokenize(tag[1])
                tag = tag[0]
                count = 0
                
                # changes the biotag to either 'B-' or 'I-' when necessary
                if word[0] == target[0]:
                    for k in range(len(target)):
                        if len(sent) > i+k and sent[i+k][0] == target[k]:
                            count += 1
                            
                    # if target is found, the current word gets a 'B-' tag assigned
                    if count == len(target):
                        definite_tag = tag
                        BIOtag = 'B-' + definite_tag
                        remaining = len(target) - 1
                        break
            
            # changes the biotag to 'I-' when necessary
            if remaining > 0 and BIOtag == 'O':
                BIOtag = 'I-' + definite_tag
                remaining -= 1
          
            bio_sent.append((word[0], word[1], BIOtag))
    return bio_sent


# VERSION: splitting by sentence
# def add_bio_tag(sents, bio_tag):
#     """ We use 'cadec/original/' as annotation to add BIO-tags """
#     msg = []
#     for sent in sents:
#         bio_sent = []
#         remaining = 0
#         for i, word in enumerate(sent):
#             BIOtag = 'O'
#             for j, tag in enumerate(bio_tag):
#                 target = nltk.word_tokenize(tag[1])
#                 tag = tag[0]
#                 count = 0
                
#                 # changes the biotag to either 'B-' or 'I-' when necessary
#                 if word[0] == target[0]:
#                     for k in range(len(target)):
#                         if len(sent) > i+k and sent[i+k][0] == target[k]:
#                             count += 1
                            
#                     # if target is found, the current word gets a 'B-' tag assigned
#                     if count == len(target):
#                         definite_tag = tag
#                         BIOtag = 'B-' + definite_tag
#                         remaining = len(target) - 1
#                         break
            
#             # changes the biotag to 'I-' when necessary
#             if remaining > 0 and BIOtag == 'O':
#                 BIOtag = 'I-' + definite_tag
#                 remaining -= 1
          
#             bio_sent.append((word[0], word[1], BIOtag))
#         msg.append(bio_sent)
#     return msg

We preprocess the data:

(1) The data is parsed,
(2) POS tags are added,
(3) BIO tags are added.

In [14]:
DIR_text = 'cadec/text/'
DIR_annotation = 'cadec/original/'

data = []
for f in os.listdir(DIR_text):
    text = DIR_text + f
    ann = DIR_annotation + f[:-4] + '.ann'
    m = parse_data(text, ann)
    data.append(m)

# shows first patient post
data[0]

[('I', 'PRP', 'O'),
 ('feel', 'VBP', 'O'),
 ('a', 'DT', 'O'),
 ('bit', 'NN', 'B-ADR'),
 ('drowsy', 'JJ', 'I-ADR'),
 ('&', 'CC', 'O'),
 ('have', 'VBP', 'O'),
 ('a', 'DT', 'O'),
 ('little', 'JJ', 'B-ADR'),
 ('blurred', 'JJ', 'I-ADR'),
 ('vision', 'NN', 'I-ADR'),
 (',', ',', 'O'),
 ('so', 'RB', 'O'),
 ('far', 'RB', 'O'),
 ('no', 'DT', 'O'),
 ('gastric', 'JJ', 'B-ADR'),
 ('problems', 'NNS', 'I-ADR'),
 ('.', '.', 'O'),
 ('I', 'PRP', 'O'),
 ("'ve", 'VBP', 'O'),
 ('been', 'VBN', 'O'),
 ('on', 'IN', 'O'),
 ('Arthrotec', 'NNP', 'B-Drug'),
 ('50', 'CD', 'O'),
 ('for', 'IN', 'O'),
 ('over', 'IN', 'O'),
 ('10', 'CD', 'O'),
 ('years', 'NNS', 'O'),
 ('on', 'IN', 'O'),
 ('and', 'CC', 'O'),
 ('off', 'IN', 'O'),
 (',', ',', 'O'),
 ('only', 'RB', 'O'),
 ('taking', 'VBG', 'O'),
 ('it', 'PRP', 'O'),
 ('when', 'WRB', 'O'),
 ('I', 'PRP', 'O'),
 ('needed', 'VBD', 'O'),
 ('it', 'PRP', 'O'),
 ('.', '.', 'O'),
 ('Due', 'JJ', 'O'),
 ('to', 'TO', 'O'),
 ('my', 'PRP$', 'O'),
 ('arthritis', 'NN', 'B-Disease'),
 ('g

## Training the NER classifier

We split the data into a 80% train set and a 20% test set.

In [32]:
train_size = 0.8  # specifying the % of data used as train set

split = int(len(data) * train_size)
train_data, test_data = data[:split], data[split:]

In [16]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [19]:
X_train = [sent2features(s) for s in train_data]
y_train = [sent2labels(s) for s in train_data]

X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]

In [20]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [21]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ADR',
 'I-ADR',
 'B-Drug',
 'B-Disease',
 'B-Symptom',
 'I-Symptom',
 'I-Disease',
 'I-Drug',
 'B-Finding',
 'I-Finding']

In [22]:
y_pred = crf.predict(X_test)

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-ADR      0.732     0.686     0.708       994
       I-ADR      0.655     0.544     0.594      1643
   B-Disease      0.148     0.138     0.143        29
   I-Disease      0.071     0.077     0.074        13
      B-Drug      0.983     0.766     0.861       304
      I-Drug      0.769     0.526     0.625        19
   B-Finding      0.390     0.147     0.213       109
   I-Finding      0.205     0.059     0.091       136
   B-Symptom      0.535     0.329     0.407        70
   I-Symptom      0.182     0.047     0.074        43

   micro avg      0.688     0.557     0.616      3360
   macro avg      0.467     0.332     0.379      3360
weighted avg      0.666     0.557     0.603      3360



In [24]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=15, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  4.6min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None, gamma=None,
                                 keep_...
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024DA45FCB08>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024DA45FC988>},
                   pre_dispatch='2*n_jobs', random_state=None, refi

In [25]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

print("\nweighted avg:")
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for k in weighted_avg.keys():
    print("%s: %s" % (k, round(weighted_avg[k], 3)))

best params: {'c1': 0.4858854663877837, 'c2': 0.016422517233163236}
best CV score: 0.5893758737287462
model size: 0.57M

weighted avg:
precision: 0.668
recall: 0.553
f1-score: 0.602
support: 3360


In [26]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print("weighted avg:")
weighted_avg = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3, output_dict=True)['weighted avg']
for k in weighted_avg.keys():
    print("%s: %s" % (k, round(weighted_avg[k], 3)))

weighted avg:
precision: 0.666
recall: 0.557
f1-score: 0.603
support: 3360
