# Machine Learning NER

In [1]:
# import libraries
from os import listdir
from xml.dom.minidom import parse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

## 1. Feature Extractor
- Must be an independent program, separate from learner and classifier.
- Must get as argument the directory with the XML files to encode. 
- Must print the feature vectors to `stdout`

> DDI-DrugBank.d658.s0 When 0 3 O form=When formlower=when suf3=hen
suf4=When isTitle BoS formNext=administered
formlowerNext=administered suf3Next=red suf4Next=ered

> DDI-DrugBank.d658.s0 administered 5 16 O form=administered
formlower=administered suf3=red suf4=ered formPrev=When
formlowerPrev=when suf3Prev=hen suf4Prev=When isTitlePrev
formNext=concurrently formlowerNext=concurrently suf3Next=tly
suf4Next=ntly

### Tokenize Text

In [2]:
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('universal_tagset')
stopwords = set(stopwords.words("english"))

In [3]:
# We don't remove stopwords and punctuations here since they may be used for feature extracting
def tokenize(s):
    '''
    Given a sentence , calls nltk.tokenize to split it in tokens, and adds to each token its start / end offset 
    in the original sentence .
    Input - s: string containing the text for one sentence
    Output - Returns a list of tuples (word , offsetFrom , offsetTo )
    '''

    token_list = []
    tokens = word_tokenize(s)
    
    for t in tokens:
            offsetFrom = s.find(t)
            offsetTo = offsetFrom + len(t) - 1
            token_list.append((t, offsetFrom, offsetTo))
            
    return token_list

### Extract features
Given a tokenized sentence, return a feature vector fo each token.
Example :
> `extract_features` ([(" Ascorbic " ,0 ,7) , (" acid " ,9 ,12) , (" ," ,13 ,13) ,
(" aspirin " ,15 ,21) , (" ," ,22 ,22) , (" and " ,24 ,26) , (" the " ,28 ,30) ,
(" common " ,32 ,37) , (" cold " ,39 ,42) , ("." ,43 ,43) ])
[ [ " form = Ascorbic ", " suf4 = rbic ", " next = acid ", " prev = _BoS_ ", "
capitalized " ],

> Output -> [ " form = acid ", " suf4 = acid ", " next =,", " prev = Ascorbic " ],
[ " form =,", " suf4 =,", " next = aspirin ", " prev = acid ", " punct " ],
[ " form = aspirin ", " suf4 = irin ", " next =,", " prev =," ],
]

In [21]:
sent_1 = 'Ascorbic acid, aspirin, and the common cold'
sent_2 = 'Phenothiazines and 3-butyrophenones may reduce or reverse the depressor effect of epinephrine'

tokenized_sent_1 = tokenize(sent_1)
tokenized_sent_2 = tokenize(sent_2)

def has_numbers(word):
    return any(l.isdigit() for l in word)

### Get tag
Given a token and a list of ground truth entities in a sentence, decide which is the B-I-O tag for the token.

**B-I-O Approach** = Mark each token as **B**egin of a **sub**sequence, **I**nside a subsequence, or **O**utside any subsequence.
> `get_tag` ((" Ascorbic " ,0 ,7) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> B-drug

> `get_tag` ((" acid " ,9 ,12) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> I-drug

> `get_tag` ((" common " ,32 ,37) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> 0

> `get_tag` ((" aspirin " ,15 ,21) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> B-brand

In [22]:
def get_tag(token, gold):
    '''
    Input:
        token: A token, i.e. one triple (word, offsetFrom, offsetTo)
        gold: A list of ground truth entities, i.e. a list of triples (offsetFrom, offsetTo, type)
        
    Output:
        The B-I-O ground truth tag for the given token ("B-drug", "I-drug", "B-group", "I-group", "O", ...)
    '''
    (form, start, end) = token
    for (offsetFrom, offsetTo, Type) in gold:
        if start == offsetFrom and end<=offsetTo:
            return "B-"+Type # First letter of token equals 0 -> Beginning
        elif start > offsetFrom and end <=offsetTo:
            return "I-"+Type # Word not in the beginning
    return "O"

In [23]:
print(get_tag ((" Ascorbic " ,0 ,7) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]))
print(get_tag ((" acid " ,9 ,12) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]))
print(get_tag ((" common " ,32 ,37) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) )
print(get_tag ((" aspirin " ,15 ,21) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]))

B- drug 
I- drug 
O
B- brand 


In [124]:
def extract_features(tokenized_sentence):
    '''
    Input:
        s: A tokenized sentence (list of triples (word, offsetFrom, offsetTo) )
        
    Output: 
        A list of feature vectors, one per token.
        Features are binary and vectors are in sparse representeation (i.e. only active features are listed)
    '''
    
    features = []
    
    for i in range(0, len(tokenized_sentence)):
        t = tokenized_sentence[i][0]
        punct = [".",",",";",":","?","!"]
        
        # length, number of digits, rules 
        
        tokenFeatures = [
            "form=" + t,
            "formlower=" + t.lower(),
            "suf3=" + t[-3:],
            "suf4=" + t[-4:],
            "capitalized=%s " % t.istitle(),
            "uppercase=%s" % t.isupper(),
            "digit=%s" % t.isdigit(),
            "hasNumber=%s" % has_numbers(t),
            "stopword=%s" % (t in stopwords),
            "punctuation=%s" % (t in punct),
            #"posTag = %s" % pos_tag(t, tagset = 'universal')[0][1]
        ]  
        

            
        features.append(tokenFeatures)
        
    for i, current_token in enumerate(features):
        # add previous token
        if i > 0:
            prev_token = features[i-1]
            current_token.append("prev=%s" % prev_token[0][5:])
        else:
            current_token.append("prev=_BoS_") #beginning of sentence?
            
        # add next token
        if i < len(features)-1:
            next_token = features[i+1]
            current_token.append("next=%s" % next_token[0][5:])
        else:
            current_token.append("next=_EoS_") # end of sentence

        # we could also add the suffixes of the previous/next word
            
        
    return features

In [125]:
feats_sent_1 = extract_features(tokenized_sent_1)
feats_sent_2 = extract_features(tokenized_sent_2)
print(feats_sent_1)

[['form=Ascorbic', 'formlower=ascorbic', 'suf3=bic', 'suf4=rbic', 'capitalized=True ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=False', 'prev=_BoS_', 'next=acid'], ['form=acid', 'formlower=acid', 'suf3=cid', 'suf4=acid', 'capitalized=False ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=False', 'prev=Ascorbic', 'next=,'], ['form=,', 'formlower=,', 'suf3=,', 'suf4=,', 'capitalized=False ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=True', 'prev=acid', 'next=aspirin'], ['form=aspirin', 'formlower=aspirin', 'suf3=rin', 'suf4=irin', 'capitalized=False ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=False', 'prev=,', 'next=,'], ['form=,', 'formlower=,', 'suf3=,', 'suf4=,', 'capitalized=False ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=True', 'prev=aspirin', 'next=and'], ['form=and', 'forml

### Feature Extractor function

In [126]:
datadir = '/Users/mponsclo/Downloads/labAHLT/data/train'
datadir = "../labAHLT/data/train"



def feature_extractor(datadir, resultpath):
    result_f = open(resultpath, 'w')
    # process each file in directory
    for f in listdir(datadir):

        # parse XML file, obtaining a DOM tree
        tree = parse(datadir + "/" + f)

        # process each sentence in the file
        sentences = tree.getElementsByTagName("sentence")
        for s in sentences:
            sid = s.attributes["id"].value # get sentence id
            stext = s.attributes["text"].value # get sentence text
            # load ground truth entities
            gold = []
            entities = s.getElementsByTagName("entity")
            for e in entities:
                # for discontinuous entities, we only get the first span
                offset = e.attributes["charOffset"].value      # e.g. 24-44
                try: # too many values to unpack in some iteration
                    (start, end) = offset.split(":")[0].split("-") # e.g. start:24, end:44
                except:
                    pass
                gold.append((int(start), int(end), e.attributes["type"].value)) # e.g. [(24, 44, 'drug')] 

            # tokenize text
            tokens = tokenize(stext)

            # extract features for each word in the sentence
            features = extract_features(tokens)

            # print features in format suitable for the learner/classifier
            for i in range (0, len(tokens)):
                # see if the token is part of an entity, and which part (B/I)
                tag = get_tag(tokens[i], gold)
                joined_features = "\t".join(features[i])
                result_f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(sid, tokens[i][0], tokens[i][1], tokens[i][2], tag, joined_features))
                #print(sid, tokens[i][0], tokens[i][1], tokens[i][2], tag, "\t".join(features[i]), sep='\t')
        

            # black line to separate sentences
            #print()

In [127]:
feature_extractor(datadir)

TypeError: feature_extractor() missing 1 required positional argument: 'resultpath'

## 2. Learner

### Option 1: CRF
The learner needs only the right class and the features, so you'll need to remove the 4 extra fields _(sent\_id, token, span\_start, span\_end)_ added by the feature extractor, before feeding the vector to the learner. 

In [136]:
import sklearn
import pycrfsuite

In [137]:
traindatadir = "../labAHLT/data/train"
testdatadir = "../labAHLT/data/test"

train_data_filename = "train_data.out"
test_data_filename = "test_data.out"

In [138]:
feature_extractor(traindatadir, train_data_filename)
feature_extractor(testdatadir, test_data_filename)
#todo read about features accepted by CRF & try fitting crf

In [181]:
from itertools import groupby
from operator import itemgetter

# def read_feature_file(filepath):
#     f = open(filepath, 'r')
#     lines = f.readlines()
#     features = {}
#     classes = {}
#     metadata={} # sentence ids, token form, offsets - for later reconstruction
#     #print(lines)
#     for line in lines:
#         split_line = line[:-1].split('\t')
#         sentence_id = split_line[0]
#         feature_dict = {
#             "form": split_line[5],
#             "formlower": split_line[6]
            
#         }
#         #TODO read all features from text files
#         metadata.append((sentence_id, (split_line[1], split_line[2], split_line[3])))
#         #features.append(split_line[5:])
#         features.append(feature_dict)
#         classes.append(split_line[4])
#     return metadata, features, classes
def sentence2features(sentence):
    classes = []
    features = []
    tokens = []
    for token in sentence:
        split_line = token[:-1].split('\t')
        sentence_id = split_line[0]
        feature_dict = {
            "form": split_line[5],
            "formlower": split_line[6]

        }
        #TODO read all features from text files
        features.append(split_line[5:])
        classes.append(split_line[4])
        tokens.append((split_line[1], split_line[2], split_line[3]))
    return features, classes, tokens

def read_feature_file(filepath):
    f = open(filepath, 'r')
    lines = f.readlines()
    features = []
    classes = []
    metadata=[]# sentence ids, token form, offsets - for later reconstruction
    sentences = set()
    #print(lines)
    sentences = groupby(lines, lambda l: l.split('\t')[0])
    for sid, sentence in sentences:
        s_features, s_classes, s_tokens = sentence2features(sentence)
        features.append(s_features)
        classes.append(s_classes)
        metadata.append((sid, s_tokens))
           
    return metadata, features, classes
        

In [182]:
train_metadata, X_train, y_train = read_feature_file(train_data_filename)
test_metadata, X_test, y_test = read_feature_file(test_data_filename)

In [183]:
print(X_train[:5])
print(y_train[:5])
print(train_metadata[:5])

[[['form=While', 'formlower=while', 'suf3=ile', 'suf4=hile', 'capitalized=True ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=False', 'prev=_BoS_', 'next=co-administration'], ['form=co-administration', 'formlower=co-administration', 'suf3=ion', 'suf4=tion', 'capitalized=False ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=False', 'prev=While', 'next=of'], ['form=of', 'formlower=of', 'suf3=of', 'suf4=of', 'capitalized=False ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=True', 'punctuation=False', 'prev=co-administration', 'next=ZAVESCA'], ['form=ZAVESCA', 'formlower=zavesca', 'suf3=SCA', 'suf4=ESCA', 'capitalized=False ', 'uppercase=True', 'digit=False', 'hasNumber=False', 'stopword=False', 'punctuation=False', 'prev=of', 'next=appeared'], ['form=appeared', 'formlower=appeared', 'suf3=red', 'suf4=ared', 'capitalized=False ', 'uppercase=False', 'digit=False', 'hasNumber=False', 'stopword=F

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [186]:
# train
train_result_file = 'conll2002-esp.crfsuite'
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
    
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})
trainer.train(train_result_file)

In [187]:
trainer.logparser.last_iteration

{'num': 50,
 'scores': {},
 'loss': 15116.108667,
 'feature_norm': 85.609124,
 'error_norm': 277.056944,
 'active_features': 6909,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.069}

In [190]:
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

50 {'num': 50, 'scores': {}, 'loss': 15116.108667, 'feature_norm': 85.609124, 'error_norm': 277.056944, 'active_features': 6909, 'linesearch_trials': 1, 'linesearch_step': 1.0, 'time': 0.069}


In [192]:
# predict
tagger = pycrfsuite.Tagger()
tagger.open(train_result_file)

predicted = tagger.tag(X_test[0])

print("Predicted:", ' '.join(predicted))
print("Correct:  ", ' '.join(y_test[0]))

Predicted: B-drug O O O O O O O O O O O O O O O B-drug O
Correct:   B-drug O O O O O O I-drug O O O O O O O O B-drug_n O
