# Machine Learning NER

In [1]:
from os import listdir
from xml.dom.minidom import parse
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from evaluator import *

In [2]:
import nltk
# Import the corpus reader under its own alias: binding the resulting set to the
# name `stopwords` while also calling `stopwords.words(...)` only works the first
# time the cell runs (on a re-run `stopwords` is already a set and has no `.words`).
from nltk.corpus import stopwords as stopwords_corpus

# English stop words as a set, for fast membership tests in the feature extractor.
stopwords = set(stopwords_corpus.words("english"))
# Shared lemmatizer instance used to build the "lemma=" feature.
wordnet_lemmatizer = WordNetLemmatizer()

### Tokenize Text

In [3]:
def tokenize(s):
    '''
    Task:
        Split a sentence into tokens and locate every token in the original text.
    Input:
        s: sentence text (string)
    Output:
        A list of triples (word, offsetFrom, offsetTo), one per token, in sentence
        order. Offsets are character positions in s; offsetTo is inclusive.
    '''
    token_list = []
    cursor = 0  # index of the first character of s not yet covered by a token

    for t in word_tokenize(s):
        length = len(t)
        # Search from the cursor, not from the start of the sentence: otherwise a
        # repeated word (or a token that is a substring of an earlier word) would
        # always be given the offsets of its first occurrence.
        offsetFrom = s.find(t, cursor)

        if offsetFrom == -1 and t in ("``", "''"):
            # NLTK's Treebank-style tokenizer rewrites a straight double quote as
            # `` or '', so look for the single original character instead.
            offsetFrom = s.find('"', cursor)
            length = 1

        if offsetFrom == -1:
            # Token cannot be aligned after the cursor: fall back to a search over
            # the whole sentence (may still be -1) and leave the cursor untouched.
            offsetFrom = s.find(t)
            token_list.append((t, offsetFrom, offsetFrom + len(t) - 1))
            continue

        offsetTo = offsetFrom + length - 1
        token_list.append((t, offsetFrom, offsetTo))
        cursor = offsetTo + 1

    return token_list

### Extract features

In [4]:
# Two sample sentences used further down to try the tokenizer and the feature
# extractor by hand.
sent_1 = 'Ascorbic acid, aspirin, and the common cold PCP'
sent_2 = 'Phenothiazines and butyrophenones may reduce or reverse the depressor effect of epinephrine'

def has_numbers(word):
    '''Return True when at least one character of word is a digit, else False.'''
    for character in word:
        if character.isdigit():
            return True
    return False
def num_digits(word):
    '''Return how many characters of word are digits.'''
    digit_characters = [character for character in word if character.isdigit()]
    return len(digit_characters)

In [5]:
# Locations of the external drug-name resources, relative to the working directory.
SIMPLE_DB_PATH = './resources/HSDB.txt'
DRUG_BANK_PATH = './resources/DrugBank.txt'

# In-memory lookup tables; both start empty and are filled by read_drug_list_files().
# SimpleDrugDb: lower-cased names read from SIMPLE_DB_PATH.
SimpleDrugDb = set()
# DrugBank: lower-cased names read from DRUG_BANK_PATH, bucketed by entity type.
DrugBank = {'drug' : set(), 'brand': set(), 'group': set()}

def read_drug_list_files():
    '''
    Read the drug databases.

    Task:
        Load both resource files into the module-level lookup tables.
    Input:
        None. Reads SIMPLE_DB_PATH (one name per line) and DRUG_BANK_PATH
        (one "name|type" entry per line).
    Output:
        None. Rebinds SimpleDrugDb to the set of lower-cased names and adds each
        lower-cased DrugBank name to DrugBank[type].
    Raises:
        KeyError if a DrugBank line carries a type that is not a key of DrugBank.
    '''
    global SimpleDrugDb
    global DrugBank

    # `with` closes the handles even if parsing fails (they used to be left open).
    with open(SIMPLE_DB_PATH, 'r') as resource_file:
        # Strip only the line terminator: slicing off the last character would eat
        # a real letter when the final line has no trailing newline.
        names = (line.rstrip('\r\n') for line in resource_file)
        # Blank lines are skipped so the empty string never counts as a drug name.
        SimpleDrugDb = set(name.lower() for name in names if name)

    with open(DRUG_BANK_PATH, 'r') as resource_file:
        for line in resource_file:
            fields = line.rstrip('\r\n').split('|')
            if len(fields) < 2:
                # Blank or malformed line: nothing to index.
                continue
            name, n_type = fields[0].lower(), fields[1]
            DrugBank[n_type].add(name)

In [6]:
def use_db_resources(word):
    '''
    Task:
        Look a word up, case-insensitively, in the loaded drug resources.
    Input:
        word: token text (string)
    Output:
        (True, type) when the word is found, where type is "drug", "brand" or
        "group"; (False, "") otherwise. SimpleDrugDb is consulted first and
        counts as "drug", then the DrugBank buckets in the order drug, brand, group.
    '''
    # Hand-collected examples of the "drug_n" class. The rule that would use them
    # is disabled, so this list is currently not consulted.
    drug_n = ["PCP", "18-MC", "ibogaine", "MHD", "endotoxin", "toxin", "NANM", "ginsenosides", 
             "NaCMC", "PTX", "coumaphos", "contortrostatin", "resveratrol", "GSLS", "methylglyoxal",
             "hydrodolasetron", "neurotensin"]

    lowered = word.lower()

    if lowered in SimpleDrugDb:
        return True, "drug"

    for category in ("drug", "brand", "group"):
        if lowered in DrugBank[category]:
            return True, category

    return False, ""

### Get tag

In [7]:
def get_tag(token, gold):
    '''
    Input:
        token: one token, given as a triple (word, offsetFrom, offsetTo)
        gold: ground truth entities, a list of triples (offsetFrom, offsetTo, type)

    Output:
        The B-I-O tag of the token: "B-<type>" when the token starts exactly where
        an entity starts and ends inside it, "I-<type>" when it starts after the
        entity start and ends inside it, and "O" when no entity contains it.
        The first matching entity in gold decides.
    '''
    _, token_start, token_end = token

    for entity_start, entity_end, entity_type in gold:
        # Skip entities that do not fully contain the token.
        if token_end > entity_end or token_start < entity_start:
            continue
        prefix = "B-" if token_start == entity_start else "I-"
        return prefix + entity_type

    return "O"

In [8]:
# Manual sanity check of get_tag on hand-made tokens with two gold entities:
# the first token of an entity should get "B-", a later token inside it "I-",
# and a token outside every entity "O".
print(get_tag ((" Ascorbic " ,0 ,7) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]))
print(get_tag ((" acid " ,9 ,12) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]))
print(get_tag ((" common " ,32 ,37) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) )
print(get_tag ((" aspirin " ,15 ,21) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]))

B- drug 
I- drug 
O
B- brand 


In [9]:
def extract_features(tokenized_sentence, should_look_up=False):
    '''
    Input:
        tokenized_sentence: A tokenized sentence (list of triples (word, offsetFrom, offsetTo) )
        should_look_up: when True, add a "Ruled = <type>" feature obtained from the
            drug resources (loaded on first use through read_drug_list_files)

    Output:
        A list of feature vectors, one per token.
        Features are binary and vectors are in sparse representation (i.e. only active features are listed).
        Each vector starts with "form=<word>", followed by the other token-level
        features, the optional "Ruled" feature, and finally the features of the
        previous and next token. An empty sentence gives an empty list.
    '''
    punct = [".", ",", ";", ":", "?", "!"]

    words = [token[0] for token in tokenized_sentence]
    if not words:
        return []

    # Tag the sentence as a whole. pos_tag expects a list of tokens; handing it a
    # single string makes it tag the individual characters, so the old feature was
    # the tag of the word's first letter (and an empty token raised IndexError).
    pos_tags = [tag for _, tag in pos_tag(words)]

    # Load the drug resources once, and only if they are still empty, instead of
    # re-reading both files for every single token.
    if should_look_up and not (SimpleDrugDb or any(DrugBank.values())):
        read_drug_list_files()

    features = []

    for t, tag in zip(words, pos_tags):
        tokenFeatures = [
            "form=" + t,
            "formlower=" + t.lower(),
            "suf3=" + t[-3:],
            "suf4=" + t[-4:],
            "suf5=" + t[-5:],
            "prfx3=" + t[:3],
            "prfx4=" + t[:4],
            "prfx5=" + t[:5],
            # NOTE: the trailing space is kept on purpose so the feature string
            # stays identical to the one in already generated feature files.
            "capitalized=%s " % t.istitle(),
            "uppercase=%s" % t.isupper(),
            "digit=%s" % t.isdigit(),
            "stopword=%s" % (t in stopwords),
            "punctuation=%s" % (t in punct),
            "length=%s" % len(t),
            "posTag=%s" % tag,
            "lemma=%s" % wordnet_lemmatizer.lemmatize(t),
            "numDigits=%s" % num_digits(t),
            "containsDash=%s" % ('-' in t)
        ]

        if should_look_up:
            (is_drug, isType) = use_db_resources(t)
            if is_drug:
                tokenFeatures.append("Ruled = %s" % isType)
            else:
                tokenFeatures.append("Ruled = O")

        features.append(tokenFeatures)

    last_index = len(words) - 1
    for i, current_token in enumerate(features):
        # add previous token
        if i > 0:
            prev_token = words[i - 1]
            current_token.append("prev=%s" % prev_token)
            current_token.append("suf3Prev = %s" % prev_token[-3:])
            current_token.append("suf4Prev = %s" % prev_token[-4:])
            current_token.append("prevIsTitle = %s" % prev_token.istitle())
            current_token.append("prevIsUpper = %s" % prev_token.isupper())
            current_token.append("PrevIsDigit = %s" % prev_token.isdigit())
        else:
            current_token.append("prev=_BoS_")  # beginning of sentence

        # add next token
        if i < last_index:
            next_token = words[i + 1]
            current_token.append("next=%s" % next_token)
            current_token.append("suf3Next = %s" % next_token[-3:])
            # Four characters here; this used to repeat the 3-character suffix.
            current_token.append("suf4Next = %s" % next_token[-4:])
            current_token.append("NextIsTitle = %s" % next_token.istitle())
            current_token.append("NextIsUpper = %s" % next_token.isupper())
            current_token.append("NextIsDigit = %s" % next_token.isdigit())
        else:
            current_token.append("next=_EoS_")  # end of sentence

    return features

In [61]:
# Try the pipeline by hand on the second sample sentence: tokenize it, then build
# the feature vectors with the drug-resource lookup switched on.
tokenized_sent_2 = tokenize(sent_2)
feats_sent_2 = extract_features(tokenized_sent_2, should_look_up = True)
# Uncomment to inspect the resulting feature vectors.
#print(feats_sent_2)

### Feature Extractor function

In [10]:
def feature_extractor(datadir, resultpath, should_look_up = False):
    '''
    Task:
        Turn every sentence of every XML file in a directory into one line per
        token, in the format expected by the learner/classifier.
    Input:
        datadir: directory holding the XML files
        resultpath: path of the feature file to write (overwritten if present)
        should_look_up: forwarded to extract_features (drug-resource lookup)
    Output:
        None. Each written line holds, separated by tabs: sentence id, word,
        offsetFrom, offsetTo, B-I-O tag and the token's features.
    '''
    # `with` guarantees the feature file is flushed and closed, also on errors.
    with open(resultpath, 'w') as result_f:
        # process each file in directory (sorted so the output order is reproducible)
        for f in sorted(listdir(datadir)):

            # parse XML file, obtaining a DOM tree
            tree = parse(datadir + "/" + f)

            # process each sentence in the file
            for s in tree.getElementsByTagName("sentence"):
                sid = s.attributes["id"].value      # get sentence id
                stext = s.attributes["text"].value  # get sentence text

                # load ground truth entities
                gold = []
                for e in s.getElementsByTagName("entity"):
                    offset = e.attributes["charOffset"].value  # e.g. 24-44
                    # For discontinuous entities only the first span is used. The
                    # spans are cut at ';' (e.g. "24-44;50-60") and, as before,
                    # at ':'.
                    first_span = offset.split(";")[0].split(":")[0]
                    try:
                        start, end = (int(part) for part in first_span.split("-"))
                    except ValueError:
                        # Unparseable offset: skip this entity. The old bare
                        # `except: pass` silently reused the previous entity's
                        # start/end for it.
                        continue
                    gold.append((start, end, e.attributes["type"].value))  # e.g. [(24, 44, 'drug')]

                # tokenize text
                tokens = tokenize(stext)

                # extract features for each word in the sentence
                features = extract_features(tokens, should_look_up)

                # print features in format suitable for the learner/classifier
                for token, token_features in zip(tokens, features):
                    # see if the token is part of an entity, and which part (B/I)
                    tag = get_tag(token, gold)
                    joined_features = "\t".join(token_features)
                    result_f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                        sid, token[0], token[1], token[2], tag, joined_features))

In [None]:
# Locations of the train / devel / test corpora.
# NOTE(review): these are absolute paths on one specific machine; they must be
# adapted before the notebook can run anywhere else.
train_dataset = '/Users/mponsclo/Downloads/labAHLT/data/train'
devel_dataset = '/Users/mponsclo/Downloads/labAHLT/data/devel'
test_dataset = '/Users/mponsclo/Downloads/labAHLT/data/test'

# Build the training feature file with the drug-resource lookup enabled.
feature_extractor(train_dataset, 'features_train_l.txt', should_look_up = True)
# Extraction for the devel and test sets is currently disabled.
#feature_extractor(devel_dataset, 'features_devel')
#feature_extractor(test_dataset, 'features_test')

### Learner: CRF

In [35]:
import pycrfsuite
from operator import itemgetter
from itertools import groupby

from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score

In [16]:
def parse_sentence_strings(sentence):
    '''
    Task:
        Given a stringified sentence, parse it and return the data in relevant data structures.
    Input:
        sentence: iterable of strings, one per token of the sentence, where each piece of information
            is separated by a tab, e.g.
            "DDI-DrugBank.d695.s0	While	0	4	O	form=While	formlower=while	suf3=ile	suf4=hile"
    Output:
        tokens: list of tuples representing tokens: (word, offset_from, offset_to);
            the offsets are kept as the strings found in the line
        features: list of features for every token
        tags: list of tags (e.g. B-drug, O, I-brand etc.) for every token
    '''
    tags = []
    features = []
    tokens = []
    for token in sentence:
        # Remove only the line terminator; dropping the last character blindly
        # would cut into the last feature of a line that has no trailing newline.
        split_data = token.rstrip('\n').split('\t')
        # Field 0 is the sentence id, which is not needed here.
        tokens.append((split_data[1], split_data[2], split_data[3]))
        tags.append(split_data[4])
        features.append(split_data[5:])
    return tokens, features, tags

def read_feature_file(filepath):
    '''
    Task:
        Given the path to the file containing tokenized sentences, read it and return the necessary data structures
    Input:
        filepath: Path to the data
    Output:
        tokens_by_sentence: list of tuples: (sentence_id, list_of_tokens). Each tuple represents a sentence,
            where in the list_of_tokens each token is represented by a tuple (word, offset_from, offset_to)
        features: list of lists of features per sentence
        tags: list of lists of B-I-O tags per sentence
    '''
    features = []
    tags = []
    tokens_by_sentence = []

    # Read everything up front and close the file right away (the handle used to
    # be left open).
    with open(filepath, 'r') as f:
        lines = f.readlines()

    # group the tokens by sentence: consecutive lines sharing the first field
    # (the sentence id) form one sentence
    sentences = groupby(lines, lambda l: l.split('\t')[0])

    # process each sentence
    for sid, sentence in sentences:
        s_tokens, s_features, s_tags = parse_sentence_strings(sentence)

        tokens_by_sentence.append((sid, s_tokens))
        features.append(s_features)
        tags.append(s_tags)

    return tokens_by_sentence, features, tags

In [40]:
# Scratch cell: create a throw-away trainer for the Passive Aggressive algorithm
# to inspect which parameters it supports.
trainer = pycrfsuite.Trainer(algorithm='pa', verbose=False)
#help(pycrfsuite.Trainer) 
# Queries the trainer for its parameters (same as trainer.params()).
pycrfsuite.Trainer.params(trainer)
# Algorithms considered:
# lbfgs - Gradient descent using the L-BFGS method
# l2sgd - Stochastic Gradient Descent with L2 regularization term
# pa - Passive Aggressive
# Candidate hyper-parameter values for the 'pa' algorithm.
# NOTE(review): this dict is not referenced anywhere else in the visible notebook.
params = {
        'c': 0.21600273890535607,
        'epsilon': 0.004802939229551229,
        'type': 2,
        'feature.possible_transitions': True,
        'feature.possible_states': True,
        'max_iterations': 100
    }

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'type',
 'c',
 'error_sensitive',
 'averaging',
 'max_iterations',
 'epsilon']

In [41]:
def crf_learn(features_file, model_name, algorithm='pa', params=None):
    '''
    Task:
        Train a CRF model on a feature file and store it on disk.
    Input:
        features_file: path of the feature file (format read by read_feature_file)
        model_name: path of the model file to write
        algorithm: training algorithm handed to pycrfsuite.Trainer (default 'pa')
        params: dict of trainer parameters; when None, defaults that match the
            chosen algorithm are used
    Output:
        None. The trained model is written to model_name.
    '''
    # Read features from file
    _, X_train, y_train = read_feature_file(features_file)

    if params is None:
        params = {
            'feature.possible_states': True,
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True,
        }
        if algorithm == 'pa':
            # The parameter list printed in this notebook for the 'pa' trainer
            # contains c / epsilon / type but no c1 / c2, so the L-BFGS
            # regularisation settings used before do not apply to it. These are
            # the values recorded for 'pa' elsewhere in the notebook.
            params.update({
                'c': 0.21600273890535607,
                'epsilon': 0.004802939229551229,
                'type': 2,
                'max_iterations': 100,
            })
        elif algorithm == 'lbfgs':
            params.update({
                'c1': 0.2,
                'c2': 0.001,
                'max_iterations': 1000,
            })

    trainer = pycrfsuite.Trainer(algorithm=algorithm, verbose=False)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params(params)

    trainer.train(model_name)
    return

In [47]:
# Train the CRF on the training features and save the model as ml_model.crfsuite.
# NOTE(review): the extraction cell in this notebook writes 'features_train_l.txt',
# while this call reads 'features_train' - confirm which file name is intended.
crf_learn("features_train", "ml_model.crfsuite")

### Hyperparameter Optimization

In [None]:
# Placeholder: the hyper-parameter search has not been written yet; the list is
# empty and not used anywhere else in the visible notebook.
labels = []

### Classifier

In [42]:
def output_entities(sid, tokens, tags, outf):
    '''
    Task:
        Turn the B-I-O tags of one sentence into entity lines in the format
            expected by the evaluator.
    Input:
        sid: sentence identifier (required by the evaluator output format)
        tokens: tokens of the sentence, i.e. list of tuples (word, offsetFrom,
            offsetTo); offsets may be ints or numeric strings
        tags: B-I-O tag of each token
        outf: writable file-like object that receives the entities
    Output:
        Nothing is returned. One line per entity is written to outf, fields
        separated by '|', field order: id, offset, name, type.
    '''
    for position, (entity, offset_from, offset_to) in enumerate(tokens):
        tag = tags[position]

        # only a B- tag opens an entity
        if tag[0] != 'B':
            continue
        tag_name = tag[2:]

        # extend the entity over the following tokens while each one is tagged
        # I- and starts exactly two positions after the current end offset
        for following in range(position + 1, len(tokens)):
            word_next, offset_from_next, offset_to_next = tokens[following]
            tag_next = tags[following]
            is_adjacent = int(offset_from_next) - int(offset_to) == 2
            if not (is_adjacent and tag_next[0] == 'I'):
                break
            if tag_next[2:] == tag_name:
                entity = entity + ' ' + word_next
                offset_to = offset_to_next

        # emit the whole entity as one line
        span = str(offset_from) + '-' + str(offset_to)
        outf.write(str(sid) + "|" + span + "|" + entity + "|" + tag_name)
        outf.write("\n")

In [43]:
def crf_classifier(model_name, features, inputdir, outputfile):
    '''
    Task:
        Tag every sentence of a feature file with a trained CRF model, write the
        predicted entities to a file and evaluate them.
    Input:
        model_name: path of the trained model file
        features: path of the feature file to classify
        inputdir: directory with the gold data, passed on to the evaluator
        outputfile: path of the entity file to write
    Output:
        None. Entities are written to outputfile, then evaluate("NER", inputdir,
        outputfile) is called.
    '''
    # The gold tags stored in the feature file are not needed for prediction.
    ids_and_tokens, X_devel, _ = read_feature_file(features)

    # Load the model before touching the output file, so that a missing model
    # does not leave behind an empty (or truncated) result file.
    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)
    try:
        tags = [tagger.tag(sentence) for sentence in X_devel]
    finally:
        tagger.close()

    # `with` makes sure the file is closed - and fully written - before the
    # evaluator reads it, and also if writing an entity fails.
    with open(outputfile, "w") as outf:
        for sentence_tags, (sid, tokens) in zip(tags, ids_and_tokens):
            output_entities(sid, tokens, sentence_tags, outf)

    evaluate("NER", inputdir, outputfile)
    return

In [47]:
# Tag the test-set features with the trained model, write the predicted entities
# to the file "hello" and evaluate them against the data in test_dataset.
# Requires 'ml_model.crfsuite' and 'features_test.txt' to exist already.
# crf_classifier returns nothing, so `result` is None.
result = crf_classifier("ml_model.crfsuite", "features_test.txt", test_dataset, "hello")

FileNotFoundError: [Errno 2] No such file or directory: 'ml_model.crfsuite'