# Machine Learning NER

In [87]:
# import libraries
from os import listdir
from xml.dom.minidom import parse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

## 1. Feature Extractor
- Must be an independent program, separate from learner and classifier.
- Must get as argument the directory with the XML files to encode. 
- Must print the feature vectors to `stdout`

> DDI-DrugBank.d658.s0 When 0 3 O form=When formlower=when suf3=hen
suf4=When isTitle BoS formNext=administered
formlowerNext=administered suf3Next=red suf4Next=ered

> DDI-DrugBank.d658.s0 administered 5 16 O form=administered
formlower=administered suf3=red suf4=ered formPrev=When
formlowerPrev=when suf3Prev=hen suf4Prev=When isTitlePrev
formNext=concurrently formlowerNext=concurrently suf3Next=tly
suf4Next=ntly

### Tokenize Text

In [100]:
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('universal_tagset')
stopwords = set(stopwords.words("english"))

In [23]:
# We don't remove stopwords and punctuations here since they may be used for feature extracting
def tokenize(s):
    '''
    Given a sentence , calls nltk.tokenize to split it in tokens, and adds to each token its start / end offset 
    in the original sentence .
    Input - s: string containing the text for one sentence
    Output - Returns a list of tuples (word , offsetFrom , offsetTo )
    '''

    token_list = []
    tokens = word_tokenize(s)
    
    for t in tokens:
            offsetFrom = s.find(t)
            offsetTo = offsetFrom + len(t) - 1
            token_list.append((t, offsetFrom, offsetTo))
            
    return token_list

### Extract features
Given a tokenized sentence, return a feature vector fo each token.
Example :
> `extract_features` ([(" Ascorbic " ,0 ,7) , (" acid " ,9 ,12) , (" ," ,13 ,13) ,
(" aspirin " ,15 ,21) , (" ," ,22 ,22) , (" and " ,24 ,26) , (" the " ,28 ,30) ,
(" common " ,32 ,37) , (" cold " ,39 ,42) , ("." ,43 ,43) ])
[ [ " form = Ascorbic ", " suf4 = rbic ", " next = acid ", " prev = _BoS_ ", "
capitalized " ],

> Output -> [ " form = acid ", " suf4 = acid ", " next =,", " prev = Ascorbic " ],
[ " form =,", " suf4 =,", " next = aspirin ", " prev = acid ", " punct " ],
[ " form = aspirin ", " suf4 = irin ", " next =,", " prev =," ],
]

In [104]:
s = 'Ascorbic acid, aspirin, and the common cold'
t = "3-Methyl-Oxalacetate relation with severe anaphilactic reaction"
tks = tokenize(s) 

def has_numbers(word):
    return any(l.isdigit() for l in word)

In [99]:
def extract_features(tokenized_sentence):
    '''
    Input:
        s: A tokenized sentence (list of triples (word, offsetFrom, offsetTo) )
        
    Output: 
        A list of feature vectors, one per token.
        Features are binary and vectors are in sparse representeation (i.e. only active features are listed)
    '''
    
    result = []
    
    
    for i in range(0, len(tokenized_sentence)):
        t = tokenized_sentence[i][0]
        tokenFeatures = []
        
        tokenFeatures.append("form = " + t)
        tokenFeatures.append("formlower = " + t.lower())
        tokenFeatures.append("suf3 = " + t[-3:])
        tokenFeatures.append("suf4 = " + t[-4:])
        tokenFeatures.append("PoStag = " + pos_tag(t,tagset = 'universal')[0][1])
        
        if (t.istitle()): tokenFeatures.append("capitalized")
        if (t.isupper()): tokenFeatures.append("uppercase")
        if (t.isdigit()): tokenFeatures.append("digit")
        if (has_numbers(t)): tokenFeatures.append("containsNumber")
        if (t in stopwords): tokenFeatures.append("stopword")
        if (t in [".",",",";",":","?","!"]): tokenFeatures.append("punctuation") # necessary having PoS Tag?
            
        
        if i > 0: # offsetFrom > 0
            tPrev = tokenized_sentence[i-1][0]
            tokenFeatures.append("fromPrev = " + tPrev)
        else:
            tokenFeatures.append("BoS")
            
        if i < len(tokenized_sentence)-1:
            tNext = tokenized_sentence[i+1][0]
            tokenFeatures.append("formNext = " + tNext)
        else:
            tokenFeatures.append("EoS")
            
        
        result.append(tokenFeatures)
        
    return result

r = extract_features(tks)
print(r)

[['form = Ascorbic', 'formlower = ascorbic', 'suf3 = bic', 'suf4 = rbic', 'PoStag = DET', 'capitalized', 'BoS', 'formNext = acid'], ['form = acid', 'formlower = acid', 'suf3 = cid', 'suf4 = acid', 'PoStag = DET', 'fromPrev = Ascorbic', 'formNext = ,'], ['form = ,', 'formlower = ,', 'suf3 = ,', 'suf4 = ,', 'PoStag = .', 'punctuation', 'fromPrev = acid', 'formNext = aspirin'], ['form = aspirin', 'formlower = aspirin', 'suf3 = rin', 'suf4 = irin', 'PoStag = DET', 'fromPrev = ,', 'formNext = ,'], ['form = ,', 'formlower = ,', 'suf3 = ,', 'suf4 = ,', 'PoStag = .', 'punctuation', 'fromPrev = aspirin', 'formNext = 3-methyl'], ['form = 3-methyl', 'formlower = 3-methyl', 'suf3 = hyl', 'suf4 = thyl', 'PoStag = X', 'containsNumber', 'fromPrev = ,', 'formNext = ,'], ['form = ,', 'formlower = ,', 'suf3 = ,', 'suf4 = ,', 'PoStag = .', 'punctuation', 'fromPrev = 3-methyl', 'formNext = and'], ['form = and', 'formlower = and', 'suf3 = and', 'suf4 = and', 'PoStag = DET', 'stopword', 'fromPrev = ,', 'for

### Get tag
Given a token and a list of ground truth entities in a sentence, decide which is the B-I-O tag for the token.

**B-I-O Approach** = Mark each token as **B**egin of a sequence, **I**nside a sequence, or **O**utside any sequence.
> `get_tag` ((" Ascorbic " ,0 ,7) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> B-drug

> `get_tag` ((" acid " ,9 ,12) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> I-drug

> `get_tag` ((" common " ,32 ,37) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> 0

> `get_tag` ((" aspirin " ,15 ,21) , [(0 , 12, " drug ") , (15 , 21, " brand ") ]) --> B-brand

In [105]:
def get_tag(token, gold):
    '''
    Input:
        token: A token, i.e. one triple (word, offsetFrom, offsetTo)
        gold: A list of ground truth entities, i.e. a list of triples (offsetFrom, offsetTo, type)
        
    Output:
        The B-I-O ground truth tag for the given token ("B-drug", "I-drug", "B-group", "I-group", "O", ...)
    '''
    
    for (offsetFrom, offsetTo, Type) in gold:
        if token[1] == offsetFrom and token[2]<=offsetTo: return "B-"+Type # First letter of token equals 0 -> Beginning
        elif token[1] >= offsetFrom and token[2]<=offsetTo: return "I-"+Type # Word not in the beginning
        else: return "O"

In [110]:
print(get_tag((" acid ",9 ,12) , [(0 , 12, " drug ") , (15 , 21, " brand ")]))
print(get_tag((" Ascorbic " ,0 ,7) , [(0 , 12, " drug ") , (15 , 21, " brand ")]))
print(get_tag((" common " ,32 ,37) , [(0 , 12, " drug ") , (15 , 21, " brand ")]))

I- drug 
B- drug 
O


### Feature Extractor function

In [None]:
# process each file in directory
for f in listdir(datadir):
    
    # parse XML file, obtaining a DOM tree
    tree = parse(datadir + "/" + f)
    
    # process each sentence in the file
    sentences = tree.getElementsByTagName("sentence")
    for s in sentences:
        sid = s.attributes["id"].value # get sentence id
        stext = s.attributes["text"].value # get sentence text
        # load ground truth entities
        gold = []
        entities = s.getElementsByTagName("entity")
        for e in entities:
            # for discontinuous entities, we only get the first span
            offset = e.attributes["charOffset"].value      # 24-44
            (start, end) = offset.split(":")[0].split("-") # start:24, end:44
            gold.append((int(start), int(end), e.attributes["type"].value)) #[(24, 44, 'drug')] 
            
        # tokenize text
        tokens = tokenize(stext)
        
        # extract features for each word in the sentence
        features = extract_features(tokens)
        
        # print features in format suitable for the learner/classifier
        for i in range (0, len(tokens)):
            # see if the token is part of an entity, and which part (B/I)
            tag = get_tag(tokens[i], gold)
            print(sid, tokens[i][0], tokens[i][1], tokens[i][2], tag, "\t".join(features[i]), sep='\t')
            
        # black line to separate sentences
        print()

In [21]:
tree = parse('/Users/mponsclo/Downloads/labAHLT/data/train/1113260.xml')
sentences = tree.getElementsByTagName("sentence")
for s in sentences: 
    sid = s.attributes["id"].value
    stext = s.attributes["text"].value
    gold = []
    entities = s.getElementsByTagName("entity")
    for e in entities:
        offset = e.attributes["charOffset"].value
        #print(offset)
        (start, end) = offset.split(":")[0].split("-")
        #print(end)
        gold.append((int(start), int(end), e.attributes["type"].value))
        print(gold)

[(24, 44, 'drug')]
[(24, 44, 'drug'), (50, 57, 'drug')]
[(19, 39, 'drug')]
[(19, 39, 'drug'), (55, 62, 'drug')]
[(34, 47, 'drug')]
[(34, 47, 'drug'), (53, 60, 'drug')]
[(34, 47, 'drug'), (53, 60, 'drug'), (67, 75, 'drug')]
[(0, 8, 'drug')]
[(0, 8, 'drug'), (93, 100, 'drug')]
[(52, 59, 'drug')]
[(43, 51, 'drug')]
[(19, 32, 'drug')]
[(39, 47, 'drug')]
[(39, 47, 'drug'), (91, 98, 'drug')]


## 2. Learner

### Option 1: CRF
The learner needs only the right class and the features, so you'll need to remove the 4 extra fields _(sent\_id, token, span\_start, span\_end)_ added by the feature extractor, before feeding the vector to the learner. 

In [None]:
import pycrfsuite
pip install python-crfsuite

### Option 2: Maximum Entropy
`megam`does not expect the extra information in the features file, so:
- Remove the first 3 fields _(sent\_id, span\_start, span\_end)_ and the blank lines between spaces.
- You can modify the print statement in the feature extractor to directly produce two versions of the feature file, one with the extra information, and one without. 

### Option 3: Own choice
Adapt the feature file format to the needs of the selected algorithm. Train a classification model for the task of predicting BI-O tags for each token.

## 3. Classifier

### Option 1: CRF
Load the vectors produced by the feature extractor and feed them to the classifier.
The classifier needs only the features, so you'll need to remove the other extra fields _(sent\_id, token, span\_start, span\_end)_ added by the feature extractor, before feeding the vector to the classifier.

### Option 2. Maximum Entropy
Follow examples (and reuse code) for MaxEnt classifiers seen in class to get a B-I-O tag for each token in a sentence.

### Option 3: Your choice
Write the necessary code to call your choice classifier and get a B-I-O tag for each token in a sentence.

### *Default output for all options*
Given a list of tokens and the B-I-O tag for each token, produce a list of drugs in the format expected by the evaluator. 

> `output_entities` (" DDI - DrugBank . d553 .s0",
[(" Ascorbic " ,0 ,7) , (" acid " ,9 ,12) , (" ," ,13 ,13) ,
(" aspirin " ,15 ,21) , (" ," ,22 ,22) , (" and " ,24 ,26) ,
(" the " ,28 ,30) ,(" common " ,32 ,37) , (" cold " ,39 ,42) ],
["B- drug ", "I- drug ", "O", "B- brand ", "O", "O", "O",
"O", "O "])

DDI - DrugBank . d553 .s0 |0 -12| Ascorbic acid | drug

DDI - DrugBank . d553 .s0 |15 -21| aspirin | brand

In [None]:
def output_entities(sid, tokens, tags):
    '''
    Input:
        sid: sentence identifier (required by the evaluator output format)
        tokens: List of tokens in the sentence, i.e. list of tuples (word, offsetFrom, offsetTo)
        tags: List of B-I-O tags for each token
        
    Output:
        Prints to stdout the entities in the right format: one line per entity, fields separated by '|', 
        field order: id, offset, name, type.
    '''

## 4. Evaluating Results
- Repeat training: evaluation cycle on devel dataset to find out which is the best parametrization for the used algorithm.
- Repeat feature extraction: training-evaluation cycle on devel dataset to find out which features are useful.