# Machine Learning NER

In [None]:
# import libraries

## Feature Extractor
- Must be an independent program, separate from learner and classifier.
- Must get as argument the directory with the XML files to encode. 
- Must print the feature vectors to `stdout`

### Tokenize Text

### Extract features
Given a tokenized sentence, return a feature vector for each token.
Example :
> `extract_features`([("Ascorbic", 0, 7), ("acid", 9, 12), (",", 13, 13),
("aspirin", 15, 21), (",", 22, 22), ("and", 24, 26), ("the", 28, 30),
("common", 32, 37), ("cold", 39, 42), (".", 43, 43)])
[ [ "form=Ascorbic", "suf4=rbic", "next=acid", "prev=_BoS_", "capitalized" ],
[ "form=acid", "suf4=acid", "next=,", "prev=Ascorbic" ],
[ "form=,", "suf4=,", "next=aspirin", "prev=acid", "punct" ],
[ "form=aspirin", "suf4=irin", "next=,", "prev=," ],
...
]

In [None]:
def extract_features(s):
    '''
    Build a sparse binary feature vector for every token of a sentence.

    Input:
        s: A tokenized sentence (list of triples (word, offsetFrom, offsetTo) )

    Output:
        A list of feature vectors, one per token, in the same order as `s`.
        Features are binary and vectors are in sparse representation
        (i.e. only active features are listed). Each vector is a list of
        strings containing, in this order:
            "form=<word>"   the token itself
            "suf4=<xxxx>"   its last four characters (the whole token if shorter)
            "next=<word>"   the following token, or "_EoS_" for the last token
            "prev=<word>"   the preceding token, or "_BoS_" for the first token
            "capitalized"   only when the first character is uppercase
            "punct"         only when the token consists solely of ASCII punctuation
        An empty sentence yields an empty list.
    '''
    # Local import so the function does not depend on a separate import cell.
    import string

    punctuation = set(string.punctuation)
    words = [token[0] for token in s]
    last = len(words) - 1

    vectors = []
    for i, word in enumerate(words):
        feats = [
            "form=" + word,
            "suf4=" + word[-4:],
            # "_EoS_" mirrors the "_BoS_" marker used for the sentence start.
            "next=" + (words[i + 1] if i < last else "_EoS_"),
            "prev=" + (words[i - 1] if i > 0 else "_BoS_"),
        ]
        # word[:1] avoids an IndexError on an (unexpected) empty token.
        if word[:1].isupper():
            feats.append("capitalized")
        if word and all(ch in punctuation for ch in word):
            feats.append("punct")
        vectors.append(feats)
    return vectors

### Get tag
Given a token and a list of ground truth entities in a sentence, decide which is the B-I-O tag for the token.
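**B-I-O** approach: `B-<type>` marks the first token of an entity, `I-<type>` a token inside an entity, and `O` a token outside any entity.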
> `get_tag`(("Ascorbic", 0, 7), [(0, 12, "drug"), (15, 21, "brand")]) --> B-drug

> `get_tag`(("acid", 9, 12), [(0, 12, "drug"), (15, 21, "brand")]) --> I-drug

> `get_tag`(("common", 32, 37), [(0, 12, "drug"), (15, 21, "brand")]) --> O

> `get_tag`(("aspirin", 15, 21), [(0, 12, "drug"), (15, 21, "brand")]) --> B-brand

In [None]:
def get_tag(token, gold):
    '''
    Decide the B-I-O ground truth tag of a token.

    Input:
        token: A token, i.e. one triple (word, offsetFrom, offsetTo)
        gold: A list of ground truth entities, i.e. a list of triples (offsetFrom, offsetTo, type)

    Output:
        The B-I-O ground truth tag for the given token ("B-drug", "I-drug", "B-group", "I-group", "O", ...)
        - "B-<type>" if the token starts exactly where an entity starts and
          does not extend past the entity end,
        - "I-<type>" if the token lies fully inside an entity but does not
          start it,
        - "O" if the token is not covered by any entity (including when
          `gold` is empty).
        The first matching entity in `gold` wins.
    '''
    _, tok_start, tok_end = token
    for ent_start, ent_end, ent_type in gold:
        # A token reaching beyond the entity span is not part of it.
        if tok_end > ent_end:
            continue
        if tok_start == ent_start:
            return "B-" + ent_type
        if tok_start > ent_start:
            return "I-" + ent_type
    return "O"

### Feature Extractor function

In [None]:
# Feature extractor driver: for every XML file in `datadir`, print one
# tab-separated line per token (sentence id, word, start offset, end offset,
# B-I-O tag, features...) and an empty line after each sentence.
# `datadir`, `listdir`, `parse` and `tokenize` must be defined/imported
# before this cell runs.
for f in listdir(datadir):
    # parse XML file, obtaining a DOM tree
    tree = parse(datadir + "/" + f)
    # process each sentence in the file
    sentences = tree.getElementsByTagName("sentence")

    for s in sentences:
        sid = s.attributes["id"].value  # get sentence id
        stext = s.attributes["text"].value  # get sentence text

        # load ground truth entities as (start, end, type) triples
        gold = []
        for e in s.getElementsByTagName("entity"):
            # For discontinuous entities, we only get the first span.
            # NOTE(review): assumes DDI-corpus-style offsets, where spans are
            # separated by ";" (e.g. "0-8;20-25") - confirm against the data.
            offset = e.attributes["charOffset"].value
            (start, end) = offset.split(";")[0].split("-")
            gold.append((int(start), int(end), e.attributes["type"].value))

        # tokenize text
        tokens = tokenize(stext)

        # extract features for each word in the sentence
        features = extract_features(tokens)

        # print features in format suitable for the learner/classifier
        for token, token_features in zip(tokens, features):
            # see if the token is part of an entity, and which part (B/I)
            tag = get_tag(token, gold)
            print(sid, token[0], token[1], token[2], tag, "\t".join(token_features), sep='\t')

        # blank line to separate sentences
        print()

## B-I-O Approach 


Write a python program that parses all XML files in the folder given as argument and recognizes and classifies drug names. The program must use a sequence tagging machine learning algorithm.