## NERC Project for Recognizing and Classifying Drugs

In [248]:
import sys
import string
from os import listdir
from xml.dom.minidom import parse
import matplotlib.pyplot as plt

### Tokenize Sentence

In [2]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/mponsclo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mponsclo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sentence examples: short English sentences used by the cells below to try
# out tokenization and offset lookup (sent_1 and sent_2 are tokenized in the
# next cell; sent_3 is not referenced by any cell visible in this section).
sent_1 = "Activation of an effector immediate-early gene arc by methamphetamine"
sent_2 = "In situations in which concurrent therapy is necessary, careful patient monitoring is essential."
sent_3 = "Phenothiazines and butyrophenones may reduce or reverse the pressor effect of epinephrine."

In [8]:
# Split the first two example sentences into word tokens with NLTK
tokenized_sent_1, tokenized_sent_2 = (
    word_tokenize(sentence) for sentence in (sent_1, sent_2)
)
print(tokenized_sent_1)

['Activation', 'of', 'an', 'effector', 'immediate-early', 'gene', 'arc', 'by', 'methamphetamine']


In [9]:
# str.find() gives the index where a token starts (offsetFrom); adding the
# token length minus one gives the index of its last character (offsetTo)
print("OffsetFrom:", sent_1.find(tokenized_sent_1[0]))                              # start
print("OffsetTo:", sent_1.find(tokenized_sent_1[0]) + len(tokenized_sent_1[0]) - 1)  # end
print()
print("OffsetFrom:", sent_2.find(tokenized_sent_2[1]))                              # start
print("OffsetTo:", sent_2.find(tokenized_sent_2[1]) + len(tokenized_sent_2[1]) - 1)  # end

# Goal: turn these results into the desired output, a list of tuples (word, offsetFrom, offsetTo)

OffsetFrom: 0
OffsetTo: 9

OffsetFrom: 3
OffsetTo: 12


In [10]:
# Drop stopwords and non-alphabetic tokens (punctuation, hyphenated words) to reduce workload
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

filtered_sent = []

for w in tokenized_sent_1:
    # skip anything that is a stopword or is not purely alphabetic
    if w in stop_words or not w.isalpha():
        continue
    filtered_sent.append(w)

print("Tokenized Sentence:", tokenized_sent_1)
print("Filterd Sentence:", filtered_sent)

Tokenized Sentence: ['Activation', 'of', 'an', 'effector', 'immediate-early', 'gene', 'arc', 'by', 'methamphetamine']
Filterd Sentence: ['Activation', 'effector', 'gene', 'arc', 'methamphetamine']


In [None]:
# Prototype of the offset lookup: find every token of sent_1, resuming each
# search where the previous token ended so that a repeated word is matched at
# its own position instead of at its first occurrence.
offset = 0
tokens = []
for t in tokenized_sent_1:
    offset = sent_1.find(t, offset)                    # start of this token
    tokens.append((t, offset, offset + len(t) - 1))    # (word, offsetFrom, offsetTo)
    offset += len(t)                                   # continue after the token
tokens

In [12]:
def tokenize(s):
    '''
    Split a sentence into tokens and attach each token's character offsets.

    Calls nltk's word_tokenize on the sentence and then locates every token in
    the original text, scanning left to right so that a word occurring several
    times gets the offsets of its own occurrence rather than of the first one.

    Input - s: string containing the text for one sentence
    Output - Returns a list of tuples (word, offsetFrom, offsetTo). Both offsets
             are 0-based character positions in s; offsetTo is inclusive (the
             index of the token's last character).
    '''

    token_list = []
    cursor = 0  # position in s from which the next token is searched

    for t in word_tokenize(s):
        offsetFrom = s.find(t, cursor)
        length = len(t)

        if offsetFrom == -1 and t in ("``", "''"):
            # word_tokenize rewrites a double quote as `` or '', so the token
            # text is not in s; align it with the original quote character.
            offsetFrom = s.find('"', cursor)
            length = 1

        if offsetFrom == -1:
            # The tokenizer altered the text in some other way; the token
            # cannot be aligned with s, so no offsets can be reported for it.
            continue

        # Advance past this token before filtering, so skipped tokens still
        # move the search position forward.
        cursor = offsetFrom + length

        # Same filter as before: a token is dropped only when it is BOTH a
        # stopword and non-alphabetic, so plain stopwords such as 'of' are kept.
        if (t in stop_words) and (not t.isalpha()):
            continue

        token_list.append((t, offsetFrom, offsetFrom + length - 1))

    return token_list

In [13]:
# "Activation of an effector immediate-early gene arc by methamphetamine"
tokenize(sent_1)

[('Activation', 0, 9),
 ('of', 11, 12),
 ('an', 14, 15),
 ('effector', 17, 24),
 ('immediate-early', 26, 40),
 ('gene', 42, 45),
 ('arc', 47, 49),
 ('by', 51, 52),
 ('methamphetamine', 54, 68)]

### Classify Token

Examine (by hand or collecting simple statistics) the train dataset and try to infer general rules that are right in most cases, even if they seldom apply (high precision, low recall).

In [222]:
import xml.etree.ElementTree as ET
import pandas as pd

# NOTE(review): absolute, machine-specific path; point it at the local copy of the training set.
datadir = "/Users/mponsclo/Documents/DataScience/ALHT_Project/train"

records = []        # one {'Name', 'Type'} dict per <entity> element found
skipped_files = []  # files that could not be read or parsed as XML

for f in listdir(datadir):
    filename = datadir + "/" + f
    try:
        root = ET.parse(filename).getroot()
    except (ET.ParseError, OSError):
        # Some files in the folder are not valid XML; remember them instead of
        # hiding every possible error behind a bare except.
        skipped_files.append(f)
        continue

    for elem in root:
        for subelem in elem.findall('entity'):
            records.append({'Name': subelem.get('text'), 'Type': subelem.get('type')})

# Build the frame once from all collected rows: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
df = pd.DataFrame(records, columns=['Name', 'Type'])

In [267]:
# Analysis of Types: how often each entity name occurs within every annotated type
# (to save the raw table: df.to_csv("/Users/mponsclo/Desktop/df.csv", index=False))

# One sub-frame per entity type; 'drug' and 'drug_n' are analysed together
group = df.loc[df["Type"] == "group"]
brand = df.loc[df["Type"] == "brand"]
drug = df.loc[df["Type"].isin(["drug", "drug_n"])]

# Occurrences of each name, most frequent first
group_counts = group['Name'].value_counts().to_frame()
brand_counts = brand['Name'].value_counts().to_frame()
drug_counts = drug['Name'].value_counts().to_frame('Count')

# Other views that can be displayed instead: group_counts, brand_counts
# Optional bar chart: ax = drug_counts[(drug_counts["Count"] > 1)].plot.bar()
drug_counts.head()

Unnamed: 0,Count
warfarin,167
digoxin,145
phenytoin,141
theophylline,99
lithium,94
ketoconazole,91
alcohol,90
cimetidine,70
cyclosporine,70
carbamazepine,69


In [None]:
def classify_token(word):
    '''
    Apply simple hand-written rules to decide whether a token is a drug entity.

    Input - word: string with the text of one token
    Output - Returns a tuple (is_entity, type):
             (True, "brand") when the token is written entirely in upper case,
             (True, "drug")  when it ends with one of the listed drug suffixes,
             (False, "")     otherwise.
    '''
    # Suffixes are compared without surrounding whitespace: a 6-character
    # literal such as 'azole ' can never equal a 5-character slice of the word.
    drug_suffixes = ('azole', 'idine', 'amine', 'mycin')

    if word.isupper():
        return True, "brand"
    elif word.endswith(drug_suffixes):
        return True, "drug"
    else:
        return False, ""

### Entity Extractor

In [None]:
def extract_entities(s):
    ''' Given a tokenized sentence, identify which tokens are drugs.

    The rules mirror the token-classification sketch of this notebook: a token
    written entirely in upper case is reported as a "brand", and a token ending
    in one of a few common drug suffixes is reported as a "drug".
    Only single tokens are handled; groups of consecutive tokens are not merged yet.

    Input - s: A tokenized sentence (list of triples (word, offsetFrom, offsetTo))
    Output - A list of entities. Each entity is a dictionary with the keys
             'name' (the token text), 'offset' (the string "offsetFrom-offsetTo")
             and 'type'. The list is empty when no token matches a rule.
    '''
    drug_suffixes = ('azole', 'idine', 'amine', 'mycin')
    entities = []

    for word, offset_from, offset_to in s:
        if word.isupper():
            entity_type = "brand"
        elif word.endswith(drug_suffixes):
            entity_type = "drug"
        else:
            continue  # no rule applies: not reported as an entity

        entities.append({
            "name": word,
            # kept as a string so it can be concatenated into the output line
            "offset": str(offset_from) + "-" + str(offset_to),
            "type": entity_type,
        })

    return entities



### Main Function

In [None]:
def main(datadir, outfile):
    '''Extract the entities of every sentence in a directory of XML files,
       write them to a file and evaluate the result.

       datadir - directory with XML files
       outfile - name for the outputfile

       One line per entity is written to outfile, in the format
       sentence_id|offset|text|type.'''

    # The output file is opened here (and closed on exit of the with block) so
    # that the evaluation below reads a complete file.
    with open(outfile, "w") as outf:
        # process each file in directory
        for f in listdir(datadir):
            # skip anything that is not an XML document (it would fail to parse)
            if not f.endswith(".xml"):
                continue
            # parse XML file, obtaining a DOM tree
            tree = parse(datadir + "/" + f)
            # process each sentence in the file
            sentences = tree.getElementsByTagName("sentence")
            for s in sentences:
                sid = s.attributes["id"].value        # get sentence id
                stext = s.attributes["text"].value    # get sentence text
                # tokenize text
                tokens = tokenize(stext)
                # extract entities from tokenized sentence text
                # (tolerate an extractor that returns None instead of a list)
                entities = extract_entities(tokens) or []

                # print sentence entities in format requested for evaluation
                for e in entities:
                    # extract_entities documents the key 'name'; 'text' is
                    # accepted too for entities built with the older key
                    name = e["name"] if "name" in e else e["text"]
                    print(sid + "|" + e["offset"] + "|" + name + "|" + e["type"], file=outf)

    # print performance score once, after all files have been processed
    # NOTE(review): `evaluator` is not imported in the visible part of the
    # notebook; it must be made available before calling main.
    evaluator.evaluate("NER", datadir, outfile)

### Others

In [224]:
# Sanity check: walk the training folder and build the full path of every file in it
datadir = "/Users/mponsclo/Documents/DataScience/ALHT_Project/train"
for f in listdir(datadir):
    filename = "/".join((datadir, f))
    # print(filename)

In [211]:
# Construction of the dataframe (prototype on a single file): every <entity>
# element found under the root's children contributes one (Name, Type) row to df.
tree = ET.parse("/Users/mponsclo/Documents/DataScience/ALHT_Project/train/Estramustine_ddi.xml")
root = tree.getroot()

# if we know the name of the attribute, access it directly with .get()
records = [
    {'Name': subelem.get('text'), 'Type': subelem.get('type')}
    for elem in root
    for subelem in elem.findall('entity')
]

# DataFrame.append was removed in pandas 2.0: build the new rows once and
# concatenate them to the existing frame in a single step.
df = pd.concat([df, pd.DataFrame(records, columns=['Name', 'Type'])], ignore_index=True)

Unnamed: 0,Name,Type
0,entecavir,drug
0,BARACLUDE,brand
0,entecavir,drug
0,entecavir,drug
0,lamivudine,drug
...,...,...
0,antidiabetic agents,group
0,VELCADE,brand
0,antidiabetic medication,group
0,calcium,drug
