In [3]:
# install required packages
! pip install textblob
! pip install fuzzywuzzy
! pip install python-Levenshtein

Collecting textblob
  Using cached textblob-0.15.3-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.3
Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Processing c:\users\mani vegupatti\appdata\local\pip\cache\wheels\f0\9b\13\49c281164c37be18343230d3cd0fca29efb23a493351db0009\python_levenshtein-0.12.0-cp37-cp37m-win_amd64.whl
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.0


In [4]:
# importing required packages and modules
import pandas as pd
from textblob import TextBlob
import numpy as np
import csv
from fuzzywuzzy import process
import pickle

In [5]:
# Load the list of knowledge-graph entities.
# header=None: the first line of the file is treated as data, so pandas
# assigns integer column labels.
df = pd.read_csv('resources_list_v1.csv', header=None)
df.head(n=3)

Unnamed: 0,0,1,2,3
0,1,http://bio2rdf.org/drugbank:DB04223,http://bio2rdf.org/drugbank_vocabulary:Drug,Nitroarginine
1,2,http://bio2rdf.org/drugbank:DB08129,http://bio2rdf.org/drugbank_vocabulary:Drug,(1R)-2-amino-1-[3-(trifluoromethyl)phenyl]ethanol
2,3,http://bio2rdf.org/drugbank:DB03491,http://bio2rdf.org/drugbank_vocabulary:Drug,2'-Deoxyguanosine-5'-Diphosphate


In [6]:
# Take the column at position 3 (the entity labels); these strings are the
# input for the n-gram inverted index.
data = df.iloc[:, 3]
data.head(n=3)

0                                        Nitroarginine
1    (1R)-2-amino-1-[3-(trifluoromethyl)phenyl]ethanol
2                     2'-Deoxyguanosine-5'-Diphosphate
Name: 3, dtype: object

In [7]:
# function to create inverted index from n-grams of entities
def create_inv_idx(data, max_len):
    """Build an inverted index from upper-cased n-grams to entity labels.

    For every label the word count is taken as ``number of spaces + 1``.

    * Labels with at least ``max_len`` words are expanded into all word
      n-grams of every length from the label's word count down to
      ``max_len``; each upper-cased n-gram becomes a key.
    * Shorter labels are indexed under their own upper-cased text.

    Parameters
    ----------
    data : iterable of str
        Entity labels (a list, or a pandas Series with any index).
    max_len : int
        Smallest n-gram length (in words) to generate.

    Returns
    -------
    dict
        Maps each upper-cased key to the list of original labels that
        produced it, in input order. A label is appended once per n-gram
        occurrence, so it can appear more than once under the same key.
    """
    index_dict = {}
    # Iterate over the values themselves instead of ``data[i]``: integer
    # subscripting of a pandas Series is a label lookup and raises KeyError
    # whenever the index is not exactly 0..n-1 (e.g. after filtering).
    for value in data:
        # entity length = word count = number of spaces + 1
        entity_len = value.count(' ') + 1
        if entity_len >= max_len:
            # Tokenise once per label; the blob does not depend on the
            # n-gram length.
            # NOTE(review): TextBlob applies its own tokenisation, which may
            # not agree with the space-based word count above - TODO confirm
            # that no label ends up without any n-gram because of this.
            blob = TextBlob(value)
            # longest n-grams first, down to max_len (inclusive)
            for ngram_len in range(entity_len, max_len - 1, -1):
                for grams in blob.ngrams(ngram_len):
                    key = ' '.join(grams).upper()
                    index_dict.setdefault(key, []).append(value)
        else:
            # label is shorter than max_len words: index it as a whole
            index_dict.setdefault(value.upper(), []).append(value)

    return index_dict

In [8]:
# Smallest n-gram length (in words) passed to create_inv_idx as max_len.
n_gram_split = 3
final_inv_idx = create_inv_idx(data, max_len=n_gram_split)

# Serialise the finished inverted index to disk using the highest pickle
# protocol available.
with open('inverted_index_v2.pickle', mode='wb') as pickle_file:
    pickle.dump(final_inv_idx, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Reload the inverted index from its pickle file.
with open('inverted_index_v2.pickle', mode='rb') as pickle_file:
    inv_idx_pickle = pickle.load(pickle_file)

# Round-trip check: when the index was also built in this session, confirm
# that the loaded inverted index equals the freshly created one.
# (At cell/module level locals() and globals() are the same namespace, so a
# single lookup is sufficient.)
if 'final_inv_idx' in globals():
    print(final_inv_idx == inv_idx_pickle)

True


In [14]:
# helper shared by both lookup stages of top_entity
def _best_fuzzy_match(query, choices, limit=10):
    """Return the best fuzzy match for ``query`` among ``choices``.

    The ``limit`` highest-scoring candidates are fetched with
    ``process.extract``. Among them the winner is the candidate with the
    highest score; ties are broken by the larger word count (spaces + 1),
    and remaining ties by the earliest position in the extract result.

    Raises ValueError if no candidate is returned (e.g. empty ``choices``).
    """
    scored = process.extract(query, choices, limit=limit)
    # max() keeps the first of several equally-ranked items, which matches
    # "filter by max score, filter by max length, take index 0".
    best = max(scored, key=lambda match: (match[1], match[0].count(' ') + 1))
    return best[0]


# function to retrieve the top entity of the predicted entity string
def top_entity(pred_entity_string):
    """Link a predicted entity string to one entity label of the index.

    Two-stage lookup:

    1. Fuzzy-match ``pred_entity_string`` against the inverted-index keys
       and pick the best key.
    2. Fuzzy-match ``pred_entity_string`` against the entity labels stored
       under that key and pick the best label.

    Both stages use the same ranking: best score among the top 10
    candidates, then most words, then first in the result list.

    Relies on the notebook globals ``keys`` (list of inverted-index keys)
    and ``inv_idx_pickle`` (the inverted index); both must be defined before
    the first call.

    Parameters
    ----------
    pred_entity_string : str
        Entity text to look up.

    Returns
    -------
    The best-matching label taken from ``inv_idx_pickle``.
    """
    # stage 1: best inverted-index key for the predicted string
    top_key = _best_fuzzy_match(pred_entity_string, keys)
    entities_list = inv_idx_pickle[top_key]

    # stage 2: best entity among the labels filed under that key
    return _best_fuzzy_match(pred_entity_string, entities_list)

In [15]:
# Materialise the inverted-index keys as a list for fuzzy matching.
keys = list(inv_idx_pickle)

In [16]:
# Smoke test: look up an entity from an upper-cased substring of its label.
entity = top_entity(pred_entity_string='BOTULINUM TOXIN TYPE')
entity

'Botulinum Toxin Type B'

In [17]:
# Load the predicted entities produced by the NER task.
# NOTE: this rebinds `df`, which held the entity list earlier in the notebook.
df = pd.read_excel('test_qa_data_v00.xlsx')
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,SlNo,Question,Relation,Pred_Relation,Relation_result,NER_Tag,Q_Len,T_Len,Subject,Pred_Subject,Subject_result,Pair_Result,IR_Pred_Subject,IR_Subject_Result,IR_Pair_Result,Subject_URI,Relation_URI
0,297,298,Nitroglycerin is patented under which number,patent,patent,Ok,B-E O O O O O,6,6,Nitroglycerin,Nitroglycerin,Ok,Ok,Nitroglycerin,Ok,Ok,http://bio2rdf.org/drugbank:DB00727,http://bio2rdf.org/drugbank_vocabulary:patent
1,470,471,which companies manufacture Phenmetrazine,manufacturer,manufacturer,Ok,O O O B-E,4,4,Phenmetrazine,Phenmetrazine,Ok,Ok,Phenmetrazine,Ok,Ok,http://bio2rdf.org/drugbank:DB00830,http://bio2rdf.org/drugbank_vocabulary:manufac...
2,536,537,list all synonyms of Nepafenac,synonym,synonym,Ok,O O O O B-E,5,5,Nepafenac,Nepafenac,Ok,Ok,Nepafenac,Ok,Ok,http://bio2rdf.org/drugbank:DB06802,http://bio2rdf.org/drugbank_vocabulary:synonym


In [18]:
# Link each predicted entity string to an actual entity using fuzzy search.
# A comprehension keeps the loop variables out of the notebook namespace, so
# the `entity` value displayed by the earlier smoke-test cell is not silently
# overwritten.
pred_entities_list = [top_entity(pred_subj) for pred_subj in df['Pred_Subject']]

In [19]:
# print the entities linked using fuzzy-search 
print(pred_entities_list)

['Nitroglycerin', 'Phenmetrazine', 'Nepafenac', 'Hydrochlorothiazide', 'Zafirlukast', 'Theophylline', 'Aldehyde oxidase', 'Vitamin D3 receptor', 'Glutethimide', 'L-Cysteine', 'Flurandrenolide', 'Dipivefrin', 'Flavohemoprotein', 'Preotact', 'MGCD-0103', 'Cytochrome P450 2B7 isoform', 'Deferasirox', 'Aspartate Semialdehyde', 'Ibuprofen', 'Transthyretin', 'Cefmetazole', 'Glycine', 'Hydromorphone', 'Fluticasone Propionate', 'Ketoconazole', 'Phenylephrine', 'Estriol', 'Acetyl-CoA carboxylase 2', 'Fusidic Acid', 'Simvastatin', 'Peginterferon alfa-2b', 'Loratadine', 'NBI-6024', 'Cephalexin', 'AZD6140', 'Vitamin D-binding protein', 'Warfarin', 'Pegaptanib', 'Human Serum Albumin', 'MEM 1414', 'Delorazepam', 'Menthol', 'Cytochrome P450 2B7 isoform', 'Chlorphenamine', 'Diclofenac', 'Venlafaxine', 'Hydrolase', 'Gold bond medicated lotion', 'Beta-lactamase TEM', 'Isradipine', '33 kDa chaperonin', 'Ammonia channel', 'Moxifloxacin', 'Aldehyde oxidase', 'Ziprasidone', 'Cytochrome P450 2B7 isoform', 'T

In [20]:
# Write all linked entities as one fully quoted CSV row; the file is used to
# calculate pair accuracy.
with open('ir_p_entities_final_v00.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file, quoting=csv.QUOTE_ALL).writerow(pred_entities_list)