In [1]:
# NOTE: in terminal ulimit -n 4096 and then open this notebook 

# NER

In [1]:
# imports 
from pathlib import Path
from quickumls import QuickUMLS
import os 

In [2]:
def reading_files (path_file):
    """
    param: path to file
    returns the text in the file as a string 
    """
    text_list = []
    with open(path_file, encoding='ISO-8859-1') as infile:
            for row in infile:
                row = row.strip('\n')
                row = row.strip("\\")
                text_list.append(row)
                joined_strings = ','.join(text_list)
            
    return joined_strings

In [3]:
def map_strings_to_qumls (path_file, qumls_file): 
    """
    param: path to file
    param2: path to directory where qumls files are stored 
    returns of a file all entities which are mapped by quickumls to the KG
    """
    joined_strings = reading_files (path_file)
    matcher = QuickUMLS(qumls_file)
    maps = matcher.match(joined_strings, best_match=True, ignore_syntax=False)
    
    return maps

In [4]:
def itterating_sentences (file):
    """
    param: path to file 
    returns list of sentences
    """
    columns = []
    sentences = []
    with open(file, encoding='ISO-8859-1') as csv_file:
        for row in csv_file:
            row = row.strip('\n') 
            column = row.split('\t')
            columns.append(column)
    for x in columns[1:]: # leaving out headers
        sentences.append(x[0])
    return sentences

In [5]:
def check_if_maps_are_IC_eng(path_file, condition, qumls_file_eng):
    """
    param: path to file
    param2: list of strings with HIV indicors
    param3: path to directory where english qumls files are stored
    returns list of dictionaries with key position of word in file and value file name, the concept and similarity score
    """
    maps_eng = map_strings_to_qumls (path_file, qumls_file_eng)
    dict_list = []
    for qumls_list in maps_eng:
        for qumls_dict in qumls_list:
            new_dict = {}
            sim_score = qumls_dict["similarity"]
            good_term = qumls_dict["term"]
            position = qumls_dict["end"]
            for ent in condition:
                if good_term.lower() == ent.lower():
                    new_dict[position] = os.path.basename(path_file), good_term, sim_score
                    dict_list.append(new_dict) 
    return dict_list


In [25]:
import re
def matching_strings (search_list, text): 
    """
    param1: list with regular expressions
    param2: where regular expressions should be found
    returns list with items which are found
    
    """
    all_list = []
    for item in search_list:
        try:
            if re.findall(item, text, re.IGNORECASE) == None:
                continue
            if re.findall(item, text, re.IGNORECASE) == []:
                continue
            else:
                found = re.findall(item, text, re.IGNORECASE)
                all_list.append(found)
              
        except:
            continue
    return all_list

In [26]:

def check_if_maps_are_IC (path_file, condition, combined = "yes"):
    """
    param: path to file
    param2: keyargument are the IC's, if you would like to check e.g. medications, you could change this condition argument. NOTE: this should be of the format: list of list of strings, where each list of strings is one entity
    param3: keyargument is combination of Dutch and English UMLS, "nl" means only Dutch, "en" means only English data base
    returns a set if the file contains an IC and therefore the patient should be recommended testing
    """
    maps = map_strings_to_qumls(path_file, qumls_file)

    dict_list = []
    if combined =="yes":
            for qumls_list in maps:
                for qumls_dict in qumls_list:
                    new_dict = {}
                    sim_score = qumls_dict["similarity"]
                    good_term = qumls_dict["term"]
                    position = qumls_dict["end"]
                    for ent in condition:
                        if good_term.lower() == ent.lower():
                            new_dict[position] = os.path.basename(path_file), good_term, sim_score
                            dict_list.append(new_dict)


            for dicto in  check_if_maps_are_IC_eng(path_file, condition, qumls_file_eng):
                dict_list.append(dicto) if  dicto  not in dict_list else dict_list
    return dict_list 

In [27]:

def check_if_maps_are_IC (path_file, condition, combined = "yes"):
    """
    param: path to file
    param2: keyargument are the IC's, if you would like to check e.g. medications, you could change this condition argument. NOTE: this should be of the format: list of list of strings, where each list of strings is one entity
    param3: keyargument is combination of Dutch and English UMLS, "nl" means only Dutch, "en" means only English data base
    returns a set if the file contains an IC and therefore the patient should be recommended testing
    """
    maps = map_strings_to_qumls(path_file, qumls_file)

    dict_list = []
    if combined == "yes":
        for qumls_list in maps:
            for qumls_dict in qumls_list:
                new_dict = {}
                sim_score = qumls_dict["similarity"]
                good_term = qumls_dict["term"]
                position = qumls_dict["end"]
                for ent in condition:
                    if good_term.lower() == ent.lower():
                        new_dict[position] = os.path.basename(path_file), good_term, sim_score
                        dict_list.append(new_dict)


        for dicto in  check_if_maps_are_IC_eng(path_file, condition, qumls_file_eng):
            dict_list.append(dicto) if  dicto  not in dict_list else dict_list
        
        item = reg_ex_list     
        with open (path_file, encoding='ISO-8859-1') as infile:
            text = infile.read()
            x = matching_strings (item, text)
            for listo in x:
                new_dict={}
                new_dict['reg ex'] = os.path.basename(path_file), listo[0], 1
                dict_list.append(new_dict)
        
    if combined =="nlen":
        for qumls_list in maps:
            for qumls_dict in qumls_list:
                new_dict = {}
                sim_score = qumls_dict["similarity"]
                good_term = qumls_dict["term"]
                position = qumls_dict["end"]
                for ent in condition:
                    if good_term.lower() == ent.lower():
                        new_dict[position] = os.path.basename(path_file), good_term, sim_score
                        dict_list.append(new_dict)


        for dicto in  check_if_maps_are_IC_eng(path_file, condition, qumls_file_eng):
            dict_list.append(dicto) if  dicto  not in dict_list else dict_list
        
     
    if combined == "nl":
        for qumls_list in maps:
            for qumls_dict in qumls_list:
                new_dict = {}
                sim_score = qumls_dict["similarity"]
                good_term = qumls_dict["term"]
                position = qumls_dict["end"]
                for ent in condition:
                    if good_term.lower() == ent.lower():
                        new_dict[position] = os.path.basename(path_file), good_term, sim_score
                        dict_list.append(new_dict)
                        
    if combined == "en":
        for qumls_list in map_strings_to_qumls (path_file, qumls_file_eng):
            for qumls_dict in qumls_list:
                new_dict = {}
                sim_score = qumls_dict["similarity"]
                good_term = qumls_dict["term"]
                position = qumls_dict["end"]
                for ent in condition:
                    if good_term.lower() == ent.lower():
                        new_dict[position] = os.path.basename(path_file), good_term, sim_score
                        dict_list.append(new_dict)
        
    return dict_list
    



In [28]:
def itterating_folder (path_to_directory):
    """
    param: path to directory where the files are stored
    returns list with all the file names in a directory as strings
    """
    pathlist = []
    for path in Path(path_to_directory).glob("*"):
         path_in_str = str(path)
         pathlist.append(path_in_str)
    return pathlist

In [29]:
def check_whole_directory_qulms_ic(path_to_directory, combined="yes"): 
    """
    param: path to directory where the files are stored
    returns a list of which files contain an IC and therefore the patient should be recommended testing
    """
    recommendation_list = []
    for path_file in itterating_folder (path_to_directory):
        recommendation = check_if_maps_are_IC (path_file, condition, combined)
        recommendation_list.append(recommendation)
        recommendation_list2 = [x for x in recommendation_list if x != set()]
    return recommendation_list2
    
    

In [30]:
from collections import defaultdict

def find_highest_sim_score(path_to_directory, combined="yes"):
    """
    param: path to directory where the files are stored
    returns list with dictionaries key: position of term in text and value: number, concept, similarity score with term
    """
    merge_same_key_dict = defaultdict(list)
    for one_list in check_whole_directory_qulms_ic(path_to_directory, combined="yes"):
        for one_dict in one_list: 
            for key, value in one_dict.items():
                merge_same_key_dict[key].append(value)


    dict_list = []  
    for key, value in merge_same_key_dict.items():
        dicta = {}
        if len(value) == 1:
            dicta[key] = value  
            dict_list.append(dicta)
        if len(value) >1:
            listo = []
            for tup in value:
                listo.append(tup[2])
                highest = max(listo)
                if highest == tup[2]:
                    new = (tup[0], tup[1], highest)
                    dicta[key] = [new]
                    dict_list.append(dicta)
    return dict_list

In [31]:
import pandas as pd 
def create_xls_file(path_to_directory, combined="yes"):
    """
    param: path to directory 
    param2: keyword argument if the versions should be combined, "yes" means combined (Dutch, English, Regex), "nlen" means without regex, "nl" is only Dutch and "en" is only English 
    returns dataframe with name of note, concept and similarity score 
    """
    list_of_dicts = find_highest_sim_score(path_to_directory, combined="yes")
    tup_list = []
    for dicto in list_of_dicts:
        for key, value in dicto.items():
            for tup in value:
                tup_list.append(tup)
    df = pd.DataFrame(tup_list, columns =['Note', 'Concept', 'Sim Score']) 
    return df
  

    

# Evaluation

In [32]:
def evaluation_scores(df1, df2) : 
    """
    param1: pandas dataframe with name of note, concept and similarity score 
    param2: pandas dataframe with name of note, concept and similarity score  which should be evaluated against df1
    returns list with dicionaries with counters of true positives, false negatives and false positives 
    
    """
    
    x = df1["Note"].unique()
    x = list(x)
   
    eval_dict_list = []
   
    for el in x:
        TP_counter = 0 
        FN_counter = 0
        eval_dict = {}
        note = df1.loc[df1.Note == el]["Concept"]
        note_man = df2.loc[df2.Note == el]["Concept"]
        note = list(note)
        note_man=list(note_man)
        new_list = []
        new_list2 = []
        dict_list = []
        for el in note_man:
            new_list.append(el.lower())
        for el in note:
            new_list2.append(el.lower())

        for el in new_list:
            if el in new_list2:
                TP_counter += 1
                new_list2.remove(el)
            else:
                FN_counter +=1
        eval_dict["TP"] = TP_counter
        eval_dict["FN"] = FN_counter
        eval_dict["FP"] = len(new_list2)
        eval_dict_list.append(eval_dict)
        
    return  eval_dict_list

In [33]:
from collections import defaultdict

def summing_eval_values(df1, df2):
    """
    param1: pandas dataframe with name of note, concept and similarity score 
    param2: pandas dataframe with name of note, concept and similarity score  which should be evaluated against df1
    returs list with combined dictonaries 
    """
    score_list = []
    evaluation_scores(df1, df2) 

    dd = defaultdict(list)

    for d in (evaluation_scores(df1, df2)):
        for key, value in d.items():
            dd[key].append(value)
    for key, value in dd.items():
          score_list.append(key + " " + str(sum(value)))
    return score_list


In [34]:
def obtaining_recall_precision_f_score(df1, df2): 
    """
    param1: pandas dataframe with name of note, concept and similarity score 
    param2: pandas dataframe with name of note, concept and similarity score  which should be evaluated against df1
    returns dictionary with evaluation scores 
    """
    
    eval_dict = {}

    TP = summing_eval_values(df1, df2)[0]
    integer_TP =int(TP[2:])
    FN = summing_eval_values(df1, df2)[1]
    integer_FN =int(FN[2:])
    FP = summing_eval_values(df1, df2)[2]
    integer_FP =int(FP[2:])
    
    precision = ((integer_TP)/(integer_TP+integer_FP))
    recall = ((integer_TP)/(integer_TP+integer_FN))
    f_score = 2*((precision*recall)/(precision+recall))
    eval_dict["recall"] = recall
    eval_dict["precision"] = precision
    eval_dict["f_score"] = f_score
    
    return eval_dict
    

In [35]:
# Fill in paths: 
from Creating_rex_ex_list import creating_reg_ex_list

# path to Dutch qumls files
qumls_file ='/Users/lumei/Downloads/dut' 
# path to English qumls files
qumls_file_eng='/Users/lumei/Downloads/eng'
#Path to test notes (text files)
path_to_directory = "/Users/lumei/Desktop/Annotated_Notes"
#Path to file where conditions are stored
condition_file = "/Users/lumei/Documents/Stage/IC_list.csv"
condition = itterating_sentences(condition_file)
reg_ex_list= creating_reg_ex_list(condition_file, qumls_file, qumls_file_eng)
df1 = create_xls_file(path_to_directory)
#Path to manual annotations file 
path_to_manual_annotations = "/Users/lumei/Desktop/Mannual Annotations.csv"
df2 =  pd.read_excel(path_to_manual_annotations) 

obtaining_recall_precision_f_score(df1, df2)

{'recall': 0.7894736842105263,
 'precision': 0.8426966292134831,
 'f_score': 0.8152173913043478}