Test case

In [1]:
import os

# Location of the sample input file.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable (e.g. relative to the notebook) so the cell runs elsewhere.
working_dir = r"C:\Users\Luzia T\job_applications\Senior_research_fellow_IHI_UCL"
filename = "example.txt"

# Read the complete file content into one string.
input_path = os.path.join(working_dir, filename)
with open(input_path, 'r') as handle:
    txt = handle.read()

# Show the type of the loaded object, followed by the raw text itself.
print(type(txt))
print(txt)

<class 'str'>
Aspirin 75mg daily
PANADOL 1000mg WHEN NEEDED MAX FOUR TIMES DAILY
Atorvastatin 20mg daily
Penicillin 500mg daily
Paracetamol

History:
Feeling well up until yesterday, then he suddenly developed a severe headache.



In [2]:
import re

def my_meds_reader(s):
    """
    Extract known medications and their doses from the first paragraph of a text.

    Only the text before the first empty (or whitespace-only) line is
    examined. Each line is lower-cased and split on its first run of
    whitespace: the first token is looked up in a fixed table of drug names
    and the remainder of the line is taken as the dose. Lines whose first
    token is not a known drug name are ignored.

    Parameters
    ----------
    s:  str
        input string

    Returns
    -------
    meds_dose_list: list
        list of lists containing medication code (int) and dose (str)
        (e.g. [[1,'75mg daily'],[3,'100mg every 2 hours']]).
        The dose is an empty string when the line holds only a drug name.
    """
    # names of drugs to search in text, mapped to their medication code
    meds_to_search = {'aspirin': 2, 'paracetamol': 1, 'panadol': 1, 'penicillin': 3}

    # Keep only the content above the first empty / whitespace-only line.
    # A raw string is required so the regex escape is not treated as an
    # (invalid) Python string escape.
    medication_block = re.split(r"\n\s*\n", s, maxsplit=1)[0]

    meds_dose_list = []
    # splitlines() also copes with Windows line endings, which would
    # otherwise leave a carriage return glued to the last token of a line.
    for line in medication_block.lower().splitlines():
        # Split on the first run of any whitespace so that tabs, repeated
        # spaces and indented lines are handled like a single space.
        parts = line.split(None, 1)
        if not parts or parts[0] not in meds_to_search:
            continue
        dose = parts[1].strip() if len(parts) > 1 else ""
        meds_dose_list.append([meds_to_search[parts[0]], dose])

    return meds_dose_list

In [5]:
# Run the basic reader on the loaded text and show the resulting
# [medication code, dose] pairs.
meds_list = my_meds_reader(txt)
print(meds_list)

[[2, '75mg daily'], [1, '1000mg when needed max four times daily'], [3, '500mg daily'], [1, '']]


Accepting additional kwargs

In [6]:
import re

def my_meds_reader_w_keys(s,**kwargs):
    """
    Extract medications and doses from the first paragraph of a text,
    optionally extending the table of known drug names.

    Only the text before the first empty (or whitespace-only) line is
    examined. Each line is lower-cased and split on its first run of
    whitespace: the first token is looked up in the drug table and the
    remainder of the line is taken as the dose. Lines whose first token is
    not a known drug name are ignored.

    Parameters
    ----------
    s:  str
        input string

    optional kwargs:
        additional drug names (str) and their codes (int). Two forms are
        accepted and may be mixed:
        * a mapping (or iterable of pairs) under any keyword, e.g.
          ``addmeds={'atorvastatin': 5}``
        * a direct keyword/value pair, e.g. ``atorvastatin=5``
        Drug names are lower-cased before use because the text is matched
        in lower case. Entries override the built-in table on a name clash.

    Returns
    -------
    meds_dose_list: list
        list of lists containing medication code (int) and dose (str)
        (e.g. [[1,'75mg daily'],[3,'100mg every 2 hours']]).
        The dose is an empty string when the line holds only a drug name.
    """
    # built-in drug names to search in text, mapped to their medication code
    meds_to_search = {'aspirin': 2, 'paracetamol': 1, 'panadol': 1, 'penicillin': 3}

    for kw_name, kw_value in kwargs.items():
        try:
            # mapping or iterable of (name, code) pairs
            extra_meds = dict(kw_value)
        except (TypeError, ValueError):
            # plain value: the keyword itself is the drug name
            extra_meds = {kw_name: kw_value}
        for drug_name, code in extra_meds.items():
            if isinstance(drug_name, str):
                drug_name = drug_name.lower()
            meds_to_search[drug_name] = code

    # Keep only the content above the first empty / whitespace-only line.
    # A raw string is required so the regex escape is not treated as an
    # (invalid) Python string escape.
    medication_block = re.split(r"\n\s*\n", s, maxsplit=1)[0]

    meds_dose_list = []
    # splitlines() also copes with Windows line endings, which would
    # otherwise leave a carriage return glued to the last token of a line.
    for line in medication_block.lower().splitlines():
        # Split on the first run of any whitespace so that tabs, repeated
        # spaces and indented lines are handled like a single space.
        parts = line.split(None, 1)
        if not parts or parts[0] not in meds_to_search:
            continue
        dose = parts[1].strip() if len(parts) > 1 else ""
        meds_dose_list.append([meds_to_search[parts[0]], dose])

    return meds_dose_list

In [7]:
# Run the reader with one extra drug name supplied as a dict through a
# keyword argument, then show the resulting [medication code, dose] pairs.
meds_list = my_meds_reader_w_keys(txt, addmeds = {'atorvastatin': 5})
print(meds_list)

[[2, '75mg daily'], [1, '1000mg when needed max four times daily'], [5, '20mg daily'], [3, '500mg daily'], [1, '']]


To check for spelling mistakes

In [8]:
import re
from difflib import SequenceMatcher

def my_meds_reader_close_matches(s):
    """
    Extract known medications and doses from the first paragraph of a text
    and score unknown names against the known drug names.

    Only the text before the first empty (or whitespace-only) line is
    examined. Each non-blank line is lower-cased and split on its first run
    of whitespace. If the first token is a known drug name, its code and the
    rest of the line (the dose) are collected; otherwise the token is
    compared with every known drug name using difflib.SequenceMatcher.
    The best-scoring comparison for each unknown token is printed.

    Parameters
    ----------
    s:  str
        input string

    Returns
    -------
    meds_dose_list: list
        list of lists containing medication code (int) and dose (str)
        (e.g. [[1,'75mg daily'],[3,'100mg every 2 hours']]).
        The dose is an empty string when the line holds only a drug name.
    non_matched_only: list
        first tokens (str) of the lines that did not match a known drug
        name, in order of appearance
    match_ratio: list
        list of [token, drug name, similarity ratio] for every combination
        of non-matching token and known drug name
    """
    # names of drugs to search in text, mapped to their medication code
    meds_to_search = {'aspirin': 2, 'paracetamol': 1, 'panadol': 1, 'penicillin': 3}

    # Keep only the content above the first empty / whitespace-only line.
    # A raw string is required so the regex escape is not treated as an
    # (invalid) Python string escape.
    medication_block = re.split(r"\n\s*\n", s.lower(), maxsplit=1)[0]

    meds_dose_list = []
    non_matched_only = []
    # splitlines() also copes with Windows line endings, which would
    # otherwise leave a carriage return glued to the last token of a line.
    for line in medication_block.splitlines():
        # Split on the first run of any whitespace so that tabs, repeated
        # spaces and indented lines are handled like a single space.
        parts = line.split(None, 1)
        if not parts:
            # blank line: nothing to match and nothing worth scoring
            continue
        first_token = parts[0]
        if first_token in meds_to_search:
            dose = parts[1].strip() if len(parts) > 1 else ""
            meds_dose_list.append([meds_to_search[first_token], dose])
        else:
            non_matched_only.append(first_token)

    # Score every non-matching token against every known drug name and keep
    # the best-scoring row per token. Building the rows per token avoids
    # rescanning the full result list for each name.
    match_ratio = []
    similarity = []
    for name in non_matched_only:
        rows = [[name, key, SequenceMatcher(None, name, key).ratio()]
                for key in meds_to_search]
        match_ratio.extend(rows)
        similarity.append(max(rows, key=lambda item: item[2]))
    print(f'For non-matching drug names, the highest similarity was: {similarity}')

    return meds_dose_list, non_matched_only, match_ratio

In [9]:
# Run the close-match reader on the loaded text; it prints the best
# similarity per non-matching name and returns three values, unpacked here.
meds_list,non_matches,matchrat = my_meds_reader_close_matches(txt)

For non-matching drug names, the highest similarity was: [['atorvastatin', 'paracetamol', 0.43478260869565216]]
