In [10]:
import pandas as pd
import random
random.seed(42)
def clean_df(df):
    """Build a tidy two-column RxNorm lookup table from `df`.

    Keeps only the 'iri' and 'prefLabel' columns, renames them to
    'rxnorm_code' and 'rxnorm_label', upper-cases the labels, then drops
    rows containing missing values, drops duplicate rows and sorts by label.

    Parameters:
        df: DataFrame that has at least the columns 'iri' and 'prefLabel'.

    Returns:
        A new DataFrame; `df` itself is not modified.
    """
    renamed = df[['iri', 'prefLabel']].rename(
        columns={'iri': 'rxnorm_code', 'prefLabel': 'rxnorm_label'}
    )
    upper_cased = renamed.assign(rxnorm_label=renamed['rxnorm_label'].str.upper())
    without_gaps = upper_cased.dropna()
    unique_rows = without_gaps.drop_duplicates()
    return unique_rows.sort_values('rxnorm_label')

def clean_form_ocrx(raw_form_df):
    """Rewrite OCRx form labels of the shape "X IN Y" as "Y X".

    Parameters:
        raw_form_df: DataFrame with a string column 'ocrx_label'.

    Returns:
        A copy of `raw_form_df` in which 'old_ocrx_label' holds the original
        label and 'ocrx_label' holds the rewritten one. The input frame is
        left untouched.
    """
    def clean_form_ocrx_helper(el):
        # Split on the FIRST " IN " only, so a label that contains the
        # separator more than once keeps all of its text instead of losing
        # the trailing parts (and no debugger is needed to handle it).
        start, separator, last = el.partition(" IN ")
        if not separator:
            # No " IN " present: the label is returned unchanged.
            return el
        return last + " " + start

    form_df = raw_form_df.copy()
    form_df['old_ocrx_label'] = form_df['ocrx_label'].copy()
    form_df['ocrx_label'] = [clean_form_ocrx_helper(el) for el in form_df['old_ocrx_label'].values]
    return form_df
def gather_ocrx_medical_term(medical_term, data_dir="/home/james/Datasets/ocrx-owls/match-ocrx"):
    """Load the (label, code) pairs for one OCRx term category.

    Reads `<data_dir>/match_<medical_term>_en.csv`, which must contain the
    columns 's' (code) and 'sLabel' (label).

    Parameters:
        medical_term: category name used to build the file name.
        data_dir: directory holding the match_*.csv files. The default is the
            location the notebook originally hard-coded, so existing calls
            behave as before; pass another directory to run elsewhere.

    Returns:
        A list of (upper-cased label, code) tuples of str, ordered by the
        original-case label, with duplicate rows removed.
    """
    df = pd.read_csv(f"{data_dir}/match_{medical_term}_en.csv")
    # Rows lacking a label or code would otherwise be stringified to the
    # literal text 'nan' below and end up in the generated vocabulary.
    df = df.dropna(subset=["sLabel", "s"])
    df = df.drop_duplicates().sort_values("sLabel")
    labels = df['sLabel'].str.upper().values
    codes = df['s'].values
    return [(str(val),str(val_code)) for val, val_code in zip(labels,codes)]

def parse_dosage(dose):
    """Unfinished stub: isolates the last space-separated token of `dose`.

    As written the function has no return statement, so it yields None on
    every path, and the membership branch does nothing.

    NOTE(review): `units_conversion` is not defined anywhere in the visible
    notebook; unless it is defined elsewhere, calling this function raises
    NameError. Presumably it is meant to map unit names to conversion
    factors - confirm before finishing the implementation.
    """
    splitted = dose.split(" ")
    # Last token of the dose string; presumably the unit - TODO confirm.
    unit = splitted[-1]
    if unit in units_conversion:
        pass
# print('\n'.join([str(el) for el in df_compo['dose'].unique()]))
# OCRx term categories to load. The order matters: the positional indexing
# and unpacking of `ocrx_dfs` further down in this cell rely on it.
ocrx_terms = ['form','roa','substance','strength','distinction']
# One list of (label, code) tuples per category, in `ocrx_terms` order.
ocrx_dfs = [gather_ocrx_medical_term(term) for term in ocrx_terms]

def convert_ocrx_to_df(ocrx_elems):
    """Turn each collection of (label, code) records into a DataFrame.

    Parameters:
        ocrx_elems: iterable whose items are sequences of two-field records.

    Returns:
        A list with one DataFrame per item, each having the columns
        'ocrx_label' and 'ocrx_code', in the same order as `ocrx_elems`.
    """
    column_names = ['ocrx_label', 'ocrx_code']
    frames = []
    for records in ocrx_elems:
        frames.append(pd.DataFrame(records, columns=column_names))
    return frames

# From here on `ocrx_dfs` holds DataFrames (it held lists of tuples before).
ocrx_dfs = convert_ocrx_to_df(ocrx_dfs)
# Index 0 is the 'form' category (first entry of `ocrx_terms`); only form
# labels get the "X IN Y" -> "Y X" rewrite.
ocrx_dfs[0] = clean_form_ocrx(ocrx_dfs[0])
# Unpack in the same order as `ocrx_terms`.
ocrx_form_df, ocrx_roa_df, ocrx_substance_df, ocrx_strength_df, ocrx_distinction_df = ocrx_dfs

In [11]:
import random
# Candidate separator strings. NOTE(review): not referenced by any other code
# visible in this notebook - confirm whether it is still needed.
seps = [' , ',' | ']


class Word:
    """A piece of text carrying a single annotation label.

    Attributes:
        text: the raw string.
        len: length of `text`, computed once at construction.
        label: annotation label attached to the whole text.
        index: integer slot initialised to 0 and preserved by `copy`.
    """

    def __init__(self,text,label):
        self.text, self.label = text, label
        self.len = len(text)
        self.index = 0

    def copy(self):
        """Return a new Word with the same text, label and index."""
        duplicate = Word(self.text, self.label)
        duplicate.index = self.index
        return duplicate

    def generate_annotations(self):
        """Return a one-element list: the (start, end, label) span covering the text."""
        span = (0, self.len, self.label)
        return [span]

    def __str__(self):
        return self.text
        
class Group:
    """Ordered sequence of pieces that renders as one annotated string.

    Each piece is an object providing `copy()`, `generate_annotations()` and
    `__str__` (Word or Group instances in this notebook). A group may carry
    its own label, which is reported as one extra span covering the whole
    rendered text.

    Attributes:
        words: list of pieces, in rendering order.
        index: integer slot initialised to 0 and preserved by `copy`.
        label: group-level label; '' means "no group span".
        annotations: initialised to None; not written by any method here.
    """

    def __init__(self,words):
        self.words = words
        self.index = 0
        self.label = ''
        self.annotations = None

    def stringify(self):
        """Concatenate the string form of every piece, with no separator."""
        return ''.join(str(word) for word in self.words)

    def __str__(self):
        return self.stringify()

    def copy(self):
        """Return a new Group holding copies of every piece.

        Raises:
            TypeError: if a piece is a plain str. Plain strings have no
                `copy()`; they must be wrapped in a Word first (the `+`
                operator does that automatically).
        """
        for word in self.words:
            if isinstance(word, str):
                # Fail loudly instead of dropping into an interactive
                # debugger, which cannot work in a non-interactive run.
                raise TypeError(
                    f"Group pieces must be Word/Group objects, got plain str {word!r}"
                )
        clone = Group([word.copy() for word in self.words])
        clone.label = self.label
        # Carry the index over as well, mirroring Word.copy.
        clone.index = self.index
        return clone

    def set_label(self,string):
        """Set the group-level label."""
        self.label = string

    def __add__(self,word):
        """Return a copy of this group with `word` appended.

        A plain str is wrapped in a Word labelled "NA"; any other object is
        appended as-is (not copied). The receiver is left unchanged.
        """
        result = self.copy()
        if isinstance(word, str):
            word = Word(word,"NA")
        result.words.append(word)
        return result

    def generate_annotations(self):
        """Return (start, end, label) spans relative to the rendered text.

        The spans of each piece are shifted by the length of everything
        rendered before it. If the group has a non-empty label, a final span
        covering the whole text is appended.
        """
        offset = 0
        all_annotations = []
        for word in self.words:
            all_annotations.extend(
                (offset + start, offset + end, label)
                for start, end, label in word.generate_annotations()
            )
            offset += len(str(word))
        if self.label != '':
            # `offset` now equals the length of the full rendered string,
            # so there is no need to render it a second time.
            all_annotations.append((0, offset, self.label))
        return all_annotations

def create_string(substance,strength):
    """Build the 'component' group "<substance> <strength>".

    The substance is labelled "substance", the strength "strength", and the
    single space between them "NA".

    Returns:
        A Group labelled 'component'.
    """
    pieces = [
        Word(substance, "substance"),
        Word(" ", "NA"),
        Word(strength, "strength"),
    ]
    component = Group(pieces)
    component.set_label('component')
    return component

def create_full_string(substances,strengths,form,route,distinction):
    """Assemble the annotated drug description for one product.

    Layout: "<component> , <component> ... <form>(<distinction>) IN <route>".
    Components pair `substances` with `strengths` positionally (pairing stops
    at the shorter of the two). The parenthesised distinction is left out
    when `form` already contains a parenthesis. Filler text (separators,
    spaces, parentheses, " IN ") is labelled "NA".

    Returns:
        A Group labelled "drug".
    """
    pieces = []
    for position, (substance, strength) in enumerate(zip(substances, strengths)):
        if position > 0:
            pieces.append(Word(' , ', "NA"))
        pieces.append(create_string(substance, strength))

    pieces.append(Word(" ", "NA"))
    pieces.append(Word(form, "form"))

    form_has_parenthesis = '(' in form or ')' in form
    if not form_has_parenthesis:
        pieces.extend([
            Word("(", "NA"),
            Word(distinction, "distinction"),
            Word(")", "NA"),
        ])

    pieces.append(Word(" IN ", "NA"))
    pieces.append(Word(route, "roa"))

    drug = Group(pieces)
    drug.set_label("drug")
    return drug
    

In [12]:
import random

# Label vocabularies, one list per frame in `ocrx_dfs`. The unpacking order
# (form, roa, substance, strength, distinction) follows the order of
# `ocrx_terms` used to build `ocrx_dfs`.
forms, roas, substances, strengths, distinctions = [list(df['ocrx_label'].values) for df in ocrx_dfs]
def generate_random(substances=substances,strengths=strengths,forms=forms,roas=roas,distinctions=distinctions):
    """Draw the random ingredients for one synthetic drug description.

    Parameters:
        substances, strengths: vocabularies sampled without replacement.
        forms, roas, distinctions: vocabularies from which one entry is picked.
        The defaults are the module-level lists as they were when this
        function was defined.

    Returns:
        (substances_el, strengths_el, form, roa, distinction), where the first
        two are equally long lists of 1 to 5 entries.

    Raises:
        ValueError: if `substances` or `strengths` is empty.
    """
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the count for small vocabularies. With five or
    # more entries in both lists the upper bound is still 5, as before.
    max_elems = min(5, len(substances), len(strengths))
    if max_elems < 1:
        raise ValueError("substances and strengths must each contain at least one entry")
    num_elems = random.randint(1, max_elems)
    substances_el = random.sample(substances,num_elems)
    strengths_el = random.sample(strengths,num_elems)
    form = random.choice(forms)
    roa = random.choice(roas)
    distinction = random.choice(distinctions)
    return (substances_el, strengths_el, form, roa, distinction)

def generate_labels(n=1000,substances=substances,strengths=strengths,forms=forms,roas=roas,distinctions=distinctions):
    """Generate `n` synthetic drug descriptions together with their spans.

    Parameters:
        n: number of examples to produce.
        substances, strengths, forms, roas, distinctions: vocabularies passed
            through to `generate_random`.

    Returns:
        A list of (text, annotations) tuples, where `annotations` is the list
        of (start, end, label) spans produced for that text.
    """
    dataset = []
    for _ in range(n):
        ingredients = generate_random(substances, strengths, forms, roas, distinctions)
        drug = create_full_string(*ingredients)
        dataset.append((drug.stringify(), drug.generate_annotations()))
    return dataset

In [13]:
'hi'.split('/')

['hi']

In [14]:
import re
# Matches one ASCII digit or a literal dot. Written as a raw string so that
# `\.` reaches the regex engine as-is instead of being an invalid
# string-literal escape sequence.
clean_patt = re.compile(r'[0123456789\.]')
def remove_numbers(string):
    """Return `string` with every ASCII digit and every '.' removed.

    Dots are removed wherever they occur, not only inside numbers.
    """
    return clean_patt.sub('',string)
def extract_units(strengths):
    """Collect the distinct unit names appearing in strength strings.

    For each strength, the last space-separated token is taken, digits and
    dots are stripped from it, and the remainder is split on '/'.

    Parameters:
        strengths: iterable of strength strings.

    Returns:
        A sorted list of the distinct non-empty unit strings.
    """
    def extract_unit(strength):
        last_token = strength.split(' ')[-1]
        return remove_numbers(last_token).split('/')

    # Skip empty pieces: a strength whose last token is purely numeric, or
    # begins/ends with '/', yields '' here, and '' is not a unit. Keeping it
    # would make every purely numeric token look like a unit match later on.
    units = {
        unit
        for strength in strengths
        for unit in extract_unit(strength)
        if unit
    }
    return sorted(units)

In [15]:
# Unit vocabulary derived from the strength labels, stored as a set for the
# membership tests in check_substance_has_strength.
units_set = set(extract_units(strengths))
def check_substance_has_strength(substance):
    """Tell whether a substance label appears to end with a strength.

    Looks at the last two space-separated tokens of `substance`, splits each
    on '/', strips digits and dots from every piece, and reports whether any
    resulting piece is a member of the module-level `units_set`.

    Returns:
        True if at least one piece matches a known unit, otherwise False.
    """
    trailing_tokens = substance.split(' ')[-2:]
    return any(
        remove_numbers(piece) in units_set
        for token in trailing_tokens
        for piece in token.split('/')
    )

In [16]:
# Run the unit check once per substance, then split the vocabulary on the
# result (previously every substance was checked twice).
strength_flags = [check_substance_has_strength(sub) for sub in substances]
no_strengths = [sub for sub, has_strength in zip(substances, strength_flags) if not has_strength]
with_strengths = [sub for sub, has_strength in zip(substances, strength_flags) if has_strength]

In [17]:
# Build 10000 synthetic examples, drawing substances only from the labels in
# which no unit was detected; the other vocabularies keep their defaults.
dataset = generate_labels(10000,substances=no_strengths)

In [18]:
# Preview the first two examples only; displaying the whole list would dump
# every generated example into the notebook output.
dataset[:2]

[('.ALPHA.-DODECYL-.OMEGA.-HYDROXYTETRAKIS(OXYETHYLENE) 5 MG/0.4ML INTRASYNOVIAL LIQUID(3MONTH CONTROLLED RELEASE) IN INTRACEREBROVENTRICULAR',
  [(0, 52, 'substance'),
   (52, 53, 'NA'),
   (53, 63, 'strength'),
   (0, 63, 'component'),
   (63, 64, 'NA'),
   (64, 84, 'form'),
   (84, 85, 'NA'),
   (85, 110, 'distinction'),
   (110, 111, 'NA'),
   (111, 115, 'NA'),
   (115, 138, 'roa'),
   (0, 138, 'drug')]),
 ("PLATINUM, DIAMINE(1,1-CYCLOBUTANEDICARBOXYLATO (2-)-O,O')-, (SP-4-2)- 4.16 MG/ML , ACALYPHA INDICA WHOLE 5 MCG/5ML EPIDURAL SOLUTION(LONG ACTING) IN DISINFECTANT (INSTITUTIONAL/INDUSTRIAL)",
  [(0, 69, 'substance'),
   (69, 70, 'NA'),
   (70, 80, 'strength'),
   (0, 80, 'component'),
   (80, 83, 'NA'),
   (83, 104, 'substance'),
   (104, 105, 'NA'),
   (105, 114, 'strength'),
   (83, 114, 'component'),
   (114, 115, 'NA'),
   (115, 132, 'form'),
   (132, 133, 'NA'),
   (133, 144, 'distinction'),
   (144, 145, 'NA'),
   (145, 149, 'NA'),
   (149, 188, 'roa'),
   (0, 188, 'drug')

In [19]:
import pickle
# Persist the generated examples. The path is relative, so the file lands in
# the kernel's current working directory.
# NOTE(review): pickle files should only be loaded from trusted sources.
with open("dataset.pickle","wb") as f:
    pickle.dump(dataset,f)

In [None]:
!nemo .


ImportError: could not import gobject (error was: ModuleNotFoundError("No module named 'gi'"))


Nemo-Share-[1;32mMessage[0m: [34m22:04:48.290[0m: Called "net usershare info" but it failed: 'net usershare' returned error 255: net usershare: cannot open usershare directory /var/lib/samba/usershares. Error No such file or directory
Please ask your system administrator to enable user sharing.

Nemo-Share-[1;32mMessage[0m: [34m22:41:34.967[0m: Called "net usershare info" but it failed: 'net usershare' returned error 255: net usershare: cannot open usershare directory /var/lib/samba/usershares. Error No such file or directory
Please ask your system administrator to enable user sharing.



In [None]:
import random
random.randint(1,10)

In [None]:
random.sample(['a','b','c'],2)

In [None]:
random.choice(['a','b','c'])

In [None]:
# Hand-written example to eyeball the output of create_full_string.
# The names carry a demo_ prefix so that this cell does not rebind the
# module-level `substances` / `strengths` vocabularies that earlier cells
# read when they are re-run.
demo_substances = [
    'ATORVASTATIN',
    'TYLENOL'
]
demo_strengths = [
    '25 MG/ ML',
    '10 MG/ ML'
]
demo_form = "TABLET"
demo_distinction = "EXTENDED RELEASE"
demo_roa = "ORAL"
demo_group = create_full_string(demo_substances,demo_strengths,demo_form,demo_roa,demo_distinction)
print(demo_group)
demo_group.generate_annotations()


In [None]:
'ATORVASTATIN 25 MG/ ML , TYLENOL 10 MG/ ML TABLET(EXTENDED RELEASE) IN ORAL'[25:32]