In [22]:
#import dataset
import pandas as pd
import numpy as np
import spacy
import re
import json
from spacy.pipeline import EntityRuler
from spacy.training import Example
from spacy.util import filter_spans
from spacy.tokens import DocBin
from tqdm import tqdm

In [23]:
# load the dataset
df = pd.read_parquet('clean_drug_master.parquet', engine='fastparquet')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   drug_name                      999 non-null    object 
 1   medical_condition              999 non-null    object 
 2   side_effects                   999 non-null    object 
 3   generic_name                   999 non-null    object 
 4   drug_classes                   999 non-null    object 
 5   brand_names                    999 non-null    object 
 6   activity                       999 non-null    object 
 7   rx_otc                         999 non-null    object 
 8   pregnancy_category             999 non-null    object 
 9   csa                            999 non-null    object 
 10  alcohol                        999 non-null    object 
 11  related_drugs                  999 non-null    object 
 12  medical_condition_description  999 non-null    obj

In [25]:
# convert to lower case
df.drug_name = df.drug_name.str.lower().str.strip()

In [27]:
# get a list of drugs in an array inlcuding drugs with multiple names
drugs = df.drug_name.str.split("/").to_list()

In [28]:
# get the flattened array
drugs = [[d.strip() for d in drug] for drug in drugs]

## Build NER ML MODEL
this method try to build an NER with just drug name. result are unsucessful.

In [8]:
#training data of all files
TRAIN_DATA = []

for d in df.drug_name.to_list():
    TRAIN_DATA.append( {"text": d, "entities": [(0, len(d), "DRUG")] })

In [20]:
nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin() # create a DocBin object

In [21]:
for training_example in tqdm(TRAIN_DATA):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("training_data.spacy") # save the docbin objec

100%|██████████| 999/999 [00:00<00:00, 3001.85it/s]


In [22]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [23]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy 

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-05-28 21:55:07,909] [INFO] Set up nlp object from config
[2023-05-28 21:55:07,921] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-05-28 21:55:07,927] [INFO] Created vocabulary
[2023-05-28 21:55:07,927] [INFO] Finished initializing nlp object
[2023-05-28 21:55:08,563] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     53.00    0.00    0.00    0.00    0.00
  2     200         15.32    907.51  100.00  100.00  100.00    1.00
  5     400          0.18      0.17  100.00  100.00  100.00    1.00
  8     600          0.00      0.00  100.00  100.00  100.00    1.00
 12     800          0.00      0.00  100.00  100.00  100.00    

In [104]:
nlp_ner = spacy.load("model-best")

In [105]:
text = "what are the side effects of minocycline and accutane having"
doc = nlp_ner(text)
for ent in doc.ents:
    if ent.label_ == "DRUG":
        print(ent.text, ent.label_)

minocycline and accutane having DRUG


training this model with just the drug name will not be effective and result in wrong drug detection in a drug. best resolution i to use a pattern matcher using spacy **Entity ruler** and adding it to the pipeline

## Build RULE BASED NER MODEL

In [29]:
# Load the base English model
nlp = spacy.load("en_core_web_sm",disable=["tagger", "lemmatizer","attribute_ruler"])

building pattern matching and add to nlp pipeline

In [30]:
patterns = []

for drug in tqdm(drugs):
    for d in drug:
        min_pattern = []
        for t_split in d.split(" "):
            for t in re.split(r'(-)',t_split):
                min_pattern.append({"LOWER": t.strip()})
        patterns.append({"label": "DRUG", "pattern": min_pattern})

ruler = nlp.add_pipe("entity_ruler",before='ner')
ruler.add_patterns(patterns)

100%|██████████| 999/999 [00:00<00:00, 123467.30it/s]


In [31]:
patterns

[{'label': 'DRUG', 'pattern': [{'LOWER': 'doxycycline'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'spironolactone'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'minocycline'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'accutane'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'clindamycin'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'aldactone'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'tretinoin'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'isotretinoin'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'bactrim'}]},
 {'label': 'DRUG',
  'pattern': [{'LOWER': 'retin'}, {'LOWER': '-'}, {'LOWER': 'a'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'aczone'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'benzoyl'}, {'LOWER': 'peroxide'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'differin'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'epiduo'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'adapalene'}]},
 {'label': 'DRUG', 'pattern': [{'LOWER': 'cephalexin'}]},
 {'label': 'DRUG', 'pattern': [{

the spacy pipeline

In [32]:
nlp.pipe_names

['tok2vec', 'parser', 'entity_ruler', 'ner']

In [33]:
# save model to disk
nlp.to_disk("./drugmatcher")

In [34]:
nlp = spacy.load('drugmatcher/')

In [35]:
# make a prediction
def get_result(query : str,focus : str):
    # Process the text
    text = query
    doc = nlp(text)

    # Print the entities
    query_result = []
    for ent in doc.ents:
        if ent.label_ == 'DRUG':
            query_result.append(ent.text)


    query_result = list(set(query_result))

    result = []
    focus = [f.strip() for f in focus.split(',')]
    #focus = list(set(focus))

    custom_focus = ['drug_name', 'side_effects', 'generic_name','drug_classes', 'brand_names',
       'pregnancy_category', 'csa', 'related_drugs']

    for q in query_result:
        indexes = [index for index, sublist in enumerate(drugs) if q in sublist]

        for idx in indexes:
            try:
                json_ = df.loc[idx, focus].to_json()
            except:
                json_ = df.loc[idx, custom_focus].to_json()

            result.append(json.loads(json_))

    return result

In [36]:
df.columns

Index(['drug_name', 'medical_condition', 'side_effects', 'generic_name',
       'drug_classes', 'brand_names', 'activity', 'rx_otc',
       'pregnancy_category', 'csa', 'alcohol', 'related_drugs',
       'medical_condition_description', 'rating'],
      dtype='object')

In [40]:
get_result("what are the effects of bactrim ?",'drug_name,side_effects, brand_names')

[{'drug_name': 'bactrim',
  'side_effects': ['stomach pain',
   'diarrhea that is watery or bloody',
   'yellowing of your skin or eyes',
   'a seizure',
   'new or unusual joint pain',
   'swelling',
   'dry mouth',
   'fruity breath odor',
   'fever',
   'high blood potassium - nausea',
   'weakness',
   'loss of movement',
   'confusion',
   'chills',
   'easy bruising',
   'skin rash'],
  'brand_names': ['Bactrim DS']}]