In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
4/3wGgE4PUVFQhEUfgTKtS-VkUW4tYDqQ0MkkLDtqwF79ZPkAAwwIkKuw
Mounted at /content/drive


In [2]:
import os
os.listdir('/content/drive/My Drive/Drug-Review')

['drugsComTest_raw.csv', 'drugsComTrain_raw.csv']

In [3]:
#load NLP packages
import spacy
from wordcloud import WordCloud,STOPWORDS
from spacy.util import minibatch, compounding
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import random 

In [4]:
!pip3 install spacy
!python3 -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
# NER

nlp = spacy.load('en_core_web_sm')

![](https://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg)

Inside nlp object we have tokenizer,tagger,parser,ner

where,

*   tokenizer -> it creates 'Doc' -> Description : segment text into tokens
*   tagger -> it creates 'Doc[i].tag' -> Description : Assign part-of-speech tags.

*   parser -> it creates 'Doc[i].head, Doc[i].dep, Doc.sents, Doc.noun_chunks' -> Description : Assign dependency labels.

*   ner -> it creates 'Doc.ents, Doc[i].ent_iob, Doc[i].ent_type' -> Description : Detect and label named entities.

# Reference : [Language Processing Pipelines](https://spacy.io/usage/processing-pipelines)



In [6]:
# Get components of nlp object
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
ner = nlp.get_pipe('ner')

In [8]:
#ner.add_label()

In [9]:
# example

exp = "mobassir went to london to buy Morphine this year"

In [10]:
docs = nlp(exp)
type(docs)

spacy.tokens.doc.Doc

In [11]:
for entity in docs.ents:
  print(entity," -> ", entity.label_)

mobassir  ->  PERSON
london  ->  GPE
Morphine  ->  ORG
this year  ->  DATE


we can see that morphine was marked as ORG but actually its a drug. Morphine belongs to a class of drugs known as opioid analgesics.

**Drug class: opioid**

# we will try to see how to train our nlp object to be able to identify this one(morphine) correctly

now we will need to be able to seperate "ner" from our nlp object and then prepare our custom data

# How to Prepare the Data



*   Training data must be a tuple

TRAIN_DATA =[ ("who is mobassir hossen?",{"entities":[(start,stop,"label")]}) ]


example : TRAIN_DATA =[ ("who is mobassir hossen?",{"entities":[(7,22,"person")]}) ]



#loading the data

reference : [UCI ML Drug Review dataset](https://www.kaggle.com/jessicali9530/kuc-hackathon-winter-2018) 

In [12]:
df = pd.read_csv('/content/drive/My Drive/Drug-Review/drugsComTrain_raw.csv')
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


# About the data : 

dataset contains "Over 200,000 patient drug reviews", we are interested only on review and drugName columns from this data to accomplish our desired task


In [13]:
# get the drug names
drugs = df['drugName'].unique().tolist()
drugs = [x.lower() for x in drugs]
drugs

['valsartan',
 'guanfacine',
 'lybrel',
 'ortho evra',
 'buprenorphine / naloxone',
 'cialis',
 'levonorgestrel',
 'aripiprazole',
 'keppra',
 'ethinyl estradiol / levonorgestrel',
 'topiramate',
 'l-methylfolate',
 'pentasa',
 'dextromethorphan',
 'nexplanon',
 'liraglutide',
 'trimethoprim',
 'amitriptyline',
 'lamotrigine',
 'nilotinib',
 'atripla',
 'trazodone',
 'etonogestrel',
 'etanercept',
 'tioconazole',
 'azithromycin',
 'eflornithine',
 'daytrana',
 'ativan',
 'imitrex',
 'sertraline',
 'toradol',
 'viberzi',
 'mobic',
 'dulcolax',
 'morphine',
 'moviprep',
 'trilafon',
 'fluconazole',
 'contrave',
 'clonazepam',
 'metaxalone',
 'venlafaxine',
 'ledipasvir / sofosbuvir',
 'symbyax',
 'tamsulosin',
 'doxycycline',
 'dulaglutide',
 'intuniv',
 'buprenorphine',
 'qvar',
 'opdivo',
 'pyridium',
 'latuda',
 'bupropion',
 'implanon',
 'effexor xr',
 'drospirenone / ethinyl estradiol',
 'nuvaring',
 'prepopik',
 'tretinoin',
 'gildess fe 1 / 20',
 'ethinyl estradiol / norgestimate'

In [14]:
len(drugs)

3436

In [15]:
df['review']

0         "It has no side effect, I take it in combinati...
1         "My son is halfway through his fourth week of ...
2         "I used to take another oral contraceptive, wh...
3         "This is my first time using any form of birth...
4         "Suboxone has completely turned my life around...
                                ...                        
161292    "I wrote my first report in Mid-October of 201...
161293    "I was given this in IV before surgey. I immed...
161294    "Limited improvement after 4 months, developed...
161295    "I&#039;ve been on thyroid medication 49 years...
161296    "I&#039;ve had chronic constipation all my adu...
Name: review, Length: 161297, dtype: object

get all the drug names from df['review'] . get their position, locations and label

In [16]:
def processd_review(review):
  tokens = []
  for token in review.split():
    token = ''.join(e.lower() for e in token if e.isalnum())
    tokens.append(token)
  return ' '.join(tokens)

In [17]:
count = 0
train = []
for _,item in df.iterrows():
  ent_dict = {}
  if (count < 2000 ):
    review = processd_review(item["review"])
    # find drugs and their positions and add to visited
    visited = []
    entities = []
    for token in review.split():
      if token in drugs:
        for i in re.finditer(token,review):
          if token not in visited:
            entity = (i.span()[0], i.span()[1], "DRUG")
            visited.append(token)
            entities.append(entity)
    if len(entities) > 0 :
      ent_dict['entities'] = entities
      train_item = (review, ent_dict)
      train.append(train_item)
      count+=1



In [18]:
train

[('it has no side effect i take it in combination of bystolic 5 mg and fish oil',
  {'entities': [(50, 58, 'DRUG')]}),
 ('my son is halfway through his fourth week of intuniv we became concerned when he began this last week when he started taking the highest dose he will be on for two days he could hardly get out of bed was very cranky and slept for nearly 8 hours on a drive home from school vacation very unusual for him i called his doctor on monday morning and she said to stick it out a few days see how he did at school and with getting up in the morning the last two days have been problem free he is much more agreeable than ever he is less emotional a good thing less cranky he is remembering all the things he should overall his behavior is better we have tried many different medications and so far this is the most effective',
  {'entities': [(45, 52, 'DRUG')]}),
 ('i used to take another oral contraceptive which had 21 pill cycle and was very happy very light periods max 5 days no o

# Now let's train our NER model

reference : [Training Spacy ](https://spacy.io/usage/training#ner)

In [19]:
n_iter = 100

def ner_trainer (training_data):
    """steps: 
    1. create a Blank NLP model object
    2. create & add NER to the NLP model
    3. Add Labels from your training data
    4. train the model for n_iter epochs
    """
    TRAIN_DATA = training_data
    nlp = spacy.blank('en') # create blank language class
    print("created blank 'en' language class")
  
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

  
        nlp.begin_training()
        for itn in range(n_iter):
            #print(itn)
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
        return nlp


In [20]:
%%time
nlp2 = ner_trainer(train)

created blank 'en' language class
Losses {'ner': 5557.338904499629}
Losses {'ner': 2030.4618439083554}
Losses {'ner': 1610.7894365430825}
Losses {'ner': 1455.55842819937}
Losses {'ner': 1248.9423267820491}
Losses {'ner': 1146.0940120744294}
Losses {'ner': 1100.6049682016944}
Losses {'ner': 1016.493737771673}
Losses {'ner': 942.2432579092546}
Losses {'ner': 875.3892429784313}
Losses {'ner': 839.7705434099198}
Losses {'ner': 766.3252893345583}
Losses {'ner': 758.3270181274745}
Losses {'ner': 717.7026833641144}
Losses {'ner': 648.2598374466077}
Losses {'ner': 623.99687884997}
Losses {'ner': 608.6279812805278}
Losses {'ner': 568.4285469871229}
Losses {'ner': 562.1698213004601}
Losses {'ner': 536.8418624473521}
Losses {'ner': 511.1086255950774}
Losses {'ner': 504.94089729406915}
Losses {'ner': 516.7236779232055}
Losses {'ner': 436.2955099915776}
Losses {'ner': 453.6656733638436}
Losses {'ner': 428.27019824243945}
Losses {'ner': 398.9351776542095}
Losses {'ner': 398.1698123899158}
Losses {'n

In [21]:
docx = nlp2(exp)

for entity in docx.ents:
  print(entity, entity.label_)

mobassir DRUG
Morphine DRUG


In [22]:
exp2 = "mobassir went to america to buy saxenda this year"
docx = nlp2(exp2)

for entity in docx.ents:
  print(entity, entity.label_)

mobassir DRUG


dear spacy,

**mobassir** is not a drug :( 

In [23]:
for text,_ in train[:10]:
  doc = nlp2(text)
  result =[(ent,ent.label_) for ent in doc.ents]
  print(result)

[(zoloft, 'DRUG')]
[(nexplanon, 'DRUG')]
[(chantix, 'DRUG')]
[(glucose, 'DRUG')]
[(augmentin, 'DRUG')]
[(chaparral, 'DRUG')]
[(lialda, 'DRUG')]
[(trinessa, 'DRUG'), (amethia, 'DRUG')]
[(zoloft, 'DRUG')]
[(synthroid, 'DRUG')]


In [24]:
def extract_entity(text):
  docs = nlp2(text)
  result =[(ent,ent.label_) for ent in doc.ents]
  return result

In [25]:
df['review'][0:10].apply(extract_entity)

0    [((synthroid), DRUG)]
1    [((synthroid), DRUG)]
2    [((synthroid), DRUG)]
3    [((synthroid), DRUG)]
4    [((synthroid), DRUG)]
5    [((synthroid), DRUG)]
6    [((synthroid), DRUG)]
7    [((synthroid), DRUG)]
8    [((synthroid), DRUG)]
9    [((synthroid), DRUG)]
Name: review, dtype: object

learn more here : [How to create custom NER in Spacy](https://confusedcoders.com/data-science/deep-learning/how-to-create-custom-ner-in-spacy)