<a href="https://colab.research.google.com/github/mathanamathav/NLP-Tablets-Annotation/blob/main/NLP_Building_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Downloading Required Modules and Imports for the Project**

In [None]:
!pip install paddlepaddle -q
!pip install paddleocr -q
!pip install 'spacy[transformers]' -q

In [143]:
from paddleocr import PaddleOCR,draw_ocr
from bs4 import BeautifulSoup
import spacy_transformers
import spacy
import pickle
import requests
import re
import cv2
import numpy as np
import pandas as pd
import urllib
from urllib.request import Request, urlopen
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [110]:
import nltk
nltk.download('punkt')
nltk.download('words')
from nltk.metrics.distance  import edit_distance
from nltk.corpus import words
from nltk import word_tokenize
import itertools

def get_plain_vocabluary(column):
    """Return the distinct word tokens found in one column of ``med_df``.

    Reads the module-level ``med_df`` DataFrame, drops the missing values of
    ``column``, tokenizes every remaining entry with ``word_tokenize`` and
    returns the unique tokens.

    Args:
        column: Name of the ``med_df`` column to tokenize.

    Returns:
        list: The distinct tokens, in arbitrary (set) order.
    """
    # Work on the column Series directly. The previous version called
    # dropna(inplace=True) on a frame sliced out of med_df (which can raise
    # pandas' SettingWithCopyWarning) and then walked it with iterrows().
    values = med_df[column].dropna()
    # str() keeps word_tokenize from failing on a non-string cell.
    tokenized = (word_tokenize(str(value)) for value in values)
    return list(set(itertools.chain.from_iterable(tokenized)))
  
def minimum_edit_distance(source, target):
    """Compute the Levenshtein distance between ``source`` and ``target``.

    Insertions, deletions and substitutions each cost 1. The whole
    ``(len(source)+1) x (len(target)+1)`` table is kept in a NumPy integer
    array and the bottom-right cell is returned.

    Args:
        source: Sequence to transform (indexed element by element).
        target: Sequence to transform ``source`` into.

    Returns:
        The edit distance, as the NumPy integer stored in the table.
    """
    rows, cols = len(source) + 1, len(target) + 1
    table = np.zeros((rows, cols), dtype=int)

    # Borders: turning a prefix into the empty sequence (or back) costs its length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, src_item in enumerate(source, start=1):
        for c, tgt_item in enumerate(target, start=1):
            if src_item == tgt_item:
                # Matching items extend the diagonal at no cost.
                table[r, c] = table[r - 1, c - 1]
                continue
            substitution = table[r - 1, c - 1] + 1
            deletion = table[r - 1, c] + 1
            insertion = table[r, c - 1] + 1
            table[r, c] = min(substitution, deletion, insertion)

    return table[rows - 1, cols - 1]

def time_optimized_spelling_correction(word, word_list):
    """Return the entry of ``word_list`` closest to ``word`` by edit distance.

    Only candidates that start with the same first character as ``word`` are
    scored, which keeps the number of ``minimum_edit_distance`` calls down.
    When several candidates tie, the first one encountered wins.

    Args:
        word: The (possibly misspelled) word to correct.
        word_list: Iterable of candidate words to choose from.

    Returns:
        The best-matching candidate, or ``word`` unchanged when ``word`` is
        empty or no candidate shares its first character.
    """
    if not word:
        # Nothing to compare against; previously this raised IndexError.
        return word

    first_char = word[0]
    min_distance = float('inf')
    correct_word = word
    # Iterate the word_list argument: the previous version ignored it and
    # read the global `correct_words` instead.
    for candidate in word_list:
        # Skip empty candidates (candidate[0] would raise) and different initials.
        if not candidate or candidate[0] != first_char:
            continue
        distance = minimum_edit_distance(word, candidate)
        if distance < min_distance:
            min_distance = distance
            correct_word = candidate
    return correct_word

def spelling_correction_alter(sentence):
    """Lower-case ``sentence`` and spell-correct its unknown alphabetic tokens.

    The sentence is split with ``word_tokenize``. Every token is lower-cased;
    a token that is purely alphabetic and absent from the module-level
    ``valsLower`` vocabulary is replaced by the closest entry of the
    module-level ``correct_words`` list. All other tokens are kept as they
    are (lower-cased). Tokens are re-joined with single spaces.

    Args:
        sentence: Text to correct.

    Returns:
        str: The corrected, space-joined sentence.
    """
    # NOTE(review): tokens are lower-cased before correction while
    # `correct_words` keeps its original casing, so capitalised vocabulary
    # entries can never pass the first-letter filter in
    # time_optimized_spelling_correction - confirm whether that is intended.
    tokens = word_tokenize(sentence)
    for i, token in enumerate(tokens):
        word = token.lower()
        # Cheap isalpha() check first so punctuation/numbers skip the vocabulary lookup.
        if word.isalpha() and word not in valsLower:
            try:
                word = time_optimized_spelling_correction(word, correct_words)
            except Exception:
                # Best effort: keep the lower-cased token if correction fails.
                # (Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.)
                pass
        tokens[i] = word
    return ' '.join(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


## **Spell Checking Functions and Imports**

In [111]:
# Load the medicine reference table. The path lives under /content/drive/MyDrive,
# so Google Drive has to be mounted in the Colab runtime before this cell runs.
med_df = pd.read_csv('/content/drive/MyDrive/A_Z_medicines_dataset_of_India.csv')

# One token vocabulary per reference column, in this order: product names,
# manufacturer names, first composition and second composition.
# Note that `long_chem` is built from the "short_composition2" column.
tab_name, manf, short_chem, long_chem = (
    set(get_plain_vocabluary(column_name))
    for column_name in ("name", "manufacturer_name", "short_composition1", "short_composition2")
)

In [112]:
# Candidate vocabulary for spelling correction: the NLTK word list followed by
# the tokens collected from each medicine-dataset column.
correct_words = [
    *words.words(),
    *tab_name,
    *manf,
    *short_chem,
    *long_chem,
]

In [113]:
# Lower-cased vocabulary used for "is this token already a known word?" checks.
# Built as a set: it is only used for membership tests, which on the previous
# list meant a linear scan of the whole vocabulary for every token.
valsLower = {item.lower() for item in correct_words}

## **Open the Pre-Trained Model**

In [24]:
# Load the saved model from the pickle file (presumably a spaCy pipeline: it is
# later called as nlp(text) and its entities are read from doc.ents).
# SECURITY: pickle.load executes arbitrary code embedded in the file, so only
# load a pickle that you created yourself or otherwise trust.
with open("/content/drive/MyDrive/NLP DATA/spacy_model.pkl", "rb") as f:
    nlp = pickle.load(f)

## **Web Scraper and OCR for a Given Image**

In [99]:
class ScrapeTabletsImg():
  """Scrape product pages linked from a listing page and OCR their pack images.

  Attributes are plain lists that the methods fill in:
    required_inp_link: URL of the listing page to start from.
    list_of_product_link: ``href`` values collected by ``generate_links``.
    all_tablets_images: image URLs collected from the product pages.
    all_downloaded_tablet_img: decoded images from ``generate_tablets_img``.
  """

  def __init__(self,required_inp_link):
    """Store the listing URL and start with empty result lists."""
    self.required_inp_link = required_inp_link
    self.list_of_product_link = []
    self.all_tablets_images = []
    self.all_downloaded_tablet_img = []

  def generate_links(self):
    """Collect every anchor ``href`` on the listing page that mentions a tablet or capsule.

    Returns:
      list: The matching ``href`` values (also stored on ``self.list_of_product_link``).
    """
    req = Request(self.required_inp_link)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(req) as html_page:
      soup = BeautifulSoup(html_page, "html.parser")
    self.list_of_product_link = [link.get('href') for link in soup.find_all('a') if re.search(r"tablet|capsule", str(link.get('href')), re.IGNORECASE)]
    return self.list_of_product_link

  def scrape_img(self,url):
    """Extract the product name, description and pack-image URLs from one product page.

    Args:
      url: URL of the product page.

    Returns:
      tuple: ``(tablet_name, tablet_data, images)`` where ``tablet_name`` is the text
      of the ``h1.black-txt`` heading (``""`` when the heading is missing),
      ``tablet_data`` is the first-paragraph text of the last ``div#np_tab1`` that
      has a paragraph (``" "`` when there is none) and ``images`` is the list of
      matching ``<img src>`` values.
    """
    request = Request(url)
    with urlopen(request) as html_page:
      soup = BeautifulSoup(html_page, 'html.parser')

    # A page without the expected heading used to raise AttributeError here.
    heading = soup.find("h1", {"class": "black-txt"})
    tablet_name = heading.get_text() if heading is not None else ""

    tablet_data = " "
    for section in soup.find_all("div", {"id": "np_tab1"}):
      paragraph = section.find('p')
      if paragraph is not None:
        tablet_data = paragraph.text

    images = []
    for img in soup.find_all('img'):
      src = img.get('src')
      # Keep 600x600 tablet/capsule images, excluding "formulation_based" ones.
      if re.search(r".(600x600).*(tablet|capsule).*", str(src), re.IGNORECASE) and re.search(r"^(?!.*?formulation_based).*$", str(src), re.IGNORECASE):
        images.append(src)
    return tablet_name , tablet_data , images

  def download_img(self,url):
    """Download ``url`` and decode it into a colour image array.

    Returns:
      The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)``; callers in this class
      treat ``None`` as "could not decode".
    """
    with urllib.request.urlopen(url) as resp:
      raw = resp.read()
    image = np.asarray(bytearray(raw), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image

  def generate_tablets_img(self):
    """Download every pack image reachable from the listing page.

    Returns:
      list: The decoded images (also stored on ``self.all_downloaded_tablet_img``).
    """
    self.generate_links()
    for link in self.list_of_product_link:
      # scrape_img returns (name, description, image URLs); only the URLs belong
      # in all_tablets_images. The previous version extended the list with the
      # whole tuple, so the name/description were later "downloaded" as URLs.
      _tablet_name, _tablet_data, images = self.scrape_img(link)
      if images:
        self.all_tablets_images.extend(images)

    for img_link in self.all_tablets_images:
      image = self.download_img(img_link)
      if image is not None:  # skip payloads that could not be decoded
        self.all_downloaded_tablet_img.append(image)
    return self.all_downloaded_tablet_img

  def display_img(self,link):
    """Download the image at ``link`` and show it with ``cv2_imshow``."""
    cv2_imshow(self.download_img(link))

  def img_processing(self,image):
    """Upscale ``image``, crop it to its largest non-white region and return it in grayscale.

    Args:
      image: BGR image array (it is converted with ``cv2.COLOR_BGR2GRAY``).

    Returns:
      The cropped grayscale image; the whole upscaled image is used when no
      contour is found.
    """
    img = cv2.resize(image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Inverse threshold at 240: everything that is not near-white becomes foreground.
    _, threshed = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11,11))
    morphed = cv2.morphologyEx(threshed, cv2.MORPH_CLOSE, kernel)
    # [-2] picks the contour list from either return shape of findContours.
    cnts = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(cnts) > 0:
      cnt = sorted(cnts, key=cv2.contourArea)[-1]
      x,y,w,h = cv2.boundingRect(cnt)
      dst = img[y:y+h, x:x+w]
    else:
      # An all-white image has no contour; previously this raised IndexError.
      dst = img
    dst = cv2.cvtColor(dst, cv2.COLOR_BGR2GRAY)
    # Dilate then erode with a 1x1 kernel, kept from the original pipeline.
    kernel = np.ones((1, 1), np.uint8)
    dst = cv2.dilate(dst, kernel, iterations=1)
    dst = cv2.erode(dst, kernel, iterations=1)

    return dst

  def takeText(self,preprocessed_image,ocr,column_tolerance=10):
    """Run OCR on an image and join the recognised text column by column.

    Each OCR entry is read as ``entry[0][0][0]`` (x of the box's first point) and
    ``entry[1][0]`` (recognised text). Entries are grouped into columns: an entry
    joins the first column whose anchor x is within ``column_tolerance`` pixels,
    otherwise it starts a new column. Columns are emitted in order of first
    appearance, entries inside a column in OCR order.

    Args:
      preprocessed_image: Image passed to ``ocr.ocr``.
      ocr: Object exposing ``ocr(image, cls=True)``.
      column_tolerance: Maximum x distance to an anchor for an entry to share its column.

    Returns:
      str: The recognised texts joined by single spaces, ``""`` when nothing was found.
    """
    pages = ocr.ocr(preprocessed_image, cls = True)
    result = pages[0] if pages else None
    if not result:
      # No text detected (the first page can be None or empty); iterating it
      # used to raise TypeError.
      return ''

    # Every entry goes into exactly one column. The previous two-pass version
    # compared with `<= 10` when creating anchors but `< 10` when collecting,
    # so an entry exactly 10px from its anchor was dropped, and an entry close
    # to two anchors was emitted twice.
    columns = []  # list of (anchor_x, [texts])
    for entry in result:
      left_x = entry[0][0][0]
      text = entry[1][0]
      for anchor_x, column_texts in columns:
        if abs(left_x - anchor_x) <= column_tolerance:
          column_texts.append(text)
          break
      else:
        columns.append((left_x, [text]))

    return ' '.join(text for _, column_texts in columns for text in column_texts)

  def run_ocr_for_img_link(self):
    """Scrape every product page and OCR each of its pack images.

    Products without images, without a name or with a blank description are
    skipped, as are images that cannot be decoded or yield no text.

    Returns:
      dict: Column name -> list. ``link``, ``text``, ``tablet_name`` and
      ``tablet_data`` get one item per OCR'd image; the three ``annotated_*``
      lists are returned empty for the caller to fill.
    """
    ocr_tablets_data = {'link' : [], 'text':[] , 'tablet_name':[] , 'tablet_data': [] ,'annotated_COMPONENT' : [], 'annotated_NAME' : [], 'annotated_MANUFACTURER':[]}
    self.generate_links()
    ocr = None
    for link in self.list_of_product_link:
      tablet_name , tablet_data , images = self.scrape_img(link)
      if not (images and tablet_name and tablet_data.strip() != ""):
        continue
      self.all_tablets_images.extend(images)

      if ocr is None:
        # Build the OCR engine once, on first use, instead of once per product.
        ocr = PaddleOCR(use_angle_cls=True, lang = 'en', use_gpu = False)
      for img_link in images:
        image = self.download_img(img_link)
        if image is None:
          continue  # undecodable payload; img_processing would fail on it
        preprocessed_image = self.img_processing(image)
        data = self.takeText(preprocessed_image,ocr)
        if data.strip() == "":
          continue
        ocr_tablets_data['link'].append(img_link)
        ocr_tablets_data['text'].append(data)
        ocr_tablets_data['tablet_name'].append(tablet_name)
        ocr_tablets_data['tablet_data'].append(tablet_data)

    return ocr_tablets_data

## **Generate the Required Annotated Data**

In [136]:
# Listing page whose tablet/capsule product links will be scraped.
url = 'https://www.netmeds.com/prescriptions/digestion'
# NOTE(review): `title` is not referenced by any later cell visible in this
# notebook (the export cell hard-codes its own file name) - confirm it is needed.
title = 'digestion.csv'

In [None]:
# Scrape the listing page and OCR every product image (network-bound).
obj = ScrapeTabletsImg(url)
# Despite its name, `tablets_df` is a plain dict of column name -> list; it is
# converted into a DataFrame in a later cell.
tablets_df = obj.run_ocr_for_img_link()

In [138]:
# Entity label produced by the model -> tablets_df column that collects it.
ENTITY_COLUMNS = {
  'COMPONENT': 'annotated_COMPONENT',
  'NAME': 'annotated_NAME',
  'MANUFACTURER': 'annotated_MANUFACTURER',
}

# displaCy rendering options (loop-invariant, so defined once). To inspect a
# document visually inside the loop:
#   spacy.displacy.render(doc, style="ent", options=options, jupyter=True)
colors = {"COMPONENT": "#F67DE3", "NAME": "#7DF6D9", "MANUFACTURER":"#FFFFFF"}
options = {"colors": colors}

# Start from empty columns so that re-running this cell does not append a second
# copy of every annotation (which would leave the columns with unequal lengths).
for column in ENTITY_COLUMNS.values():
  tablets_df[column] = []

for data in tablets_df['text']:
  doc = nlp(data)

  # Spell-corrected entity texts of this document, grouped by label.
  corrected = {label: [] for label in ENTITY_COLUMNS}
  for ent in doc.ents:
    if ent.label_ in corrected:
      corrected[ent.label_].append(spelling_correction_alter(ent.text))

  # One list per OCR text keeps the annotation columns aligned with 'text'.
  for label, column in ENTITY_COLUMNS.items():
    tablets_df[column].append(corrected[label])

In [139]:
# Turn the column -> list mapping into a table and preview its first ten rows.
df = pd.DataFrame(tablets_df)
df.head(10)

Unnamed: 0,link,text,tablet_name,tablet_data,annotated_COMPONENT,annotated_NAME,annotated_MANUFACTURER
0,https://www.netmeds.com/images/product-v1/600x...,Dr.Reddy's Pancreatin Delayed Release Capsules...,Agna 10000 Capsule 10'S,AGNA contains Pancreatin which belongs to drug...,[],[],[]
1,https://www.netmeds.com/images/product-v1/600x...,"Agna"" Pancreatin Delayed Release Capsules 1000...",Agna 10000 Capsule 10'S,AGNA contains Pancreatin which belongs to drug...,[pancreatin ip],[ear],"[hetero labs limited ( unit, ham management se..."
2,https://www.netmeds.com/images/product-v1/600x...,"Protectirom lightand moisture, Keep aut of rea...",Agna 10000 Capsule 10'S,AGNA contains Pancreatin which belongs to drug...,[pancreatin p .],[],[hetero labs limited]
3,https://www.netmeds.com/images/product-v1/600x...,"00 Agna"" Ag Keeoout gf riachot chrdren Pancrea...",Agna 25000 Capsule 10'S,AGNA 25000 CAPSULE contains Pancreatin which b...,[],[],"[pancreatic fisherpeople, waer labs limited]"
4,https://www.netmeds.com/images/product-v1/600x...,AristozymeCapsules HRESTIVEENZYMEOSPSULES Aris...,Aristozyme Capsule 15'S,Aristozyme capsule is a health supplement used...,[],[],[also aproctous]
5,https://www.netmeds.com/images/product-v1/600x...,Dnarana Bestozyme@ BE Manula Bestozyrne DSF 10...,Bestozyme Capsule 15'S,BESTOZYME contains the below mentioned compone...,[],[],[atap vagas]
6,https://www.netmeds.com/images/product-v1/600x...,399Dho-382225DistAhme ustomer care no.02718-22...,CADIPAN 10k Capsule 10's,CADIPAN 10K CAPSULE contains Pancreatin which ...,"[pancreatin, lipase activity]","[cadipan, 10, cadipan10]",[]
7,https://www.netmeds.com/images/product-v1/600x...,Each hard gelatin capsule contains Pancreatin ...,CADIPAN 10k Capsule 10's,CADIPAN 10K CAPSULE contains Pancreatin which ...,[pancreatin (],[],[]
8,https://www.netmeds.com/images/product-v1/600x...,Markcted by: 1389.0ho-382225.Dist.Ahmedab3 Cus...,CADIPAN 25k Capsule 10's,CADIPAN 25K CAPSULE contains Pancreatin which ...,[pancreatin ip],"[cadipan25, k]",[cian bealtared ltd]
9,https://www.netmeds.com/images/product-v1/600x...,Each Hard Gelatin Capsule contains: Parcreatin...,CADIPAN 25k Capsule 10's,CADIPAN 25K CAPSULE contains Pancreatin which ...,[],[],[]


In [140]:
# Preview the last ten rows of the annotated table.
df.tail(10)

Unnamed: 0,link,text,tablet_name,tablet_data,annotated_COMPONENT,annotated_NAME,annotated_MANUFACTURER
93,https://www.netmeds.com/images/product-v1/600x...,PanlipaseUc Each uncoated tablet contains: Pan...,PANLIPASE UC Tablet 10's,PANLIPASE UC TABLET contains Pancreatin which ...,"[pancreatin ip, excipient, excipient]",[],[sun pharma laboratories]
94,https://www.netmeds.com/images/product-v1/600x...,Each uncoated tablet contains: Pancreatin IP E...,PANLIPASE UC Tablet 10's,PANLIPASE UC TABLET contains Pancreatin which ...,"[pancreatin ip, amylase protease, excipient]",[],[]
95,https://www.netmeds.com/images/product-v1/600x...,Ecnerercoeotoercocs -Om-L-Aspartane150.0mo Pan...,QUIKLOR Tablet 10's,QUIKLOR TABLET is a combination of L Ornithine...,[],[],"[unite biotech ( p ) limited, eccrisis l-]"
96,https://www.netmeds.com/images/product-v1/600x...,Composition: Each enteric coated tablet contai...,QUIKLOR Tablet 10's,QUIKLOR TABLET is a combination of L Ornithine...,[l-ornithine-l],[],[]
97,https://www.netmeds.com/images/product-v1/600x...,wancroaun Berutar WTablets oralsed to contaln ...,Serutan Tablet 10'S,SERUTAN TABLET contains Pancreatin which belon...,[],[],[]
98,https://www.netmeds.com/images/product-v1/600x...,aanocoaianam conaas reaP212.5m Rdardised to co...,Serutan Tablet 10'S,SERUTAN TABLET contains Pancreatin which belon...,[],[],[]
99,https://www.netmeds.com/images/product-v1/600x...,Fungal DiastasePapain & Activated Charcoal Tab...,Unienzyme Tablet 15'S,Unienzyme Tablet is a digestive supplement whi...,[],[],[]
100,https://www.netmeds.com/images/product-v1/600x...,"UNINZYME Fungal Diastase,Papain & Activated Ch...",Unienzyme Tablet 15'S,Unienzyme Tablet is a digestive supplement whi...,[],[],[]
101,https://www.netmeds.com/images/product-v1/600x...,PHARMA torrent PNARMF torrent asaittctedbrihap...,Unienzyme Tablet 15'S,Unienzyme Tablet is a digestive supplement whi...,[],[],[pharma]
102,https://www.netmeds.com/images/product-v1/600x...,UTIPC Peerngach vegCaguocoetaes (apgren PcranA...,Utipac Capsule 10'S,UTIPAC contains Pancreatin which belongs to dr...,[],[],[]


In [141]:
# Column dtypes and non-null counts of the annotated table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   link                    103 non-null    object
 1   text                    103 non-null    object
 2   tablet_name             103 non-null    object
 3   tablet_data             103 non-null    object
 4   annotated_COMPONENT     103 non-null    object
 5   annotated_NAME          103 non-null    object
 6   annotated_MANUFACTURER  103 non-null    object
dtypes: object(7)
memory usage: 5.8+ KB


In [142]:
# Export the annotated table to the working directory. No `index` argument is
# passed, so pandas' default applies and the row index is written as the first
# column; the list-valued annotated_* columns are stored as their string form.
df.to_csv('digestion-annotated.csv',header=True)