### Create a medical terms dictionary from https://www.medicinenet.com/

In [None]:
%pip install selenium
%pip install tqdm

#### Import necessary libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
from tqdm import tqdm
from selenium.common.exceptions import TimeoutException

from textblob import TextBlob
from autocorrect import Speller

width = 1440
height = 990

In [26]:
import nltk
from nltk.corpus import wordnet

#### Open Chrome Web browser

In [4]:
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

#### Web scrap medical terms to make glossary

In [5]:

base_url = 'https://www.medicinenet.com/script/main/alphaidx.asp?p='
alphabets = 'abcdefghijklmnopqrstuvwxyz'
med_words = {}

for letter in tqdm(alphabets):
    curr_url=base_url+letter+"_dict"
    med_words[letter] = []
    
    driver.set_page_load_timeout(10)
    try:
        driver.get(curr_url)
    except TimeoutException:
        driver.execute_script("window.stop();")
        
    elements = driver.find_elements(By.XPATH,'//div[@class="AZ_results"]/ul/li/a')
    for element in elements:
        med_words[letter].append(element.text)

driver.quit()


  0%|          | 0/26 [00:00<?, ?it/s]

100%|██████████| 26/26 [03:29<00:00,  8.05s/it]


In [7]:
sum(len(med_words[letter]) for letter in med_words)

4205

In [8]:
med_words['a']

['A (adenine)',
 'A-',
 'A-T',
 'a.c.',
 'AAA',
 'AAAS',
 'AAD',
 'AAO',
 'AAP',
 'Aarskog syndrome',
 'Aarskog-Scott syndrome',
 'Aase-Smith syndrome I',
 'Aase-Smith syndrome II',
 'Ab-',
 'Abate',
 'Abatement',
 'Abdomen',
 'Abdomen, acute',
 'Abdominal',
 'Abdominal aorta',
 'Abdominal aortic aneurysm',
 'Abdominal cavity',
 'Abdominal guarding',
 'Abdominal hysterectomy',
 'Abdominal pain',
 'Abducent nerve',
 'Abduction',
 'Abductor muscle',
 'Aberration',
 'Abiotic',
 'Abiotrophy',
 'Ablate',
 'Ablation',
 'Abnormal',
 'Abortifacient',
 'Abortion',
 'Abortive',
 'Abortive polio',
 'ABR test',
 'Abrade',
 'Abrasion',
 'Abs',
 'Abscess',
 'Abscission',
 'Abse',
 'Absence of the breast',
 'Absence of the nipple',
 'Absence seizure',
 'Absent eye',
 'Absinthism',
 'Absolute neutrophil count',
 'Absorb',
 'Absorption',
 'Abstinence',
 'AC joint',
 'Acanthamoeba',
 'Acapnia',
 'Acaricide',
 'ACC',
 'Access',
 'Accessory',
 'Accessory digestive organ',
 'Accessory dwelling unit',
 'Acc

#### Copy the terms into file

In [9]:
file_path = 'med_terms.txt'
try:
    with open(file_path, 'r') as file:
        pass
except FileNotFoundError:
    with open(file_path, 'w') as file:
        for letter in med_words.values():
            for word in letter:
                file.write(word + '\n')

#### Implement spell check

Sample Usage

In [10]:
text = "Why can't yu spel corrctly?"
tb = TextBlob(text)
tb

TextBlob("Why can't yu spel corrctly?")

In [11]:
print(tb.correct())

Why can't you spell correctly?


In [12]:
txt = '''
1 yeas olf old i patent paseo ‘Vinee >

A reliable ioral; Sa at Chomapape

with the clu cowplaints Bn
ita . ‘ ~~ fog

tle ipsa ae pl se

Geen)

J2aes Sawiduews Ww onset Set | cal «
Dusation > aday of

p” OAV! Ay

Qi vamal vanation : ue |

No Aq4: oy S —_
Rokoving “ae wudieation: N 4
Jupe of five: High grade .
oneewa t
Nl ells & vig" :
IN

ee i 4
Auaation —Stne \d days |
i, ue k weak anoticed.
No expe to onlvonn Kamp

ie
No a Hficourt itand sid

Nofomieel Wee

————————————

Kc
\
\
'''

In [13]:
txt_words = re.findall(r'[a-zA-Z0-9]+', txt)
txt_words

['1',
 'yeas',
 'olf',
 'old',
 'i',
 'patent',
 'paseo',
 'Vinee',
 'A',
 'reliable',
 'ioral',
 'Sa',
 'at',
 'Chomapape',
 'with',
 'the',
 'clu',
 'cowplaints',
 'Bn',
 'ita',
 'fog',
 'tle',
 'ipsa',
 'ae',
 'pl',
 'se',
 'Geen',
 'J2aes',
 'Sawiduews',
 'Ww',
 'onset',
 'Set',
 'cal',
 'Dusation',
 'aday',
 'of',
 'p',
 'OAV',
 'Ay',
 'Qi',
 'vamal',
 'vanation',
 'ue',
 'No',
 'Aq4',
 'oy',
 'S',
 'Rokoving',
 'ae',
 'wudieation',
 'N',
 '4',
 'Jupe',
 'of',
 'five',
 'High',
 'grade',
 'oneewa',
 't',
 'Nl',
 'ells',
 'vig',
 'IN',
 'ee',
 'i',
 '4',
 'Auaation',
 'Stne',
 'd',
 'days',
 'i',
 'ue',
 'k',
 'weak',
 'anoticed',
 'No',
 'expe',
 'to',
 'onlvonn',
 'Kamp',
 'ie',
 'No',
 'a',
 'Hficourt',
 'itand',
 'sid',
 'Nofomieel',
 'Wee',
 'Kc']

In [14]:
def editDistance(s,t):
    n = len(s)
    m = len(t)

    prev = [j for j in range(m+1)]
    curr = [0] * (m+1)

    for i in range(1, n+1):
        curr[0] = i
        for j in range(1, m+1):
            if s[i-1] == t[j-1]:
                curr[j] = prev[j-1]
            else:
                mn = min(1 + prev[j], 1 + curr[j-1])
                curr[j] = min(mn, 1 + prev[j-1])
        prev = curr.copy()

    return prev[m]

In [46]:
def get_med_dict():
    word_dict = {}
    with open("med_terms.txt",'r') as file:
        lines = file.readlines()
        
    for line in lines:
        first_letter=line[0].lower()
        if first_letter in word_dict:
            word_dict[first_letter].append(line.strip())
        else:
            word_dict[first_letter] = [line.strip()]
    return word_dict

In [16]:
def get_closest(ocr_word, max_dist=3):
    first = ocr_word[0].lower()
    d = get_med_dict()
    
    med_list = d[first]
    closest = []
    for word in med_list:
        if editDistance(word,ocr_word) <=max_dist:
            closest.append(word)
            
    max_len = min(5,len(closest))
    return closest[:max_len]


In [44]:
def cleaned_words(text):
    
    ocr_words = re.findall(r'[a-zA-Z]+', txt)
    ocr_words = [word.lower() for word in ocr_words]
    
    spell = Speller()
    corrected = [spell(word) for word in ocr_words]
    english_words = []
    for word in corrected:
        synsets = wordnet.synsets(word)
        if synsets:
            english_words.append(word)
        else:
            english_words.append('')
            
    med_closest = [get_closest(ocr_word) for ocr_word in ocr_words]
    
    key_words={}
    for i,word in enumerate(ocr_words):
        if word in key_words:
            key_words[word].append(english_words[i])
        else:
            key_words[word]=[english_words[i]]
            
        key_words[word].append(med_closest[i])
        
        
    return key_words

In [45]:
cleaned_words(txt)

{'yeas': ['year', ['yd.', 'Yeast']],
 'olf': ['', ['OB', 'OBS', 'OCG', 'OCP', 'oma']],
 'old': ['old', ['OB', 'OBS', 'OCG', 'OCP', 'oma']],
 'i': ['i',
  ['ICD', 'ICU', 'ID', 'IFA', 'Ig'],
  'i',
  ['ICD', 'ICU', 'ID', 'IFA', 'Ig'],
  'i',
  ['ICD', 'ICU', 'ID', 'IFA', 'Ig']],
 'patent': ['patent', ['pathy', 'Patient']],
 'paseo': ['', ['pathy']],
 'vinee': ['vine', []],
 'a': ['a',
  ['A-', 'A-T', 'a.c.', 'AAA', 'AAD'],
  'a',
  ['A-', 'A-T', 'a.c.', 'AAA', 'AAD']],
 'reliable': ['reliable', []],
 'ioral': ['moral', []],
 'sa': ['sa', ['SAA', 'Salt', 'Scan', 'Ser', 'SHP']],
 'at': ['at', ['A-', 'A-T', 'a.c.', 'AAA', 'AAD']],
 'chomapape': ['', []],
 'with': ['', ['Wt']],
 'the': ['', ['Td', 'TEE', 'Tic', 'Titre', 'TMR']],
 'clu': ['club', ['CA', 'CAD', 'Calf', 'caps', 'CBC']],
 'cowplaints': ['complaints', []],
 'bn': ['', ['Ba', 'BCC', 'BCG', 'BCM', 'BER']],
 'ita': ['', ['iatry', 'ICD', 'ICU', 'ID', 'IFA']],
 'fog': ['fog', ['Fat', 'FCC', 'FDA', 'FFI', 'Food']],
 'tle': ['', ['Td', 