In [1]:
#Dependencies
import numpy as np
import pandas as pd
from PIL import Image, ImageSequence
from pdf2image import convert_from_path, pdfinfo_from_path
import logging as logger
import os
import re
import pytesseract
import bs4

In [14]:
#Configs
raw_folder = "/Users/pdas59/Downloads/text_mapping/data/raw"
doc_name = "Exclusions - non-VIP 5.2.pdf"
file_name = os.path.join(raw_folder, doc_name)
processed_folder = "/Users/pdas59/Downloads/text_mapping/data/processed"

In [15]:
#Utils

def rename_images(image_path, folder_path):
    try:
        page_num = int(re.findall(r"(\d+)\.[a-z]{3}$", image_path)[0])
        new_path = os.path.join(folder_path, f"page{page_num}.png")
        os.rename(image_path, new_path)
        doc_name = folder_path.rstrip("/").split("/")[-1]
        logger.info(f"Saved Image Number {page_num} of {doc_name} at {new_path}")
    except Exception as e:
        logger.error(f"Error in renaming image: {e}")
        
def ocr_hocr(img_path, psm=4):
    return pytesseract.image_to_pdf_or_hocr(
        img_path, extension="hocr", config=f"--psm {psm}"
    )

def get_text_from_hocr(hocr):
    hocr_text = []
    soup = bs4.BeautifulSoup(hocr, "html.parser")
    for line in soup.select(".ocr_line"):
        line_text = re.sub(r"\s+", " ", line.text).strip()
        alphanum_text = re.sub("[^a-zA-Z0-9 \n]", "", line_text).strip()
        if alphanum_text != "":
            hocr_text.append(line_text)
    return "\n".join(hocr_text)

def get_ocr_df(page_num, hocr):
    page_dict = {
        "page": '$page_num',
        "line": '$line',
        "text": '$text',
    }
    line = 1
    text = get_text_from_hocr(hocr).split(".")
    text = [re.sub(r"\n\d+", "", txt).replace("\n", " ") for txt in text]
    text = list(filter(None, text))
    
    page_record = list()
    
    for txt in text:
        if txt.isdigit():
            continue
        line += 1
        page_dict["page"] = page_num
        page_dict["line"] = line
        page_dict["text"] = txt
        page_record.append(page_dict.copy())
    return pd.DataFrame.from_records(page_record)

In [16]:
# # def fltr(txt):
# #     return re.sub(r"\\n[\d+]", " ", txt)
# list(filter(lambda x: re.sub(r"\n\d+", " ", x),text))

In [17]:
# text

In [18]:
# text = get_text_from_hocr(hocr).split(".")
# text = [re.sub(r"\n\d+", "", txt) for txt in text]
# text = list(filter(None, text))
# text

In [19]:
# list(filter(None, text))

In [20]:
# get_text_from_hocr(hocr)

In [21]:
#File Splitting - Doc pre-processing

logger.info(f"Splitting pdf {doc_name} into PNGs")
output_folder = os.path.join(processed_folder, doc_name.split(".")[0])
os.makedirs(output_folder, exist_ok=True)
num_pages = pdfinfo_from_path(file_name)['Pages']
logger.info(
    f"Found {num_pages} pages of {doc_name} to split into PNGs")

image_paths = convert_from_path(file_name,
                                output_folder=output_folder,
                                paths_only=True,
                                fmt="png",
                                output_file="page")
for image_path in image_paths:
    rename_images(image_path, output_folder)

PermissionError: [Errno 13] Permission denied: '/Users/pdas59'

In [4]:
output_folder = os.path.join(processed_folder, doc_name.split(".")[0])

In [5]:
#Getting OCR Results
ocr_df = pd.DataFrame(columns= ["page","line","text", "CODE"])

for page_path in os.listdir(output_folder):
    page_num = int(re.findall(r'\d+', page_path)[0])
    page_img = os.path.join(output_folder, page_path)
    hocr = ocr_hocr(page_img).decode(encoding="utf-8", errors="ignore")
    ocr_df = pd.concat([ocr_df, get_ocr_df(page_num, hocr)])


In [6]:
ocr_df.head()

Unnamed: 0,page,line,text,CODE
0,9,2,Any expenses related to immunomodulators and I...,
1,9,3,Any expenses related to the treatment of slee...,
2,9,4,Services and educational programs for handicaps,
3,9,5,Injuries or illnesses suffered by the Insured...,
4,9,6,Injuries or illnesses suffered by the Insured...,


In [15]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.1/242.1 KB[0m [31m69.1 kB/s[0m eta [36m0:00:00[0m31m67.6 kB/s[0m eta [36m0:00:01[0m
[?25hCollecting et-xmlfile
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
You should consider upgrading via the '/Users/sali115/text_mapping/text_env/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
cpt_df=pd.read_excel("/Users/sali115/text_mapping/data/helper/CPT CODES.xlsx")
cpt_df.head()

Unnamed: 0,CPT_CODE,SHORT_DESCRIPTION,LONG_DESCRIPTION,FULL_DESCRIPTION
0,0001F,HEART FAILURE COMPOSITE,HRT FAILURE ASSESSED,Heart failure assessed (includes assessment of...
1,0005F,OSTEOARTHRITIS COMPOSITE,OSTEOARTHRITIS COMPOSITE,Osteoarthritis assessed (OA) Includes assessme...
2,00100,ANESTH SALIVARY GLAND,ANESTHESIA SALIVARY GLANDS WITH BIOPSY,"Anesthesia for procedures on salivary glands, ..."
3,00102,ANESTH REPAIR OF CLEFT LIP,ANESTHESIA CLEFT LIP INVOLVING PLASTIC REPAIR,Anesthesia for procedures involving plastic re...
4,00103,ANESTH BLEPHAROPLASTY,ANESTHESIA EYELID RECONSTRUCTIVE PROCEDURE,Anesthesia for reconstructive procedures of ey...


In [13]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m131.5 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp39-cp39-macosx_10_11_x86_64.whl (3.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m341.7 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting tqdm>=4.27
  Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Collecting requests
  Using cached requests-2.27.1-py2.py3-none-any.whl (63 kB)
Collecting filelock
  Downloading filelock-3.7.1-py3-none-any.whl (10 kB)
Collecting regex!=2019.12.17
  Downloading regex-2022.6.2-cp39-cp39-macosx_10_9_x86_64.whl (288 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.0/289.0 KB[0m [31m865.8 kB/s[0m

In [3]:
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel

In [20]:
!pip install torch torchvision

Collecting torch
  Using cached torch-1.11.0-cp39-none-macosx_10_9_x86_64.whl (129.9 MB)
Collecting torchvision
  Using cached torchvision-0.12.0-cp39-cp39-macosx_10_9_x86_64.whl (1.2 MB)
Installing collected packages: torch, torchvision
Successfully installed torch-1.11.0 torchvision-0.12.0
You should consider upgrading via the '/Users/sali115/text_mapping/text_env/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

In [4]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
import torch

In [6]:
input_ids = torch.tensor(tokenizer.encode(cpt_df["FULL_DESCRIPTION"].values[0])).unsqueeze(0)

In [7]:
outputs = model(input_ids)

In [8]:
embeddings_of_last_layer=outputs[0]
cls_embeddings=embeddings_of_last_layer[0][0]

In [14]:
len(cls_embeddings)

768

In [9]:
# page_2_df = ocr_df[ocr_df.page == 2]
# page_2_df

In [10]:
def ClinicalBert_embeddings(text):
    
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
    outputs = model(input_ids)
    embeddings_of_last_layer = outputs[0]
    cls_embeddings = embeddings_of_last_layer[0][0]
    
    return cls_embeddings

In [11]:
import gc
cpt_embeddings= dict()
for _,row in cpt_df.iterrows():
    embeddings = ClinicalBert_embeddings(row["FULL_DESCRIPTION"])
    cpt_embeddings[row["CPT_CODE"]] = embeddings
    print(f"Embedding: {embeddings}")
    print(f'Description: {row["FULL_DESCRIPTION"]}')
    break
    del embeddings
    gc.collect()

Embedding: tensor([-1.0589e-01,  2.3349e-01, -1.3264e-01,  8.3456e-02,  1.9767e-01,
        -5.0167e-01,  7.6262e-01, -1.2887e-01,  6.3157e-01, -3.6499e-01,
        -5.4129e-01,  1.7551e-03, -4.9618e-01, -1.0218e-01, -5.6850e-01,
         7.6426e-02,  4.0520e-01, -9.7468e-02, -1.2326e-01, -2.0685e-01,
         5.4379e-02,  3.8240e-01, -4.6297e-02, -8.3208e-01, -3.3165e-01,
         3.3957e-01,  5.4114e-01,  1.7548e-01, -2.0968e-02,  7.1162e-01,
         2.2213e-01,  5.3445e-01, -5.0488e-01,  6.6914e-02,  1.0509e-01,
         1.4191e-01, -3.6755e-02,  3.0241e-02, -3.5191e-01,  3.7172e-01,
         1.3592e-01,  1.7833e-01,  5.7153e-01,  4.2908e-01,  3.3005e-01,
        -4.5765e-01,  2.4992e-01,  6.2640e-01, -3.4049e-01, -2.7678e-02,
         4.7249e-03,  8.2036e-01,  6.8205e-01, -1.3977e-01,  2.6645e-01,
        -1.6644e-01, -3.7232e-01, -1.4518e-01, -1.8354e-01,  1.7428e-01,
         1.6577e-01, -4.2287e-01,  6.1022e-02, -2.0929e-01, -2.9355e-01,
         1.8355e-01,  5.0113e-02,  2.585