In [5]:
#Dependencies
import numpy as np
import pandas as pd
from PIL import Image, ImageSequence
from pdf2image import convert_from_path, pdfinfo_from_path
import logging as logger
import os
import re
import pytesseract
import bs4

In [6]:
#Configs
raw_folder = "/Users/sali115/text_mapping/data/raw"
doc_name = "Exclusions - non-VIP 5.2.pdf"
file_name = os.path.join(raw_folder, doc_name)
processed_folder = "/Users/sali115/text_mapping/data/processed"

In [7]:
#Utils

def rename_images(image_path, folder_path):
    try:
        page_num = int(re.findall(r"(\d+)\.[a-z]{3}$", image_path)[0])
        new_path = os.path.join(folder_path, f"page{page_num}.png")
        os.rename(image_path, new_path)
        doc_name = folder_path.rstrip("/").split("/")[-1]
        logger.info(f"Saved Image Number {page_num} of {doc_name} at {new_path}")
    except Exception as e:
        logger.error(f"Error in renaming image: {e}")
        
def ocr_hocr(img_path, psm=4):
    return pytesseract.image_to_pdf_or_hocr(
        img_path, extension="hocr", config=f"--psm {psm}"
    )

def get_text_from_hocr(hocr):
    hocr_text = []
    soup = bs4.BeautifulSoup(hocr, "html.parser")
    for line in soup.select(".ocr_line"):
        line_text = re.sub(r"\s+", " ", line.text).strip()
        alphanum_text = re.sub("[^a-zA-Z0-9 \n]", "", line_text).strip()
        if alphanum_text != "":
            hocr_text.append(line_text)
    return "\n".join(hocr_text)

def get_ocr_df(page_num, hocr):
    page_dict = {
        "page": '$page_num',
        "line": '$line',
        "text": '$text',
    }
    line = 1
    text = get_text_from_hocr(hocr).split(".")
    text = [re.sub(r"\n\d+", "", txt).replace("\n", " ") for txt in text]
    text = list(filter(None, text))
    
    page_record = list()
    
    for txt in text:
        if txt.isdigit():
            continue
        line += 1
        page_dict["page"] = page_num
        page_dict["line"] = line
        page_dict["text"] = txt
        page_record.append(page_dict.copy())
    return pd.DataFrame.from_records(page_record)

In [186]:
# # def fltr(txt):
# #     return re.sub(r"\\n[\d+]", " ", txt)
# list(filter(lambda x: re.sub(r"\n\d+", " ", x),text))

In [187]:
# text

In [188]:
# text = get_text_from_hocr(hocr).split(".")
# text = [re.sub(r"\n\d+", "", txt) for txt in text]
# text = list(filter(None, text))
# text

In [189]:
# list(filter(None, text))

In [190]:
# get_text_from_hocr(hocr)

In [4]:
#File Splitting - Doc pre-processing

logger.info(f"Splitting pdf {doc_name} into PNGs")
output_folder = os.path.join(processed_folder, doc_name.split(".")[0])
os.makedirs(output_folder, exist_ok=True)
num_pages = pdfinfo_from_path(file_name)['Pages']
logger.info(
    f"Found {num_pages} pages of {doc_name} to split into PNGs")

image_paths = convert_from_path(file_name,
                                output_folder=output_folder,
                                paths_only=True,
                                fmt="png",
                                output_file="page")
for image_path in image_paths:
    rename_images(image_path, output_folder)

In [9]:
output_folder = os.path.join(processed_folder, doc_name.split(".")[0])

In [10]:
#Getting OCR Results
ocr_df = pd.DataFrame(columns= ["page","line","text", "CODE"])

for page_path in os.listdir(output_folder):
    page_num = int(re.findall(r'\d+', page_path)[0])
    page_img = os.path.join(output_folder, page_path)
    hocr = ocr_hocr(page_img).decode(encoding="utf-8", errors="ignore")
    ocr_df = pd.concat([ocr_df, get_ocr_df(page_num, hocr)])


In [11]:
ocr_df.head()

Unnamed: 0,page,line,text,CODE
0,9,2,Any expenses related to immunomodulators and i...,
1,9,3,Any expenses related to the treatment of slee...,
2,9,4,Services and educational programs for handicaps,
3,9,5,A,
4,9,6,Injuries or illnesses suffered by the Insured...,


In [12]:
cpt_df=pd.read_excel("/Users/sali115/text_mapping/data/helper/CPT CODES.xlsx")
cpt_df.head()

Unnamed: 0,CPT_CODE,SHORT_DESCRIPTION,LONG_DESCRIPTION,FULL_DESCRIPTION
0,0001F,HEART FAILURE COMPOSITE,HRT FAILURE ASSESSED,Heart failure assessed (includes assessment of...
1,0005F,OSTEOARTHRITIS COMPOSITE,OSTEOARTHRITIS COMPOSITE,Osteoarthritis assessed (OA) Includes assessme...
2,00100,ANESTH SALIVARY GLAND,ANESTHESIA SALIVARY GLANDS WITH BIOPSY,"Anesthesia for procedures on salivary glands, ..."
3,00102,ANESTH REPAIR OF CLEFT LIP,ANESTHESIA CLEFT LIP INVOLVING PLASTIC REPAIR,Anesthesia for procedures involving plastic re...
4,00103,ANESTH BLEPHAROPLASTY,ANESTHESIA EYELID RECONSTRUCTIVE PROCEDURE,Anesthesia for reconstructive procedures of ey...


In [13]:
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel

In [14]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
import torch

In [16]:
input_ids = torch.tensor(tokenizer.encode(cpt_df["FULL_DESCRIPTION"].values[0])).unsqueeze(0)

In [17]:
outputs = model(input_ids)

In [18]:
embeddings_of_last_layer=outputs[0]
cls_embeddings=embeddings_of_last_layer[0][0]

In [19]:
len(cls_embeddings)

768

In [20]:
page_2_df = ocr_df[ocr_df.page == 2]
page_2_df

Unnamed: 0,page,line,text,CODE
0,2,2,Emirate of Abu Dhabi Non-VIP Exclusions,
1,2,3,"Healthcare Services, which are not medically ...",
2,2,4,"All expenses relating to dental treatment, de...",
3,2,5,Domiciliary care; private nursing care; care ...,
4,2,6,Custodial care includes a,
5,2,7,Non-medical treatment services; or b,
6,2,8,Health-related services which do not seek to ...,
7,2,9,Services which do not require continuous admi...,
8,2,10,Personal comfort and convenience items (telev...,
9,2,11,Healthcare Services and associated expenses f...,


In [21]:
page_2_df=page_2_df.drop(0)
page_2_df

Unnamed: 0,page,line,text,CODE
1,2,3,"Healthcare Services, which are not medically ...",
2,2,4,"All expenses relating to dental treatment, de...",
3,2,5,Domiciliary care; private nursing care; care ...,
4,2,6,Custodial care includes a,
5,2,7,Non-medical treatment services; or b,
6,2,8,Health-related services which do not seek to ...,
7,2,9,Services which do not require continuous admi...,
8,2,10,Personal comfort and convenience items (telev...,
9,2,11,Healthcare Services and associated expenses f...,
10,2,12,Cosmetic operations which improve physical ap...,


In [22]:
def ClinicalBert_embeddings(text):
    
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
    outputs = model(input_ids)
    embeddings_of_last_layer = outputs[0]
    cls_embeddings = embeddings_of_last_layer[0][0]
    
    return cls_embeddings

In [23]:
# cpt_embeddings= dict()
# for _,row in cpt_df.iterrows():
#     embeddings = ClinicalBert_embeddings(row["FULL_DESCRIPTION"])
#     cpt_embeddings[row["CPT_CODE"]] = embeddings

In [24]:
# !pip install sklearn

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
import pickle

In [29]:
# infile=open("/Users/pdas59/Downloads/1.pickle","rb")
# emb_dict=pickle.load(infile)
# infile.close()

In [30]:
# top_dict=dict()

In [31]:
# code=[]
# for _,row in page_2_df.iterrows():
#     sim_dict=dict()
#     cls_emb=ClinicalBert_embeddings(row["text"])
#     for i in emb_dict:
#         s=cosine_similarity(cls_emb.detach().numpy().reshape(1,-1),emb_dict[i].detach().numpy().reshape(1,-1))[0][0]
#         sim_dict[i]=s
#     code.append(list(sorted(sim_dict.items(),key=lambda x:x[1]))[-1])

In [32]:
# code

In [27]:
new_df=ocr_df[~ocr_df.page.isin([1,10])]
new_df

Unnamed: 0,page,line,text,CODE
0,9,2,Any expenses related to immunomodulators and i...,
1,9,3,Any expenses related to the treatment of slee...,
2,9,4,Services and educational programs for handicaps,
3,9,5,A,
4,9,6,Injuries or illnesses suffered by the Insured...,
...,...,...,...,...
9,2,11,Healthcare Services and associated expenses f...,
10,2,12,Cosmetic operations which improve physical ap...,
11,2,13,Breast reconstruction following a mastectomy ...,
12,2,14,Surgical and non-surgical treatment for obesi...,


In [28]:
new_df1 = new_df[~(new_df.text.str.contains("Non-VIP") | new_df.text.str.contains("Pharmacy Exclusions"))]

In [36]:
# code=[]
# for _,row in new_df1.iterrows():
#     sim_dict=dict()
#     cls_emb=ClinicalBert_embeddings(row["text"])
#     for i in emb_dict:
#         s=cosine_similarity(cls_emb.detach().numpy().reshape(1,-1),emb_dict[i].detach().numpy().reshape(1,-1))[0][0]
#         sim_dict[i]=s
#     code.append(list(sorted(sim_dict.items(),key=lambda x:x[1]))[-1])

In [37]:
# len(code)

In [35]:
new_df1

Unnamed: 0,page,line,text,CODE
0,9,2,Any expenses related to immunomodulators and i...,
1,9,3,Any expenses related to the treatment of slee...,
2,9,4,Services and educational programs for handicaps,
3,9,5,A,
4,9,6,Injuries or illnesses suffered by the Insured...,
...,...,...,...,...
9,2,11,Healthcare Services and associated expenses f...,
10,2,12,Cosmetic operations which improve physical ap...,
11,2,13,Breast reconstruction following a mastectomy ...,
12,2,14,Surgical and non-surgical treatment for obesi...,


In [29]:
overall_code=[]
for file_name in os.listdir("/Users/sali115/text_mapping/data/processed/pickle"):
    infile=open(os.path.join("/Users/sali115/text_mapping/data/processed/pickle",file_name),"rb")
    pkl_dict=pickle.load(infile)
    infile.close()
    code=[]
    for _,row in new_df1.iterrows():
        sim_dict=dict()
        cls_emb=ClinicalBert_embeddings(row["text"])
        for i in pkl_dict:
            s=cosine_similarity(cls_emb.detach().numpy().reshape(1,-1),pkl_dict[i].detach().numpy().reshape(1,-1))[0][0]
            sim_dict[i]=s
        code.append(list(sorted(sim_dict.items(),key=lambda x:x[1]))[-1])
    overall_code.append(code)

In [34]:
sim_dict

{'73030': 0.71515155,
 '73040': 0.76858157,
 '73050': 0.7675899,
 '73060': 0.72997016,
 '73070': 0.7105893,
 '73080': 0.7272713,
 '73085': 0.77351654,
 '73090': 0.69786316,
 '73092': 0.75083447,
 '73100': 0.7022052,
 '73110': 0.7200954,
 '73115': 0.77507746,
 '73120': 0.7020019,
 '73130': 0.72069526,
 '73140': 0.7487508,
 '73200': 0.76089895,
 '73201': 0.7621031,
 '73202': 0.7645806,
 '73206': 0.79179114,
 '73218': 0.7608068,
 '73219': 0.7577073,
 '73220': 0.7740932,
 '73221': 0.7552671,
 '73222': 0.7578173,
 '73223': 0.7760589,
 '73225': 0.76937556,
 '73501': 0.7068284,
 '73502': 0.7466437,
 '73503': 0.73154783,
 '73521': 0.7184992,
 '73522': 0.74625057,
 '73523': 0.7393962,
 '73525': 0.7721214,
 '73551': 0.6763139,
 '73552': 0.7118989,
 '73560': 0.7199042,
 '73562': 0.69641805,
 '73564': 0.7379331,
 '73565': 0.73862743,
 '73580': 0.7630634,
 '73590': 0.69222814,
 '73592': 0.7506412,
 '73600': 0.70067894,
 '73610': 0.7167757,
 '73615': 0.7715854,
 '73620': 0.7004035,
 '73630': 0.72140

In [36]:
len(overall_code)

20

In [37]:
final_code=[]
for num_text in range(len(overall_code[0])):
    L=[]
    for num_pkl in range(len(overall_code)):
        L.append(overall_code[num_pkl][num_text])
    final_code.append(sorted(L,key=lambda x:x[1])[-1][0])

In [38]:
new_df1["CODE"]=final_code
new_df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df1["CODE"]=final_code


Unnamed: 0,page,line,text,CODE
0,9,2,Any expenses related to immunomodulators and i...,32940
1,9,3,Any expenses related to the treatment of slee...,94774
2,9,4,Services and educational programs for handicaps,4305F
3,9,5,A,84630
4,9,6,Injuries or illnesses suffered by the Insured...,59855


In [50]:
new_df1.text.values[4]

' Injuries or illnesses suffered by the Insured Person as a result of military operations of whatever type'

In [51]:
new_df1.CODE.values[4]

'27759'