In [5]:
from pdf2image import convert_from_path
import cv2
import tempfile
import numpy as np
import pandas as pd
import pickle
import pytesseract
import matplotlib.pyplot as plt

pdf_file="sarcoma_codes.pdf"

table_ocrs=[]
with tempfile.TemporaryDirectory() as path:
    images_from_path = convert_from_path( pdf_file, output_folder=path)
    for image in images_from_path:
        img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
        
        #img = cv2.rotate(img, cv2.cv2.ROTATE_90_CLOCKWISE)
        thresh, img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        img_bin = 255 - img_bin

        kernel_len = np.array(img).shape[1] // 100

        ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
        hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

        image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
        vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)

        image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
        horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)

        img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    
        img_vh = cv2.erode(~img_vh, kernel, iterations=2)

        thresh, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

        bitxor = cv2.bitwise_xor(img, img_vh)
        bitnot = cv2.bitwise_not(bitxor)
        contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        boundingBoxes = [cv2.boundingRect(c) for c in contours]
        (contours, boundingBoxes) = zip(*sorted(zip(contours, boundingBoxes),
                                            key=lambda b: b[1][1], reverse=False))

        box = []
        for c in contours:
            x, y, w, h = cv2.boundingRect(c)
            image = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
            box.append([x, y, w, h])

        row = []
        column = []
        j = 0

        for i in range(len(box)):

            if (i == 0):
                column.append(box[i])
                previous = box[i]

            else:
                if (box[i][1] == previous[1] ):

                    column.append(box[i])
                    previous = box[i]

                    if (i == len(box) - 1):
                        row.append(column)

                else:
                    row.append(column)
                    column = []
                    previous = box[i]
                    column.append(box[i])


        countcol = 0
        for i in range(len(row)):
            countcol = len(row[i])
            if countcol > countcol:
                countcol = countcol


        center = [int(row[i][j][0] + row[i][j][2] / 2) for j in range(len(row[i])) if row[0]]

        center = np.array(center)
        center.sort()

        finalboxes = []
        for i in range(len(row)):
            lis = []
            for k in range(countcol):
                lis.append([])
            for j in range(len(row[i])):
                diff = abs(center - (row[i][j][0] + row[i][j][2] / 4))
                minimum = min(diff)
                indexing = list(diff).index(minimum)
                lis[indexing].append(row[i][j])
            finalboxes.append(lis)

        outer = []
        for i in range(len(finalboxes)):
            for j in range(len(finalboxes[i])):
                inner = ''
                if (len(finalboxes[i][j]) == 0):
                    outer.append(' ')
                else:
                    for k in range(len(finalboxes[i][j])):
                        y, x, w, h = finalboxes[i][j][k][0], finalboxes[i][j][k][1], finalboxes[i][j][k][2], \
                                     finalboxes[i][j][k][3]
                        finalimg = bitnot[x:x + h, y:y + w]
                        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
                        border = cv2.copyMakeBorder(finalimg, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=[255, 255])
                        resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                        dilation = cv2.dilate(resizing, kernel, iterations=1)
                        erosion = cv2.erode(dilation, kernel, iterations=2)

                        out = pytesseract.image_to_string(erosion)
                        if (len(out) == 0):
                            #out = pytesseract.image_to_string(erosion, config='--psm 3')
                            out = pytesseract.image_to_string(erosion)
                        inner = inner + " " + out
                    outer.append(inner)

        arr = np.array(outer)
        dataframe = pd.DataFrame(arr.reshape(len(row), countcol))
        table_ocrs.append(dataframe)
sarcoma_df=pd.concat(table_ocrs, axis=0)

sarcoma_df=(sarcoma_df.applymap(lambda x: x.replace("\n"," ")))
sarcoma_df=(sarcoma_df.applymap(lambda x: x[:-2]))
sarcoma_df=sarcoma_df[~sarcoma_df[0].str.contains(" Morphological codes ")]
sarcoma_df=(sarcoma_df.applymap(lambda x: x.replace("NOS","")))
sarcoma_df=(sarcoma_df.applymap(lambda x: x.replace(",","")))
sarcoma_df=sarcoma_df.reset_index(drop=True)

tumour_category=[" Adipocytic tumours"," Fibroblastic/myofibroblastic tumours",
              " So-called fibrohistiocytic tumours"," Smooth muscle tumours"," Skeletal muscle tumours",
             " Vascular tumours"," Nerve sheath tumours",
              " Chondro-osseus tumours"," Tumours of uncertain differentiation"]
tumour_type=[" Intermediate (locally aggressive)"," Intermediate (rarely metastasizing)", " Malignant"]


sarcoma_df["tumour_category"]=""
category_indices=sarcoma_df.index[sarcoma_df[0].isin(tumour_category) ].tolist()
category_indices.append(sarcoma_df.shape[0])
ranges=list(zip(category_indices[:-1], category_indices[1:]))

for enum,(start, end) in enumerate(ranges):
    sarcoma_df["tumour_category"].iloc[start:end]=sarcoma_df[0].iloc[start]

sarcoma_df["tumour_type"]=""
type_indices=sarcoma_df.index[sarcoma_df[0].isin(tumour_type) ].tolist()
type_indices.append(sarcoma_df.shape[0])
ranges=list(zip(type_indices[:-1], type_indices[1:]))
for enum,(start, end) in enumerate(ranges):
    sarcoma_df["tumour_type"].iloc[start:end]=sarcoma_df[0].iloc[start]
indexes_to_keep = set(range(sarcoma_df.shape[0])) - set(type_indices+category_indices)
sarcoma_df = sarcoma_df.take(list(indexes_to_keep))
sarcoma_df=sarcoma_df.reset_index(drop=True)
sarcoma_df=sarcoma_df.drop([2,3], axis=1)
sarcoma_df=sarcoma_df.drop(index=0)
sarcoma_df=sarcoma_df.rename(columns={0: "subtype", 1: "code", 2: "subtype-detail"})

In [21]:
sarcoma_df["subtype"]=(sarcoma_df["subtype"].apply(lambda x: x.lower()))
sarcoma_subtypes=sarcoma_df["subtype"].values.tolist()

In [22]:
import json
from collections.abc import Iterable

def load_config_file(nfile, abspath=False):

    ext = '.json' if 'json' not in nfile else ''
    pre = '' if abspath else './'
    fp = open(pre + nfile + ext, 'r')

    s = ''

    for l in fp:
        s += l

    return json.loads(s)


def flatten(l):
    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el


In [23]:
tumour_types=load_config_file("subtypes.json")
tumour_types = pd.DataFrame(tumour_types)
#benign_subtypes=[el.lower() for el in list(flatten(tumour_types.values.tolist())) if isinstance(el, str)]


Unnamed: 0,Adipocytic tumours,Fibroblastic/myofibroblastic tumours,So-called fibrohistiocytic tumours,Smooth-muscle tumours,Skeletal-muscle tumours,Nerve sheath tumours,Chondro-osseus tumours,Gastrointestinal stromal tumours,Tumours of uncertain differentiation,Undifferentiated / unclassified sarcomas,Notochordal_tumours,Other not sarcomas,Uncertain morphology
Benign,"[Fibrolipoma, Lipomatosis, Lipomatosis of nerv...","[Nodular fasciitis, Proliferative fasciitis, P...","[Tenosynovial giant cell tumour, Tenosynovial ...","[Leiomyoma, smooth muscle neoplasm, smooth mus...",[Rhabdomyoma],"[Schwannoma, Melanotic schwannoma, Neurofibrom...",,[Benign gastrointestinal stromal (tumour|tumor)],"[Acral fibromyxoma, Intramuscular myxoma, Juxt...",,"[Benign notochordal tumour, Benign notochordal...",[epidermoid cyst],
Intermediate (rarely metastasizing),,"[Solitary fibrous tumour, Solitary fibrous tum...",,,,,,,,,,,
Malignant,[myxoid sarcoma],,,,"[Epithelioid hemangioendothelioma, Angiosarcom...",[Neuroblastoma],[chondrosarcoma],"[Gastrointestinal stromal tumour, Gastrointest...","[synovial sarcoma, Pleomorphic dermal sarcoma]","[spindle cell sarcoma, pleomorphic sarcoma, ro...",[Chordoma],"[Melanoma, Adenosarcoma, Adenoid cystic carcin...",
Not identified,,,,,,,,,,,,,[uncertain]
other,,,,,,,,,,,,"[rheumatoid arthritis, endometriosis, fibrosis...",


In [34]:
#subtypes=sarcoma_df["subtype"].values.tolist()+ tumour_types

In [26]:
tumour_category=[]
tumour_subtypes=[]
for i, columns in tumour_types.iterrows():
    for column in columns:
           if isinstance(column, list):
                tumour_subtypes.extend(column)
                tumour_category.extend([i]*len(column))
               

In [27]:
benign_tumours=np.vstack((tumour_subtypes, tumour_category))

In [28]:
benign_tumours=pd.DataFrame(np.transpose(benign_tumours), columns=["subtype","tumour_type"])

In [29]:
sarcoma_df=sarcoma_df[["subtype", "tumour_type"]]

frames = [sarcoma_df, benign_tumours]
  
soft_tissue_tumours = pd.concat(frames)


In [30]:
soft_tissue_tumours["subtype"]=soft_tissue_tumours["subtype"].apply(lambda x: x.lower())

In [16]:

soft_tissue_tumours.to_csv("soft_tissue_tumours.csv")

In [40]:
subtype_regex=("|".join(soft_tissue_tumours["subtype"].values.tolist()))

In [41]:
subtype_regex

' atypical lipomatous tumour| well-differentiated liposarcoma| dedifferentiated liposarcoma| myxoid liposarcoma| pleomorphic liposarcoma| liposarcoma not otherwise specified| palmar/plantar type fibromatosis| desmoid-type fibromatosis| lipofibromatosis| giant cell fibroblastoma| dermatofibrosarcoma protuberans| fibrosarcomatous dermatofibrosarcoma protuberans| pigmented dermatofibrosarcoma protuberans| solitary fibrous tumour| solitary fibrous tumour malignant| inflammatory myofibroblastic sarcoma| low grade myofibroblastic sarcoma| myxoinflammatory fibroblastic sarcoma| infantile fibrosarcoma| adult fibrosarcoma| myxofibrosarcoma| low-grade fibromyxoid sarcoma| sclerosing epithelioid fibrosarcoma| plexiform fibrohistiocytic tumour| giant cell tumour of soft tissue| leiomyosarcoma| embryonal rhabdomyosarcoma (including botryoid anaplastic)| alveolar rhabdomyosarcoma (including solid anaplastic)| pleomorphic rhabdomyosarcoma| spindle cell/sclerosing rhabdomyosarcoma| kaposiform haemangi

In [42]:
def rotateImage(cvImage, angle: float):
    newImage = cvImage.copy()
    (h, w) = newImage.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    newImage = cv2.warpAffine(newImage, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return newImage


def getSkewAngle(cvImage) -> float:
    # Prep image, copy, convert to gray scale, blur, and threshold
    
    newImage = cvImage.copy()
    gray = cv2.cvtColor(newImage, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Apply dilate to merge text into meaningful lines/paragraphs.
    # Use larger kernel on X axis to merge characters into single line, cancelling out any spaces.
    # But use smaller kernel on Y axis to separate between different blocks of text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=5)

    # Find all contours
    contours, hierarchy = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key = cv2.contourArea, reverse = True)

    # Find largest contour and surround in min area box
    largestContour = contours[0]
    minAreaRect = cv2.minAreaRect(largestContour)

    # Determine the angle. Convert it to the value that was originally used to obtain skewed image
    angle = minAreaRect[-1]
    if angle < -45:
        angle = 90 + angle
    return -1.0 * angle

# Deskew image
def deskew(cvImage):
    angle = getSkewAngle(cvImage)
    return rotateImage(cvImage, -1.0 * angle)

In [43]:
from pdfreader import pdfReader, medicalDb
import os
import json
import pickle
import re
import os
from pdf2image import convert_from_path, exceptions
import pytesseract
import pandas as pd


pdf_dir="medical_reports"

pdf_files=os.listdir(pdf_dir)

In [44]:
from nltk.corpus import wordnet
import collections
synonyms = collections.defaultdict(list)


duplicates=[]
words_of_interest=["highly","represent","suggestive","suggest","features"]
for word in words_of_interest:
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            duplicates.append(l.name())
           
    synonyms[word]=(list(set(duplicates)))


expressions = [
    "(?:(?:features|findings|this|tumour)\s*(?:are|is)\s*consistent\s*with|"
    "(?:would\s*be|are|is)(?!not)(?:.*?)(?:in\s*keeping\s*with\s*)|"
    "features\s*(?:.*?)are\sof\s*|"
    "(features|diagnosis)(?!not)(?:.*?)(?:{})*".format("|".join(synonyms["suggest"]))+"|"
    "(is|are)\s*interpreted\s*as\s*(?!.*negative)|"
    "(?:could|might|might\s*possibly|most\s*likely|may)\s*(?!not)(?:{})".format("|".join(synonyms["represent"]))+"|"
    "(?:features|findings|this|tumour|morphology)\s*(?:are|is)\s*(?!not)(?:.*?)\s*(?:suggestive)\s*of|"
    "(?:{})\s*(?:suspicious)\s*of".format("|".join(synonyms["highly"]))+"|"
    "features\s*(?:.*?)(?:favor|favoring|favouring)|"
    "(?:features|morphology)\s*(?:in keeping\s*with\s*))"
    "(.*?(;|\.)+?)"]



In [48]:
expressions = [
    "(?:features|findings|this|tumour)\s*(?:are|is)\s*consistent\s*with(.*?(;|\.)+?)",
    "(?:would\s*be|are|is)(?!not)(?:.*?)(?:in\s*keeping\s*with\s*(.*?(;|\.)+?))",
    "features\s*(?:.*?)are\sof\s*(.*?(;|\.)+?)",
    "features\s*(?:.*?)(?:favor|favoring|favouring)(.*?(;|\.)+?)",
    "(?:features|morphology)\s*(?:in keeping\s*with\s*)(.*?(;|\.)+?)",
    "(features|diagnosis)(?!.*not)(?:.*?)(?:{})*(.*?(;|\.)+?)".format("|".join(synonyms["suggest"])),
    "(?:features|findings|this|tumour|morphology)\s*(?:are|is)\s*(?!.*not)(?:.*?)\s*(?:suggestive)\s*of(.*?(;|\.)+?)",
    "(is|are)\s*interpreted\s*as\s*(?!.*negative)(.*?(;|\.)+?)",
    "(?:could|might|might\s*possibly|most\s*likely|may)\s*(?!.*not)(?:{})(.*?(;|\.)+?)".format("|".join(synonyms["represent"])),
    "(?:{})\s*(?:suspicious)\s*of(.*?(;|\.)+?)".format("|".join(synonyms["highly"])), 
   ]


In [76]:
from PIL import Image
import nltk
import cv2
import itertools

patient_records=[]


pdf_dir="medical_reports"

pdf_files=os.listdir(pdf_dir)

regex=[r'([A-Z]?[0-9]{5,12})',
       "(\d{4}(:?(/|\s))\d{2})",
       r"(?:Sex|\d{1,2} (?:YEAR-OLD|YEAR OLD|year old))+ (\w)",
       r'(\d{1,2}) (?:YEAR-OLD|YEAR OLD|year old)+',
       r'grade (\d{1,2})']


for pdf_file in pdf_files:
    #file_name=(os.path.splitext(os.path.basename(pdf_file))[0])
    
    keys = ["Patient","Lab No","Sex","Age","Grade"]
    patient_dir = collections.defaultdict(list)
        
    text=''
    if pdf_file.endswith(".pdf"):
            try:
                    images=convert_from_path(os.path.join(pdf_dir,pdf_file))
                    for image in images:
                        text += str(pytesseract.image_to_string(image))
            except exceptions.PDFPageCountError:
                    raise ValueError('Exception: PDFPageCountError for %s' % path)
    
    elif pdf_file.endswith(".jpg"):     
            img = cv2.imread(os.path.join(pdf_dir,pdf_file))

            rotated=deskew(img)

            text += str(pytesseract.image_to_string(rotated))
            text=text.replace("\n"," ")
    else:
            print ("The file does not have the required format {}".format(pdf_file))
            continue
            
    for enum,k in enumerate(keys):
        try:
                patient_dir[k]=re.search(regex[enum], text, re.MULTILINE | re.DOTALL).group(1)
                
        except AttributeError:
                print ("There is no information available in {} for field {}".format(pdf_file,k))
            
    if not patient_dir["Patient"]:
                     patient_dir["Patient"]=os.path.splitext(pdf_file)[0]
    
    patient_dir["report_file"]=os.path.join(os.path.join(pdf_dir,pdf_file))
        
    sentences = nltk.tokenize.sent_tokenize(text)
    
    for sentence, expression in itertools.product(sentences, expressions):
            try:
                match=re.search(expression, sentence, re.MULTILINE | re.DOTALL).group(0)
                match=match.replace("well- differentiated liposarcoma","well-differentiated liposarcoma")

                if match:
                    subtype=re.search(subtype_regex,match.lower(), re.MULTILINE | re.DOTALL).group(0)
                    if subtype:
                        patient_dir["Diagnosis"].append(subtype)
                        break 
            except AttributeError:
                    pass
    patient_records.append(patient_dir)

There is no information available in 746516.jpg for field Grade
There is no information available in 752384.jpg for field Grade
There is no information available in 742180.pdf for field Grade
There is no information available in 736259.pdf for field Age
There is no information available in 737695.pdf for field Lab No
There is no information available in 737695.pdf for field Sex
There is no information available in 737695.pdf for field Age
There is no information available in 737695.pdf for field Grade
There is no information available in 740278.jpg for field Grade
There is no information available in 749189.pdf for field Grade
There is no information available in 740690.pdf for field Lab No
There is no information available in 740690.pdf for field Grade
There is no information available in 574148.pdf for field Grade
There is no information available in 745337.pdf for field Grade
There is no information available in 749815.jpg for field Grade
There is no information available in 667098.

There is no information available in 731916.jpg for field Grade
There is no information available in 746293.jpg for field Grade
There is no information available in 749860.jpg for field Grade
There is no information available in 740567.jpg for field Grade
There is no information available in 751277.jpg for field Grade
There is no information available in 615457.pdf for field Sex
There is no information available in 615457.pdf for field Age
There is no information available in 588674.jpg for field Grade
There is no information available in 740094.pdf for field Grade
There is no information available in 748839.jpg for field Grade
There is no information available in 686845.jpg for field Grade
There is no information available in 746364.jpg for field Grade
There is no information available in 742392.jpg for field Grade
There is no information available in 750836.jpg for field Grade
There is no information available in 656644.jpg for field Grade
There is no information available in 749825.

In [77]:
records=pd.DataFrame(patient_records)

In [78]:
patient_records

[defaultdict(list,
             {'Patient': '746516',
              'Lab No': '4066 20',
              'Sex': 'F',
              'Age': '21',
              'report_file': 'medical_reports/746516.jpg'}),
 defaultdict(list,
             {'Patient': '741984',
              'Lab No': '2392 20',
              'Sex': 'M',
              'Age': '59',
              'Grade': '2',
              'report_file': 'medical_reports/741984.pdf',
              'Diagnosis': [' leiomyosarcoma']}),
 defaultdict(list,
             {'Patient': '726249',
              'Lab No': '8150/20',
              'Sex': 'M',
              'Age': '81',
              'Grade': '3',
              'report_file': 'medical_reports/726249.jpg',
              'Diagnosis': ['fibrosis']}),
 defaultdict(list,
             {'Patient': '752384',
              'Lab No': '8805/20',
              'Sex': 'M',
              'Age': '68',
              'report_file': 'medical_reports/752384.jpg',
              'Diagnosis': ['spindle cell sar

In [79]:
records["Diagnosis"]=records["Diagnosis"].apply(lambda x:list(set(x)) if isinstance(x,list) else x)

records["Diagnosis"]=records["Diagnosis"].apply(lambda x:",".join(x) if isinstance(x,list) else x)

In [80]:
records["Diagnosis"]=records["Diagnosis"].apply(lambda x:x[0] if isinstance(x,list) else x)

In [81]:
records

Unnamed: 0,Age,Diagnosis,Grade,Lab No,Patient,Sex,report_file
0,21,,,4066 20,746516,F,medical_reports/746516.jpg
1,59,leiomyosarcoma,2,2392 20,741984,M,medical_reports/741984.pdf
2,81,fibrosis,3,8150/20,726249,M,medical_reports/726249.jpg
3,68,spindle cell sarcoma,,8805/20,752384,M,medical_reports/752384.jpg
4,61,schwannoma,,2180: 10,742180,F,medical_reports/742180.pdf
5,,lipoma,1,0604/20,736259,F,medical_reports/736259.pdf
6,,lipoma,,,737695,,medical_reports/737695.pdf
7,71,leiomyosarcoma,3,3384/20,678367,F,medical_reports/678367.jpg
8,65,myofibroblastoma,,3764 20,740278,F,medical_reports/740278.jpg
9,22,desmoid-type fibromatosis,,6395 20,749189,F,medical_reports/749189.pdf


In [82]:
no_definite_diagnosis=records.loc[records["Diagnosis"].isna()]

In [85]:
no_definite_diagnosis["Patient"].values.tolist()

['746516',
 '752665',
 '380577',
 '570034',
 '559279',
 '751516',
 '742763',
 '748127',
 '6129041934',
 '751024',
 '740069',
 '749805',
 '752662',
 '746526',
 '665253',
 'H14404',
 '698711',
 'M89313',
 '4567206215',
 '733266',
 '4688493924',
 '739357',
 '749498',
 '630043',
 '747132',
 '749835',
 '753040',
 '747860',
 '752446',
 '728569',
 '749815',
 '744378',
 '740093',
 '662198',
 '751165']

In [50]:
print (len(no_definite_diagnosis["Patient"].values.tolist()))

35


In [145]:
import pandas as pd
biomarkers=load_config_file("immuno.json")
biomarker_pd=pd.DataFrame(biomarkers)
biomarkers=[el.lower() for el in list(flatten(biomarker_pd.values.tolist())) if isinstance(el, str)]

In [146]:
biomarker_pd["Muscle cells"]["immuno markers"]

['Actin', 'Caldesmon', 'Desmin', 'Myoglobin', 'Myogenin', 'myod1', '\\bSMA\\b']

In [147]:

fusion_proteins=pd.read_csv("fusion_genes.txt", sep='\t')
biomarkers=biomarkers+fusion_proteins["IHC MARKER"].values.tolist()
biomarkers=[el.lower() for el in biomarkers]

In [148]:
biomarkers= ('|'.join(biomarkers))

In [149]:
biomarkers_regex = "({})".format(biomarkers)

In [150]:
biomarkers_regex


'(alpha fetoprotein|oct3/4|sall4|cd30|placental alkaline phosphatase (plap)|cytokeratin|ema|actin|caldesmon|desmin|myoglobin|myogenin|myod1|\\bsma\\b|cd10|cd45|cd79a|cd15|cd1a|cd68|myeloperoxidate|tdt|cd21|cd23|cd35|cd117|cd138|von willebrand factor|cd31|cd34|cdk4|ck5/6|erg|neuron-specific enolase|cd56|cd57|pgp5.5|synaptophysin|chromogranin|neuron|neurofilaments|s100|hmb45|mitf|melana|dog1|kit|pax8|beta-catenin|mdm2|smarcb1|sdh|brachyury|p63|gfap|hmga2|h3g34w|h3k36m|pax3|tle1|f13a|pgr| er|stat6|bgr1|ini1|p16|calretinin|cam5.2|sox10|state|k7|k8|k18|k19|k13|k14|k17|k20|ae1/ae3|fli1|erg|ap1β|tle1|alk|alk|ros1|nr4a3|bcl-2|wt1|myc|nut|bcl6|tfe3|zap70|muc4|ccnb3|muc4)'

In [117]:
#weak_positivity= "There is (.*(scanty|weak)).*?(expression|positivity)|weak staining for [a-zA-Z0-9&._$-]+"
negative=["\s*negative for\s*([^.]*)",
            "(.*?)are negative",
            "(.*?)interpreted as negative"]

In [151]:
positive=["(?:[A-Za-z]+){1,7}positive for\s*(.*?(while|with|negative)|([^.|;]*))",
           "(?:t|T)here is.*?(expression|positivity)(\s*of)*(.*?(while|with|negative)|([^.|;]*))",
           "(.*?)\s(?:shows|show)\s.*?expression(\s*of)*(.*?(while|with|negative)|([^.|;]*))",
           "(?:[A-Za-z]+ ){1,4}\s(?:is|appears)\s(.*?)positive",
           "(?:[A-Za-z]+ ){1,4}\s(?:stain(s)*\s[^.|;]*)",
            "(.*?)\s(?:show(s)*(.*?)staining)",
            "with(.*?)(expression|positivity)(\sof)*[^.|;]*",
            "(?:[A-Za-z]+ ){1,4}(?:appear(s)*)(.*?)retained in nuclei"]

significance_levels=["moderate(ly)*","weak(ly)*","weak(ly)* to moderate(ly)*",
                     "moderate(ly)* to strong(ly)*","scanty", "strong(ly)*",
                     "of uncertain significance","some","focal(ly)*","granular",
                     "nuclear","multifocal","cytoplasmic",
                     "diffuse(ly)*","scattered","peripheral"]


In [152]:
biomarkers_regex


'(alpha fetoprotein|oct3/4|sall4|cd30|placental alkaline phosphatase (plap)|cytokeratin|ema|actin|caldesmon|desmin|myoglobin|myogenin|myod1|\\bsma\\b|cd10|cd45|cd79a|cd15|cd1a|cd68|myeloperoxidate|tdt|cd21|cd23|cd35|cd117|cd138|von willebrand factor|cd31|cd34|cdk4|ck5/6|erg|neuron-specific enolase|cd56|cd57|pgp5.5|synaptophysin|chromogranin|neuron|neurofilaments|s100|hmb45|mitf|melana|dog1|kit|pax8|beta-catenin|mdm2|smarcb1|sdh|brachyury|p63|gfap|hmga2|h3g34w|h3k36m|pax3|tle1|f13a|pgr| er|stat6|bgr1|ini1|p16|calretinin|cam5.2|sox10|state|k7|k8|k18|k19|k13|k14|k17|k20|ae1/ae3|fli1|erg|ap1β|tle1|alk|alk|ros1|nr4a3|bcl-2|wt1|myc|nut|bcl6|tfe3|zap70|muc4|ccnb3|muc4)'

In [177]:
import os
from pdf2image import convert_from_path
import pytesseract
import nltk
import re
import collections
import cv2

pdf_dir="medical_reports"
positive_biomarkers=collections.defaultdict(list)
negative_biomarkers=collections.defaultdict(list)
pdf_files=os.listdir(pdf_dir)

for pdf_file in pdf_files:
   
    text=''
    if pdf_file.endswith(".pdf"):
            try:
                    images=convert_from_path(os.path.join(pdf_dir,pdf_file))
                    for image in images:
                        text += str(pytesseract.image_to_string(image))
                        text=text.replace("\n"," ")
            except exceptions.PDFPageCountError:
                    raise ValueError('Exception: PDFPageCountError for %s' % path)
    
    elif pdf_file.endswith(".jpg"):
            
            img = cv2.imread(os.path.join(pdf_dir,pdf_file))
            rotated=deskew(img)
            text += str(pytesseract.image_to_string(rotated))
            text=text.replace("\n"," ")
    text=text.replace("h- caldesmon","caldesmon")
    text=text.replace("$100","S100")
    text=text.replace("$OX10","SOX10")
    sentences = nltk.tokenize.sent_tokenize(text)
    patient_name=os.path.splitext(pdf_file)[0]
    
    for sentence in sentences:
        sentence=sentence.replace("(","")
        sentence=sentence.replace(")","")
        for positive_regex in positive:
            try:
                match=re.search(positive_regex, sentence, re.MULTILINE | re.DOTALL).group(0)
                 
                if match:                     
                    positive_biomarker=(list("".join(x) for x in re.findall(biomarkers_regex,match.lower())))
              
                    positive_biomarkers[patient_name].append(positive_biomarker)
#                     for marker in positive_biomarker:
#                         antigen_regex= r'(?:[A-Za-z]+ ){1,4}\b%s\b' % (marker.lower())
                        
#                         for significance in significance_levels:
                            
#                             previous_words= (re.findall(antigen_regex,match.lower()))
                            
                        
            except AttributeError:
                pass
        for negative_regex in negative:
            try:
                match=re.search(negative_regex, sentence, re.MULTILINE | re.DOTALL).group(0)
                if match:
                    negative_biomarker=(list("".join(x) for x in re.findall(biomarkers_regex,match.lower())))
                    negative_biomarkers[patient_name].append(negative_biomarker)
            except AttributeError:
                pass
            
            
    


In [178]:
with open('positive.json', 'w') as fp:
    json.dump(positive_biomarkers, fp)

In [179]:
with open('negative.json', 'w') as fp:
    json.dump(negative_biomarkers, fp)