<a href="https://colab.research.google.com/github/paulokuriki/anonymize_phi_ocr/blob/main/anonymize_phi_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Searches for patients' Protected Health Information (PHI) burned into images and draws filled rectangles to cover it

Disclaimer: As with all automated techniques, identifying PHI is subject to failures.
A human double-check is always advisable.

<img src="https://raw.githubusercontent.com/paulokuriki/anonymize_phi_ocr/main/sample_512.png">

In [None]:
# installing tesseract-ocr: registers the alex-p/tesseract-ocr PPA,
# refreshes the apt package index, then installs the tesseract-ocr package
!add-apt-repository -y ppa:alex-p/tesseract-ocr
!apt update
!apt install -y tesseract-ocr

# removing tesseract-ocr (optional): uncomment the two lines below to
# unregister the PPA and uninstall the package
#!sudo add-apt-repository ppa:alex-p/tesseract-ocr -r -y
#!apt remove tesseract-ocr --auto-remove

In [None]:
# installing the pytesseract package that is imported further down in this notebook
pip install pytesseract

# STOP!
### After tesseract-ocr has been installed above, restart the kernel at this point so that it is loaded correctly.

### The restart should be done only once

In [None]:
import os
import glob
import time
import multiprocessing
from multiprocessing import Pool

import cv2
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from tqdm.contrib.concurrent import process_map
from tqdm import tqdm

import pytesseract
from pytesseract import Output

In [None]:
# shows each image with matplotlib before and after anonymization
preview_images = True

# uses multiple cores to speed up processing.
# depending on the number of cores it runs much, much faster
# note: when using multiprocessing, the preview option will be disabled
use_multiprocessing = False

# folder containing the original png files
reading_path = '.'

# folder where anonymized images will be saved to
export_path = './ocr_processed'

# final processing report
ocr_report_csv = 'ocr_report.csv'

# number of pixels added on every side of each detected text box
# before it is covered by a filled rectangle
border = 40

# exist_ok avoids the check-then-create race of testing os.path.exists first
os.makedirs(export_path, exist_ok=True)

In [None]:
def calculate_mode(img):
    """Return the most frequent value of each of the first three channels.

    Parameters
    ----------
    img : numpy.ndarray
        Array indexed as ``img[:, :, channel]``; channels 0, 1 and 2 are
        read (blue, green and red for an image loaded with ``cv2.imread``).

    Returns
    -------
    numpy.ndarray
        Float array ``[mode_0, mode_1, mode_2]``.
    """
    modes = []
    for channel in range(3):
        # np.unique is used instead of ``stats.mode(...).mode[0]`` because
        # recent SciPy releases return a scalar for ``axis=None``, which
        # makes the ``[0]`` indexing raise IndexError.
        values, counts = np.unique(img[:, :, channel], return_counts=True)
        # ``values`` is sorted and argmax returns the first maximum, so a
        # tie resolves to the smallest value, as scipy.stats.mode does.
        modes.append(float(values[np.argmax(counts)]))

    return np.array(modes)


def search_and_anonymize(file):
    """Cover every piece of text Tesseract finds in an image and save the result.

    The image is read with OpenCV, passed to ``pytesseract.image_to_data``,
    and each non-blank text box (enlarged by the module-level ``border``) is
    painted over with the per-channel mode color of the image. The
    anonymized image is written to ``export_path`` only when some text was
    found.

    Parameters
    ----------
    file : str
        Path of the image to process.

    Returns
    -------
    tuple
        ``(file, png_file, ocr_text_extracted)`` where ``png_file`` is the
        export path (computed even when nothing is written) and
        ``ocr_text_extracted`` is the found words, each followed by a space
        (empty string when no text was found).

    Raises
    ------
    ValueError
        If OpenCV cannot read ``file``.
    OSError
        If the anonymized image cannot be written.
    """
    img = cv2.imread(file)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError('Could not read image file: {}'.format(file))

    show_preview = preview_images and not use_multiprocessing

    # fill color for cv2.rectangle, passed as a plain tuple of floats.
    # other options: np.mean(img, axis=(0, 1)), np.median(img, axis=(0, 1))
    # or black (0, 0, 0)
    mode_color = tuple(float(c) for c in calculate_mode(img))

    # OpenCV stores pixels as BGR while pytesseract and matplotlib assume RGB
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # searches for the text into the image
    d = pytesseract.image_to_data(rgb, output_type=Output.DICT)

    # optional preview of the original image
    if show_preview:
        plt.imshow(rgb)
        plt.show()

    words = []
    for i, text in enumerate(d['text']):
        word = text.strip()
        if not word:
            continue
        words.append(word)

        left = d['left'][i]
        top = d['top'][i]
        width = d['width'][i]
        height = d['height'][i]

        # draws a filled rectangle (thickness -1) over the text plus border
        cv2.rectangle(img, (left - border, top - border),
                      (left + width + border, top + height + border),
                      mode_color, -1)

    # every word keeps a trailing space, as in the existing report format
    ocr_text_extracted = ''.join(word + ' ' for word in words)

    # defines png export filename
    png_file = os.path.join(export_path, os.path.basename(file))

    # if a text was found, save exported image
    if ocr_text_extracted:
        # cv2.imwrite expects BGR, so the channels are stored in the right order
        if not cv2.imwrite(png_file, img):
            raise OSError('Could not write image file: {}'.format(png_file))

        # optional preview of the anonymized image
        if show_preview:
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            plt.show()

    return (file, png_file, ocr_text_extracted)

In [None]:
# wall-clock reference used to report the total elapsed time
start = time.time()

print('Searching for images...')
files = glob.glob(os.path.join(reading_path, '*.png'))
print('Processing OCR...')

if not use_multiprocessing:
    # one image at a time in this process, which allows previewing images
    results = [search_and_anonymize(path) for path in files]
else:
    # one worker process per CPU core
    results = process_map(search_and_anonymize, files, max_workers=multiprocessing.cpu_count())

# transposes the (original, processed, text) tuples into report columns
processed = {
    'original_image': [item[0] for item in results],
    'processed_image': [item[1] for item in results],
    'ocr_text_extracted': [item[2] for item in results],
}

In [None]:
# saves the report as a csv
df = pd.DataFrame(processed)
df.sort_values(by=['original_image'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_csv(ocr_report_csv, index=False)

print("Finished in", time.time() - start, 'secs')
print('CSV report saved to file:', ocr_report_csv)
df

### Kudos to Filip Zelic & Anuj Sable for the Tesseract Code
https://nanonets.com/blog/ocr-with-tesseract/