In [26]:
# This program will automate the tedious, manual task of inputting a batch of scanned pdfs into an online database (revolutionehr)
# The program will take a batch of pdfs, extract the relevant information, and input it into the database
# We will use pytesseract and pdf2image to extract the information from the pdfs
# We will use the selenium module to automate the process of inputting the data into the database

# Extracting Patient Information from the Batch of PDFs

In [27]:
# Import the necessary modules
from pdf2image import convert_from_path
import pandas as pd
import pytesseract
import re

In [28]:
file_type = 'intake' # 'intake' or 'vf

In [29]:
# Define pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [30]:
# Define the paths
path_to_batch = r"C:\Users\shtey\Downloads\Batch-53.pdf" if file_type == 'intake' else r"C:\Users\shtey\Downloads\Batch-132.pdf"

In [31]:
# Check the number of pages in the batch
pages = convert_from_path(path_to_batch)[:2]
num_pages = len(pages)
print(f"Number of pages in the batch: {num_pages}")

Number of pages in the batch: 2


In [32]:
# Crop the pages so that they only include the top half of the page
for i in range(num_pages):
    pages[i] = pages[i].crop((0, 0, pages[i].width, pages[i].height / 3))

In [33]:
def extract_text(page) -> str:
    """
    Extract the text from a page of a pdf
    """
    return pytesseract.image_to_string(page, lang='eng', config='--psm 6')


pages_text = [extract_text(page) for page in pages]

In [34]:
def extract_information_from_text(text: str, file_type: str) -> list:
    """
    Extract the relevant information from the text of a page
    """
    data_regexes = {}
    if file_type == 'intake':
        data_regexes = {
            'First name': r'First:\s([A-Za-z]+)',
            'Last name': r'Last:\s([A-Za-z]+)',
            'DOB': r'DOB:\s(\d{2}/\d{2}/\d{4})',
            'Sex': r'Sex:\s([A-Za-z]+)',
            'Preferred Phone': r'Preferred:\sCell:\s(\(\d{3}\)\s\d{3}-\d{4})',
            'Address': r'Address:\s(.+?)\n',
            'Provider': r'Provider:\s(.+?)\n',
            'Document Date': r'(\d{2}/\d{2}/\d{4})(?!.*\d{2}/\d{2}/\d{4})'
        }
    elif file_type == 'vf':
        data_regexes = {
            'name': r"NAME:\s+(\w+\s*,\s*\w+)\s+",
            'dob': r"DOB:\s*(\d{2}-\d{2}-\d{4})",
            'screening date': r"Screening DATE:\s*(\d{2}-\d{2}-\d{4})"
        }

    info = []

    for key, regex in data_regexes.items():
        matches = re.findall(regex, text)
        if matches:
            info.append(matches[-1])
        else:
            info.append('***')

    if file_type == 'intake':
        # Set Document Date to UnknownDate if it is the same as the DOB
        if info[-1] == info[1]:
            info[-1] = "UnknownDate"
    elif file_type == 'vf':
        # Split the name into first and last name
        if info[0] != '***':
            first_name = info[0].split(',')[1].strip()
            last_name = info[0].split(',')[0].strip()
            info[0] = first_name
            info.insert(1, last_name)
        else:
            info.insert(1, '***')

    return info

In [35]:
# Create a dataframe that will hold the info for each patient
df = pd.DataFrame(
    columns=['First Name', 'Last Name', 'Date of Birth', 'Sex', 'Preferred Phone', 'Address', 'Provider', 'Document Date', 'Screening Date'])

In [36]:
# Extract the information from each page
for i in range(num_pages):
    data = extract_information_from_text(pages_text[i], file_type)
    if file_type == 'intake':
        data.append('UnknownDate')
    elif file_type == 'vf':
        for j in range(2, 7):
            data.insert(j, '***')

    df.loc[i] = data

In [37]:
df

Unnamed: 0,First Name,Last Name,Date of Birth,Sex,Preferred Phone,Address,Provider,Document Date,Screening Date
0,Alexander,Shilkrot,02/22/2007,Male,(312) 399-8378,55 Fairmount Ave,"Yoo, Eun OD",02/22/2007,UnknownDate
1,Cecilia,Zhang,10/21/1989,Female,***,100 Shearwater Ct East ; d,"Shteyn, Yekaterina OD",10/21/1989,UnknownDate


# Using Selenium to access the database

In [38]:
# Import the necessary modules
from selenium import webdriver

In [39]:
url = "https://revolutionehr.com/static/#/"

In [42]:
# Open the browser
driver = webdriver.Chrome()

ValueError: Timeout value connect was <object object at 0x000001C7B64E3240>, but it must be an int, float or None.