# FDA Reading Room 483 Analysis

In [None]:
from datetime import date

# Stamp the notebook output with today's date (MM/DD/YYYY) so a reader can
# tell when the cells below were last executed.
run_date = date.today().strftime("%m/%d/%Y")
print("Script last ran on {}".format(run_date))

In [None]:
# import packages
import numpy as np
import pandas as pd
import requests
import lxml
import html5lib 
from bs4 import BeautifulSoup
import janitor
import tempfile

# selenium functions
from selenium import webdriver
from selenium.webdriver.common.keys import Keys #allow  you to enter keystrokes into fields
from selenium.webdriver.support.ui import Select #allow you to select a dropdown item
from selenium.webdriver.support.ui import WebDriverWait #lets you modify a field before proceeding
from selenium.common.exceptions import NoSuchElementException 

## Step 1: Scrape Data Table and List of URLs to Form 483s from URL

__Establishment Types to Filter By__  
Outsourcing Facility, Producer of Sterile and Non Sterile Drug Products, Producer of Sterile Drug Products, Producer of Non Sterile Drug Products, Manufacturer

In [None]:
# Address of the FDA ORA FOIA Electronic Reading Room page that the cells below scrape.
url = 'https://www.fda.gov/about-fda/office-regulatory-affairs/ora-foia-electronic-reading-room'
# Start a Chrome session through Selenium; `driver` is reused by the later cells.
driver = webdriver.Chrome()
# Load the page in the browser so its table can be read from the live DOM.
driver.get(url)

The records table on this page has the CSS classes `lcsd-datatable--ora-foia-reading table table-bordered dataTable no-footer dtr-inline`.

In [None]:
# Scrape the paginated records table into `headers` (column names) and `rows`
# (one list of cell strings per table row, across every page).
#
# `By` supplies the locator strategies for find_element()/find_elements();
# the find_element_by_* helpers were removed in Selenium 4.
from selenium.webdriver.common.by import By


def _read_table_rows(web_driver, table_xpath):
    """Return the <tr> tags of the table currently rendered in the browser."""
    table = web_driver.find_element(By.XPATH, table_xpath)
    table_soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')
    return table_soup.find_all('tr')


def _cell_value(td):
    """Flatten one <td> into a single string.

    Links become "<text> - <href>", spans become "<text> - <title>", and
    hidden inputs contribute their value; the pieces are concatenated in that
    order. A cell with none of those elements yields its text, or the string
    "NaN" when the text is empty or a lone newline.
    """
    links = td.find_all('a')
    spans = td.find_all('span')
    inputs = td.find_all('input')

    if not (links or spans or inputs):
        text = td.text
        return text if text not in ('', '\n') else "NaN"

    parts = []
    # .get() avoids a KeyError on an anchor/span that lacks the attribute
    for link in links:
        parts.append(link.text + ' - ' + link.get('href', ''))
    for span in spans:
        parts.append(span.text + ' - ' + span.get('title', ''))
    for inp in inputs:
        if inp.get('type') == 'hidden' and inp.has_attr('value'):
            parts.append(inp['value'])
    return ''.join(parts)


# utilize the filter box to select only 483s (value = 0)
select = Select(driver.find_element(By.CSS_SELECTOR, '#lcds-datatable-filter--record-filter'))
select.select_by_value('0')

# get the table headers from the first row of page 1
datatable_xpath = '//*[@id="DataTables_Table_0"]'
trs = _read_table_rows(driver, datatable_xpath)
headers = [th.text for th in trs[0].find_all('th')]

# The last page is the largest numeric label in the pager. This avoids relying
# on a fixed button position, which only exists when the pager is long enough.
pager_items = driver.find_elements(By.XPATH, '//*[@id="DataTables_Table_0_paginate"]/ul/li')
pager_labels = [item.text.strip() for item in pager_items]
final_page = max((int(label) for label in pager_labels if label.isdigit()), default=1)

# scrape all rows in each page of the paginated datatable
rows = []
for page in range(1, final_page + 1):
    trs = _read_table_rows(driver, datatable_xpath)

    # trs[0] is the header row; every following row is data
    for tr in trs[1:]:
        rows.append([_cell_value(td) for td in tr.find_all('td')])

    # advance to the next page, except after the final one
    if page < final_page:
        driver.find_element(By.XPATH, '//*[@id="DataTables_Table_0_next"]/a').click()

len(rows)

In [None]:
# Assemble the scraped rows into a DataFrame labelled with the scraped headers.
df = pd.DataFrame(rows, columns=headers)
# 'Record Type' cells were scraped as "<link text> - <href>"; split on the first
# separator only. `n` is passed by keyword because pandas 2.0 made every
# argument after `pat` keyword-only (the positional form raises TypeError).
df[['Record Type', 'HREF']] = df['Record Type'].str.split(' - ', n=1, expand=True)
# clean_names() comes from pyjanitor; the cells below rely on the resulting
# snake_case labels (e.g. 'href', 'record_date', 'establishment_type').
df = df.clean_names()
# Prefix the FDA host to each href to form the PDF address
# (presumably the scraped hrefs are site-relative — confirm against the page).
df['pdf_url'] = 'https://www.fda.gov' + df['href']
df.head()

In [None]:
# Establishment types kept for the analysis. To see every type present in the
# data, ordered by frequency, group `df` by 'establishment_type' and count.
# (The variable keeps its original spelling in case other cells reference it.)
relavent_establishment_types = [
    'Producer of Sterile Drug Products',
    'Outsourcing Facility',
    'Manufacturer',
    'Drug Manufacturer',
    'Compounding Pharmacy',
    'Producer of Non Sterile Drug Products',
    'Sterile Drug Manufacturer',
    'Pharmaceutical Manufacturer',
    'Human Drug Manufacturer',
    'Biological Drug Manufacturer',
    'Active Pharmaceutical Ingredient Manufacturer',
    'Manufacturer and Repacker',
    'Active Pharmaceutical Ingredient & Finished Dosage Manufacturer',
    'Biotech API Manufacturer',
    'Finished Pharmaceutical Manufacturer',
]

# Keep only the rows whose establishment type is in the list above;
# reset_index() keeps the old row labels in a new 'index' column.
is_relevant = df['establishment_type'].isin(relavent_establishment_types)
df2 = df.loc[is_relevant].reset_index()

# Parse both date columns into pandas datetimes.
for date_column in ('record_date', 'publish_date'):
    df2[date_column] = pd.to_datetime(df2[date_column])

df2.shape

In [None]:
# Display the filtered DataFrame as the cell output.
df2

## Step 2: Read each 483 PDF into the notebook via OCR

In [None]:
# Collect every PDF address, then take the first record as a single test case.
pdf_urls = df2['pdf_url']
list_urls = pdf_urls.tolist()
test_pdf = pdf_urls[0]

# Label for the test record: company name immediately followed by the record date.
first_company = df2['company_name'][0]
first_record_date = df2['record_date'][0]
record_reference = str(first_company) + str(first_record_date)

In [None]:
# Point the Selenium-controlled browser at the test record's PDF address.
driver.get(test_pdf)

In [None]:
# Convert the test PDF into one JPEG per page under ./temp483pdfs.
# Requires the pdf2image package (pip install pdf2image).
import re
from pathlib import Path

# convert_from_path is still imported so the name stays available to other cells.
from pdf2image import convert_from_bytes, convert_from_path

# test_pdf is a URL, but pdf2image reads only local files or raw bytes,
# so download the document first and convert it from memory.
response = requests.get(test_pdf)
response.raise_for_status()
pages = convert_from_bytes(response.content, dpi=300)

# Write into one relative folder, creating it on first use.
output_dir = Path("temp483pdfs")
output_dir.mkdir(parents=True, exist_ok=True)

# record_reference is built from a company name and a timestamp, so replace
# characters that are not allowed in file names (e.g. '/' and ':').
safe_reference = re.sub(r'[\\/:*?"<>|]', '_', record_reference)

for i, page in enumerate(pages, start=1):
    image_name = "{} - Page {} of {}.jpg".format(safe_reference, i, len(pages))
    page.save(str(output_dir / image_name), "JPEG")

In [None]:
# NOTE(review): TemporaryDirectory() creates a brand-new temporary directory and
# cleanup() removes that same directory right away, so this line does not touch
# any directory created earlier (such as the temp483pdfs folder) — confirm which
# cleanup was intended here.
tempfile.TemporaryDirectory().cleanup()

In [None]:
# End the Selenium session started earlier in the notebook.
driver.quit()

## Step 3: Tidy each document into a corpus

## Step 4: Perform Topic Modeling on Most Recent 1 Year of Drug Observations

## References

__Useful Links__  
https://medium.com/@sarfrazarshad/scraping-dynamically-created-tables-196b7cbe6c84  
https://stackoverflow.com/questions/56757261/extract-href-using-pandas-read-html/56757977  
https://stackoverflow.com/questions/60757571/python-pandas-parse-html-table-to-get-hidden-values-and-links  
https://stackoverflow.com/questions/51092362/selenium-clicking-to-next-page-until-on-last-page