In [38]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import requests
import os
from PyPDF2 import PdfFileReader
from whoosh import index
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
import sys
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir

#returns url of pma summary pdf and determines if pma has summary
def getSummary(URL):
    
    page_response = requests.get(URL, timeout=5)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    textContent = page_content.find_all(style="text-decoration:underline;")
    
    for element in textContent:
        if(element.text == 'Summary of Safety and Effectiveness'):
            return 'http://www.accessdata.fda.gov/cdrh_docs/pdf' + URL[69:71] + '/' + URL[68:75] + 'B.pdf'
        
    return URL[68:75] + ' does not contain a Summary of Safety and Effectiveness' 

#gets the url of pmas given a range of time
def getPMAS(fromDate, toDate):

    driver = webdriver.Firefox()
    driver.get('https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpma/pma.cfm')

    fromBox = driver.find_element_by_name('decisiondatefrom')
    toBox = driver.find_element_by_name('decisiondateto')
    searchBox = driver.find_element_by_name('Search')

    fromBox.send_keys(fromDate)
    toBox.send_keys(toDate)

    searchBox.click()

    rppBox = ''
    
    try:
        rppBox = driver.find_element_by_name('rpp')
    except:
        url = driver.current_url
        driver.quit()
        return url
    
    select = Select(rppBox)
    select.select_by_value('500')
    
    url = driver.current_url
    
    driver.quit()
    
    return url
    
#downloads all pma summary pdfs in the url
def getSummaries(url):
    
    startUrl = 'https://www.accessdata.fda.gov'

    page_response = requests.get(url, timeout=5)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    textContent = page_content.find_all(style="text-decoration:underline;")
    
    #looks through all pmas in url
    for element in textContent:
        string = str(element)
        deviceURL = startUrl + string[9: 9 + string[9:len(string)].index('\"')]
        
        #if the pma is original
        if len(deviceURL) == 75:
              
            file_url = getSummary(deviceURL)
            
            #if the pma does not have a summary
            if(len(file_url) == 62):
                print(file_url)
            else:
                r = requests.get(file_url, stream = True) 
  
                #download pmas into Summaries folder
                with open('Summaries/' + file_url[47:59],"wb") as pdf: 
                    for chunk in r.iter_content(chunk_size=1024): 
                        if chunk: 
                            pdf.write(chunk)
    
class Date:
    
    daysInMonth = [[0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]]
    
    
    def __init__(self, month, day, year):
        self.m = month
        self.d = day
        self.y = year
    
    def getDate(self):
        
        if(len(str(self.m)) > 1):
            toReturn = str(self.m)
        else:
            toReturn = '0' + str(self.m)
            
        if(len(str(self.d)) > 1):
            toReturn += '/' + str(self.d)
        else:
            toReturn += '/0' + str(self.d)
            
        toReturn += '/' + str(self.y)
        
        return toReturn
    def nextMonth(self):
        if(self.m == 12):
            self.m = 1
            self.y += 1
        else:
            self.m +=1
        self.d = 1
            
    def prevMonth(self):
        if(self.m == 1):
            self.m = 12
            self.y -= 1
        else:
            self.m -= 1
        self.d = 1
    
    def lastDay(self):
        return str(self.m) + '/' + str(self.daysInMonth[self.y%4][self.m]) + '/' + str(self.y)
    
    def firstDay(self):
        return str(self.m) + '/' + '01' + '/' + str(self.y)
    
    def monthsInBetween(self, other):
        return (other.y - self.y)*12 + (other.m - self.m)

#deletes files found in a given root
def deleteFiles(root):
    filepaths = [os.path.join(root,i) for i in os.listdir(root)]
    for path in filepaths:
        os.remove(path)

#call this
def search(startDate, endDate, words):
    for i in range(startDate.monthsInBetween(endDate)):
        getSummaries(getPMAS(startDate.getDate(), startDate.lastDay()))
        startDate.nextMonth()
        
    getSummaries(getPMAS(endDate.firstDay(), endDate.getDate()))
    
    createSearchableData('Summaries')
    
    results = []
    
    for i in range(len(words)):
        results.append(termQuery(words[i]))
    
    deleteFiles('Summaries')
    
    for i in range(len(results)):
        print(words[i] + ':')
        
        for element in results[i]:
            print(element)
    

#adds all pdfs in directory to an index
def createSearchableData(root):   
    schema = Schema(title=TEXT(stored=True),path=ID(stored=True),\
              content=TEXT)
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
 
    ix = index.create_in("indexdir", schema)
    writer = ix.writer()
 
    filepaths = [os.path.join(root,i) for i in os.listdir(root)]
    for path in filepaths:
        text = getPDFText(path)
        writer.add_document(title=path.split("\\")[1], path=path,\
            content=text)
    writer.commit()

#searches index for particular phrase and prints results
def termQuery(phrase):

    ix = open_dir("indexdir")
 
    qp = QueryParser("content", schema=ix.schema)
    q = qp.parse(phrase)
    
    res = []
    
    with ix.searcher() as s:
        results = s.search(q, limit=None)
        for result in results:
            res.append(str(result))
        return res
            
    

#returns text found in pdf
def getPDFText(path):

    text = ''
    
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        numPages = pdf.getNumPages()
        for pageNum in range(numPages):
            page = pdf.getPage(pageNum)
            text += page.extractText().replace('\n', ' ')
    
    return text
        
    
search(Date(9,1,1999), Date(4,1,2001), ['indications', 'finite element'])    
#print(getPDFText('P170036B.pdf'))

PdfReadError: EOF marker not found

In [8]:
import requests
file_url = 'https://www.accessdata.fda.gov/cdrh_docs/pdf19/P190004B.pdf'
            
if(len(file_url) == 62):
    print(file_url)
else:
    r = requests.get(file_url, stream = True) 
  
    with open('Summaries/' + file_url[47:59],"wb") as pdf: 
        for chunk in r.iter_content(chunk_size=1024): 
            if chunk: 
                pdf.write(chunk) 
            
    print(file_url[47:59])

P190004B.pdf


In [None]:
import requests

page_response = requests.get(URL, timeout=5)
page_content = BeautifulSoup(page_response.content, "html.parser")

textContent = page_content.find_all(style="text-decoration:underline;")