# Scraping images from PDFs

Note to self: Don't use the regular `PyPDF2` release; install the following fork instead:

`pip install --upgrade https://github.com/sylvainpelissier/PyPDF2/archive/master.zip`

`pip install pillow`

In [2]:
import os
import PyPDF2
from PIL import Image

In [5]:
# Extract the images embedded in the first page of every PDF in "pdfs/" and
# save them under "images/<pdf name without extension>/".
#   /FlateDecode images are rebuilt with Pillow and saved as .png
#   /DCTDecode   images are written out unchanged as .jpg
#   /JPXDecode   images are written out unchanged as .jp2
pdf_files = [f for f in os.listdir("pdfs") if f.endswith(".pdf")]

for pdf in pdf_files:
    pdf_path = os.path.join("pdfs", pdf)
    # Only the first page is scanned; page_num is used in the diagnostics.
    page_num = 0

    # Keep the PDF open for the whole iteration (the reader may still need the
    # stream when objects are resolved) and make sure it is closed afterwards.
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        page = reader.getPage(page_num)

        try:
            xObject = page['/Resources']['/XObject'].getObject()
        except KeyError:
            # The page has no /Resources or no /XObject entry, so there is
            # nothing to extract; report it and move on to the next PDF.
            print("KeyError at:", pdf, "(page %d)" % page_num)
            continue

        image_folder = os.path.join("images", os.path.splitext(pdf)[0])
        os.makedirs(image_folder, exist_ok=True)

        for obj in xObject:
            image = xObject[obj]
            if image.get('/Subtype') != '/Image':
                continue

            # /Name may be absent from an image dictionary; fall back to the
            # key under which the image is registered in the resources.
            raw_name = image.get('/Name', obj)
            name = raw_name.replace("/", "")
            size = (image['/Width'], image['/Height'])
            filepath = os.path.join(image_folder, name)
            # A missing /Filter falls through to the "No decoder" branch.
            image_filter = image.get('/Filter')

            try:
                data = image.getData()
            except NotImplementedError:
                print("NotImplementedError at:", raw_name, "for filter type",
                      image_filter, "(page %d)" % page_num)
                continue

            # NOTE(review): every colour space other than /DeviceRGB is
            # treated as palette mode "P" -- confirm this is intended for
            # grayscale or CMYK images.
            if image.get('/ColorSpace') == '/DeviceRGB':
                mode = "RGB"
            else:
                mode = "P"

            if image_filter == '/FlateDecode':
                img = Image.frombytes(mode, size, data)
                img.save(filepath + ".png")
            elif image_filter == '/DCTDecode':
                with open(filepath + ".jpg", "wb") as out:
                    out.write(data)
            elif image_filter == '/JPXDecode':
                with open(filepath + ".jp2", "wb") as out:
                    out.write(data)
            else:
                print("No decoder for image %s" % name)