# Working With PDF and Word Documents

## PDF Documents

In [3]:
import PyPDF2
# Print the page count and the text of the first page of the minutes PDF.
# The context manager closes the file even if parsing or text extraction raises.
with open('materials/meetingminutes.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    print(pdfReader.numPages)
    pageObj = pdfReader.getPage(0)
    print(pageObj.extractText())

19
OOFFFFIICCIIAALL  BBOOAARRDD  MMIINNUUTTEESS   Meeting of 
March 7
, 2014
        
     The Board of Elementary and Secondary Education shall provide leadership and 
create policies for education that expand opportunities for children, empower 
families and communities, and advance Louisiana in an increasingly 
competitive glob
al market.
 BOARD 
 of ELEMENTARY
 and 
 SECONDARY
 EDUCATION
  


### Decrypting PDFs

In [14]:
import PyPDF2
# Decrypt a password-protected PDF and print the text of its first page.
# The file is opened with a context manager so the handle is always closed
# (the inline open() it replaces was never closed).
with open('materials/encrypted.pdf', 'rb') as encryptedFile:
    pdfReader = PyPDF2.PdfFileReader(encryptedFile)
    print(pdfReader.isEncrypted)
    # Calling pdfReader.getPage(0) at this point raises
    # "PdfReadError: file has not been decrypted".
    print(pdfReader.decrypt('rosebud'))
    pageObj = pdfReader.getPage(0)
    print(pageObj.extractText())

True
1
OOFFFFIICCIIAALL  BBOOAARRDD  MMIINNUUTTEESS   Meeting of 
March 7
, 2014
        
     The Board of Elementary and Secondary Education shall provide leadership and 
create policies for education that expand opportunities for children, empower 
families and communities, and advance Louisiana in an increasingly 
competitive glob
al market.
 BOARD 
 of ELEMENTARY
 and 
 SECONDARY
 EDUCATION
  


### Creating PDFs

#### Copying Pages

In [16]:
import PyPDF2
# Concatenate every page of two PDFs into materials/combinedminutes.pdf.
# Both inputs stay open until the output has been written (the same ordering
# the explicit close() calls used), and are closed even if an error occurs.
with open('materials/meetingminutes.pdf', 'rb') as pdf1File, \
        open('materials/meetingminutes2.pdf', 'rb') as pdf2File:
    pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
    pdf2Reader = PyPDF2.PdfFileReader(pdf2File)
    pdfWriter = PyPDF2.PdfFileWriter()

    # Pages of the first document are added before pages of the second.
    for sourceReader in (pdf1Reader, pdf2Reader):
        for pageNum in range(sourceReader.numPages):
            pdfWriter.addPage(sourceReader.getPage(pageNum))

    with open('materials/combinedminutes.pdf', 'wb') as pdfOutputFile:
        pdfWriter.write(pdfOutputFile)

#### Rotating Pages

In [20]:
import PyPDF2
# Rotate the first page of the minutes 90 degrees clockwise and save it as a
# one-page PDF. The source stays open until the result has been written and
# both files are closed even if an error occurs.
with open('materials/meetingminutes.pdf', 'rb') as minutesFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    page = pdfReader.getPage(0)
    page.rotateClockwise(90)

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(page)
    with open('materials/rotatedPage.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)

#### Overlaying Pages

In [22]:
import PyPDF2
# Merge the first page of watermark.pdf into the first page of the minutes,
# then write that page plus the remaining, unmodified pages to
# materials/watermarkedCover.pdf.
# Both inputs are opened with context managers so neither handle leaks (the
# watermark file was previously never closed); they stay open until the
# output has been written.
with open('materials/meetingminutes.pdf', 'rb') as minutesFile, \
        open('materials/watermark.pdf', 'rb') as watermarkFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    minutesFirstPage = pdfReader.getPage(0)

    pdfWatermarkReader = PyPDF2.PdfFileReader(watermarkFile)
    minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(minutesFirstPage)
    # Pages after the first are copied as they are.
    for pageNum in range(1, pdfReader.numPages):
        pdfWriter.addPage(pdfReader.getPage(pageNum))

    with open('materials/watermarkedCover.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)

#### Encrypting PDFs

In [24]:
import PyPDF2
# Copy every page of the minutes into a new PDF, encrypt it with a password,
# and save it as materials/encryptedminutes.pdf.
# The source file is now closed when the block ends (it was previously left
# open), and stays open until the encrypted copy has been written.
with open('materials/meetingminutes.pdf', 'rb') as pdfFile:
    pdfReader = PyPDF2.PdfFileReader(pdfFile)
    pdfWriter = PyPDF2.PdfFileWriter()
    for pageNum in range(pdfReader.numPages):
        pdfWriter.addPage(pdfReader.getPage(pageNum))

    pdfWriter.encrypt('swordfish')
    with open('materials/encryptedminutes.pdf', 'wb') as resultPdf:
        pdfWriter.write(resultPdf)

## Project: Combining Select Pages from Many PDFs

At a high level, here's what the program will do:
1. Find all PDF files in the source directory (here, *materials/combineDir*).
1. Sort the filenames so the PDFs are added in order.
1. Write each page, excluding the first page, of each PDF to the output file.

In terms of implementation, the code will need to do the following:
1. Call os.listdir() to find all the files in the source directory and remove any non-PDF files.
1. Call Python's sort() list method to alphabetize the filenames.
1. Create a *PdfFileWriter* object for the output PDF.
1. Loop over each PDF file, creating a *PdfFileReader* object for it.
1. Loop over each page (except the first) in each PDF file.
1. Add the pages to the output PDF.
1. Write the output PDF to a file named *allminutes.pdf*.

In [28]:
# combinePdfs.py - Combines all the PDFs in materials/combineDir (skipping the
# first page of each) into a single PDF, materials/allminutes.pdf.

import contextlib
import os

import PyPDF2

pdfDir = 'materials/combineDir'

# Get all the PDF filenames, sorted case-insensitively so they are added in order.
pdfFiles = sorted(
    (filename for filename in os.listdir(pdfDir) if filename.endswith('.pdf')),
    key=str.lower,
)

pdfWriter = PyPDF2.PdfFileWriter()

# The ExitStack keeps every source PDF open until the combined file has been
# written, then closes them all, even if an error occurs part-way through.
# (Previously the source files were opened in the loop and never closed.)
with contextlib.ExitStack() as openPdfs:
    # Loop through all the PDF files.
    for filename in pdfFiles:
        pdfFileObj = openPdfs.enter_context(open(os.path.join(pdfDir, filename), 'rb'))
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Loop through all the pages (except the first) and add them.
        for pageNum in range(1, pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

    # Save the resulting PDF to a file.
    with open('materials/allminutes.pdf', 'wb') as pdfOutput:
        pdfWriter.write(pdfOutput)

## Word Documents

In [37]:
import docx
# Show the paragraph count, the first two paragraphs, and the five runs that
# make up the second paragraph of the demo document.
doc = docx.Document('materials/demo.docx')
allParagraphs = doc.paragraphs
print(len(allParagraphs))
print(allParagraphs[0].text)

secondParagraph = allParagraphs[1]
print(secondParagraph.text)

secondRuns = secondParagraph.runs
print(len(secondRuns))
for runIndex in range(5):
    print(secondRuns[runIndex].text)

7
Document Title
A plain paragraph with some bold and some italic
5
A plain paragraph with
 some 
bold
 and some 
italic


### Getting the Full Text from a .docx File

In [44]:
import docx

def getText(filename):
    """Return the text of every paragraph in a .docx file as a single string.

    Each paragraph's text is prefixed with two spaces, and the paragraphs are
    joined with newline characters (no newline after the last one).
    """
    document = docx.Document(filename)
    return '\n'.join('  ' + paragraph.text for paragraph in document.paragraphs)

# end='' stops print() from appending its own newline after the joined text.
print(getText('materials/demo.docx'), end='')

  Document Title
  A plain paragraph with some bold and some italic
  Heading, level 1
  Intense quote
  first item in unordered list
  first item in ordered list
  


### Styling Paragraph and Run Objects

### Creating Word Documents with Nondefault Styles

### Run Attributes

In [54]:
import docx
# Restyle the demo document: reset the first paragraph to the 'Normal' style,
# give the second paragraph's first run the 'QuoteChar' style, underline its
# second and fourth runs, and save the result as a new file.
doc = docx.Document('materials/demo.docx')

titleParagraph = doc.paragraphs[0]
print(titleParagraph.text)
print(titleParagraph.style)  # The exact id may be different
titleParagraph.style = 'Normal'

bodyParagraph = doc.paragraphs[1]
print(bodyParagraph.text)
bodyRuns = bodyParagraph.runs
print(tuple(bodyRuns[runIndex].text for runIndex in range(5)))

bodyRuns[0].style = 'QuoteChar'
for runIndex in (1, 3):
    bodyRuns[runIndex].underline = True

doc.save('materials/restyled.docx')

Document Title
_ParagraphStyle('Title') id: 140286691934064
A plain paragraph with some bold and some italic
('A plain paragraph with', ' some ', 'bold', ' and some ', 'italic')


### Writing Word Documents

In [55]:
import docx
# Create a new, empty document, add a single paragraph, and save it.
helloPath = 'materials/helloworld.docx'
doc = docx.Document()
doc.add_paragraph(text='Hello, world!')
doc.save(helloPath)

In [58]:
import docx
# Build a document with a title and two body paragraphs, then append an extra
# run of text to the first body paragraph before saving.
doc = docx.Document()
doc.add_paragraph(text='Hello world!', style='Title')

paraObj1 = doc.add_paragraph(text='This is a second paragraph.')
paraObj2 = doc.add_paragraph(text='This is a yet another paragraph.')

# add_run() is called on paraObj1, so the text lands in the second paragraph
# even though a later paragraph has already been added.
paraObj1.add_run(text=' This text is being added to the second paragraph.')

doc.save('materials/multipleParagraphs.docx')

### Adding Headings

In [59]:
# Imported here, like every other cell, so this cell also runs on a fresh kernel.
import docx

# Add one heading for each level from 0 through 4, labelled with its level.
doc = docx.Document()
for headingLevel in range(5):
    doc.add_heading('Header ' + str(headingLevel), headingLevel)
doc.save('materials/headings.docx')

### Adding Line and Page Breaks

In [65]:
import docx
import docx.enum.text  # imported explicitly rather than relying on `import docx` to load the submodule

# Write a two-paragraph document with a page break added to the run of the
# first paragraph.
doc = docx.Document()
# Use the paragraph object returned by add_paragraph() instead of looking it
# up again by index in doc.paragraphs.
firstParagraph = doc.add_paragraph('This is on the first page!')
firstParagraph.runs[0].add_break(docx.enum.text.WD_BREAK.PAGE)
doc.add_paragraph('This is on the second page!')
doc.save('materials/twoPage.docx')

### Adding Pictures

In [70]:
import docx
import docx.shared  # imported explicitly rather than relying on `import docx` to load the submodule

# Start from a fresh document so this cell does not depend on whatever `doc`
# an earlier cell left in the kernel (it would raise NameError if run alone
# on a fresh kernel, and otherwise appended to an unrelated document).
doc = docx.Document()
# width and height are given explicitly, as 1 inch and 4 centimetres.
doc.add_picture('materials/zophie.png', width=docx.shared.Inches(1), height=docx.shared.Cm(4))
doc.save('materials/picture.docx')

## Creating PDFs from Word Documents