In [1]:
import requests

import pdfminer.pdfinterp
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfpage

import csv
import re
import urllib.parse
import io
import os.path
import os
import random
#Fixed seed so the random.randint draws used for sentence sampling repeat on every run
random.seed(853915)

In [2]:
def downloadIfNeeded(targetURL, outputFile, **openkwargs):
    """Return an open handle on `outputFile`, downloading it first if missing.

    Parameters:
        targetURL: URL fetched with `requests.get` when `outputFile` does not
            exist yet.
        outputFile: Local path used as the cache. Its parent directory is
            created when needed.
        **openkwargs: Passed straight to `open()` for the returned handle
            (e.g. mode='rb').

    Returns:
        The file object from `open(outputFile, **openkwargs)`. The caller is
        responsible for closing it.

    Raises:
        requests.HTTPError: if the server answers with an error status. In
            that case nothing is written to `outputFile`.
    """
    if not os.path.isfile(outputFile):
        r = requests.get(targetURL)
        #Refuse to cache an error page (404, 500, ...) as if it were the file
        r.raise_for_status()
        #Read the whole body before touching the disk, so a failed download
        #cannot leave an empty file that later calls mistake for a cache hit
        content = r.content
        outputDir = os.path.dirname(outputFile)
        #This function is a more general os.mkdir()
        if len(outputDir) > 0:
            os.makedirs(outputDir, exist_ok = True)
        #The with block closes the file even if the write fails
        with open(outputFile, 'wb') as f:
            f.write(content)
    return open(outputFile, **openkwargs)

In [3]:
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as a single string.

    Parameters:
        pdfFile: Object handed to `pdfminer.pdfpage.PDFPage.get_pages`; the
            notebook passes a file opened in binary mode.

    Returns:
        Everything the pdfminer `TextConverter` wrote while the pages were
        processed, as one `str`.

    The converter is closed even when a page fails to process. `pdfFile`
    itself is not closed here; that is left to the caller.
    """
    #Based on code from http://stackoverflow.com/a/20905381/4955164
    rsrcmgr = pdfminer.pdfinterp.PDFResourceManager()
    layoutParams = pdfminer.layout.LAParams()
    #The converter writes the extracted text into this in-memory buffer
    with io.StringIO() as retstr:
        device = pdfminer.converter.TextConverter(rsrcmgr, retstr, laparams = layoutParams)
        try:
            #We need a device and an interpreter
            interpreter = pdfminer.pdfinterp.PDFPageInterpreter(rsrcmgr, device)
            #Empty page set and maxpages=0 are meant as "no restriction"
            #NOTE(review): confirm against the installed pdfminer version
            pages = pdfminer.pdfpage.PDFPage.get_pages(
                pdfFile,
                set(),
                maxpages=0,
                password='',
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        #Must be read before the with block closes the buffer
        return retstr.getvalue()

In [4]:
#Source PDF and the local path it is cached under
url = 'http://www.congreso.gob.pe/Docs/participacion/museo/congreso/files/mensajes/2001-2020/files/mensaje-2019-01-vizcarra.pdf'
file = '../week-1/mensaje-2019-01-vizcarra.pdf'
#downloadIfNeeded returns an open file; the with block closes it once the
#text has been extracted instead of leaving the handle open
with downloadIfNeeded(url, file, mode = 'rb') as pdfHandle:
    speech = readPDF(pdfHandle)

In [5]:
#Rough sentence split: each match runs from an ASCII capital, '¿' or '¡' up to
#the next full stop. [^.] already matches newlines, so no DOTALL flag is needed.
#Raw string so that \. is a regex escape and not an invalid string escape.
#NOTE(review): [A-Z] does not match accented capitals such as 'Á' or 'É', so a
#match never starts on one of them.
sentences1 = re.findall(r'[A-Z¿¡][^.]*\.', speech)

In [6]:
#Clean every rough match and keep the ones that are not links
sentences = []
for sentence in sentences1:
    #Remove the run of blank lines, digits and form feed (\x0c) found in the
    #extracted text; presumably the page number at a page break
    new_sentence = re.sub(r'\n\n \n\n[0-9]+ \n\n\x0c', '', sentence)
    #Restart at the first capital/¿/¡ that has no '\n \n' anywhere after it,
    #which cuts off whatever precedes the last such break in the fragment
    match = re.search(r'[A-Z¿¡](?!.*\n \n).*\.', new_sentence, re.DOTALL)
    if match is None:
        #No usable start after the last break: skip the fragment instead of
        #failing with IndexError
        continue
    #Join the sentence onto a single line
    new_sentence = match.group(0).replace('\n', '')
    if 'https://' not in new_sentence:
        sentences.append(new_sentence)

In [7]:
#Target number of sentences for the random sample
num_sample = 20

In [8]:
#Draw distinct random indices into `sentences`
samples = []
max_number = len(sentences) - 1
#Never ask for more distinct indices than there are sentences; this also
#gives an empty sample (instead of a randint error) when there are none
target = min(num_sample, len(sentences))
#Keep drawing until the target is reached: a duplicate draw is retried rather
#than silently shrinking the sample. The randint call is unchanged, so with
#the same seed the draws start with the same sequence as before.
while len(samples) < target:
    number = random.randint(0, max_number)
    if number not in samples:
        samples.append(number)

In [9]:
#Look up the sentence behind each sampled index, keeping the draw order
sentences_sample = [sentences[position] for position in samples]

In [10]:
#Write the sampled sentences to a two-column CSV (1-based number, sentence)
sample_file = 'sample_sentences.csv'
#newline='' leaves line endings to the csv module. The encoding is explicit so
#the accented Spanish text is written the same way on every platform.
#The handle gets its own name so the `file` path variable is not overwritten.
with open(sample_file, 'w', newline='', encoding='utf-8') as csv_file:
    wr = csv.writer(csv_file)
    wr.writerow(['N','sentence'])
    wr.writerows(enumerate(sentences_sample, start=1))