# Download the files

In [1]:
# libraries
import requests, bs4, re, os, json
from PyPDF2 import PdfReader

# Browser-like User-Agent header passed to the requests.get() calls below
# (background: https://stackoverflow.com/a/38489588/3720258)
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

## 2005 Constitution

In [2]:
# make sure the output folder exists
os.makedirs('2005', exist_ok = True)

# fetch the PDF export once; later runs reuse the local copy
filename = '00-CPR-2005.pdf'
pdf_path = os.path.join('2005', filename)
if not os.path.exists(pdf_path):
    response = requests.get('https://servicios-leychile.bcn.cl/Consulta/Exportar?radioExportar=Normas&exportar_formato=pdf&nombrearchivo=Constitucion_Chilena_1980_s_notas&exportar_con_notas_bcn=False&exportar_con_notas_originales=False&exportar_con_notas_al_pie=False&hddResultadoExportar=242302..0.0%23', headers = headers)
    # stop here on an HTTP error status so that no file is written
    response.raise_for_status()
    with open(pdf_path, 'wb') as out:
        for block in response.iter_content(100000):
            out.write(block)

In [3]:
# read the pdf as text
# The path is spelled out instead of reusing the `filename` variable: later cells rebind
# `filename`, so re-running this cell out of order would otherwise open a different file.
with open(os.path.join('2005', '00-CPR-2005.pdf'), 'rb') as file:
    reader = PdfReader(file)
    # one chunk per page, each followed by a newline; `or ''` keeps the join working
    # when a page yields no text
    text = ''.join((page.extract_text() or '') + '\n' for page in reader.pages)

# First normalisation pass over the extracted text. The substitutions are order-dependent:
# each one works on the output of the previous one.

# remove first 16 lines
# (presumably the header block of this particular PDF export - re-check the count if the
# export layout changes)
text = re.sub(r'^.*\n', '', text, count = 16, flags = re.MULTILINE)

# remove all ^\s+ from the text
# (\s also matches newlines, so blank lines disappear together with the indentation)
text = re.sub(r'^\s+', '', text, flags = re.MULTILINE)

# add title
text = re.sub(r'Constituci\u00f3n Pol\u00edtica de la Rep\u00fablica:\n', '# CONSTITUCI\u00d3N POL\u00cdTICA DE LA REP\u00daBLICA DE CHILE\n', text, flags = re.MULTILINE)

# convert all ^Capítulo + roman number + newline to ## CAPÍTULO + roman number
# (the newline after the number is replaced by ' - ', so the following line ends up on the heading line)
text = re.sub(r'^Cap\u00edtulo ([IVX]+)\n', '\n## CAP\u00cdTULO \\1 - ', text, flags = re.MULTILINE)

# convert all ^Artículo to ##### Artículo
# consider Articulo XX bis.-
# the ordinal indicator (U+00BA) is first unified with the degree sign (U+00B0) so that the
# patterns below only have to deal with one symbol
text = text.replace('\u00ba', '\u00b0')
text = re.sub(r'^Art\u00edculo ([0-9]+)\u00b0\.- ', '\n##### Art\u00edculo \\1\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^Art\u00edculo ([0-9]+)\.- ', '\n##### Art\u00edculo \\1\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^Art\u00edculo ([0-9]+)\. ', '\n##### Art\u00edculo \\1\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^Art\u00edculo ([0-9]+) bis\. ', '\n##### Art\u00edculo \\1 bis\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^Art\u00edculo ([0-9]+) bis\.- ', '\n##### Art\u00edculo \\1 bis\n\n', text, flags = re.MULTILINE)

# convert numerals to numbered list
# NOTE(review): the '.' before '-' is not escaped, so it matches any character, not only a period
text = re.sub(r'([0-9]+)\u00b0.-', '\n\\1.', text, flags = re.MULTILINE)

# remove linebreak from lines that do not end with a period
# (a newline preceded by any character other than '.' or another newline becomes a space)
text = re.sub(r'([^\n\.])\n', '\\1 ', text, flags = re.MULTILINE)

# add linebreak after each line starting with '#'
text = re.sub(r'^(#.*)', '\\1\n', text, flags = re.MULTILINE)

# remove the page footer 'Decreto 100 (2005) Biblioteca del Congreso Nacional de Chile - www.leychile.cl - documento generado el <date>'
# The footer carries the day on which the PDF was generated, so the date is matched
# generically (e.g. 20-Feb-2023) instead of being pinned to one particular download.
text = re.sub(r'Decreto 100 \(2005\) Biblioteca del Congreso Nacional de Chile - www\.leychile\.cl - documento generado el [0-9]{1,2}-[A-Za-z]{3}-[0-9]{4} ', '', text, flags = re.MULTILINE)

# remove the page counter 'página NUMBER de NUMBER'
text = re.sub(r'p\u00e1gina [0-9]+ de [0-9]+', '', text, flags = re.MULTILINE)

# if 'Artículo NUMBER.' appears not at the beginning of a line, add a linebreak
# (the character in front of it is kept and the article becomes a level-5 heading on its own line)
text = re.sub(r'([^\n])Art\u00edculo ([0-9]+)\.', '\\1\n##### Art\u00edculo \\2\n\n', text, flags = re.MULTILINE)

# remove final spaces from lines
text = re.sub(r' +$', '', text, flags = re.MULTILINE)

# remove multiple spaces (every run of spaces collapses to a single space)
text = re.sub(r' +', ' ', text, flags = re.MULTILINE)

# remove spaces at the beginning of lines
# (one space is enough here because runs of spaces were collapsed just above)
text = re.sub(r'^ ', '', text, flags = re.MULTILINE)

# make sure every line starting with '####' is preceded by a blank line
# A single substitution replaces the former per-line loop, which re-split the whole text
# twice per iteration, looked only at the first occurrence of a repeated heading, wrapped
# around to the last line for a heading on the first line, and used a substring-wide
# str.replace (so '##### Artículo 1' also touched '##### Artículo 10').
# (?<=[^\n])\n(?=####) is the newline that ends a non-empty line directly above a heading.
text = re.sub(r'(?<=[^\n])\n(?=####)', '\n\n', text)

# turn the known section subtitles into level-4 headings separated from the paragraphs
subtitles = [
    'Gobierno y Administración Regional',
    'Gobierno y Administración Provincial',
    'Administración Comunal',
    'Disposiciones Generales',
    'Disposiciones Especiales',
    'Reforma de la Constitución',
    'Del procedimiento para elaborar una Nueva Constitución Política de la República',
]

# the lines are taken from a snapshot of the text; str.replace then prefixes every
# occurrence of a matching subtitle in the whole text, not only the line that matched
for candidate in text.splitlines():
    if candidate not in subtitles:
        continue
    text = text.replace(candidate, '\n#### ' + candidate)

# Turn 'a)' / '1)' markers into markdown list items, then repair the places where the
# generic rules below split running text. The repairs depend on the exact number of
# newlines produced by the preceding substitutions, so the order must not change.

# convert all a) b) c) to list
# NOTE(review): matches any lowercase letter followed by ')', including the end of a
# parenthesised word or a reference such as 'letra b)'
text = re.sub(r'([a-z])\)', '\n\\1.', text, flags = re.MULTILINE)

# convert 1) 2) 3) to list
# (only the last digit in front of ')' is captured, so a two-digit number is split after its first digit)
text = re.sub(r'([0-9])\)', '\n\\1.', text, flags = re.MULTILINE)

# add a linebreak for numbered lines
text = re.sub(r'^([0-9]+)\.', '\n\\1.', text, flags = re.MULTILINE)
text = re.sub(r'^([a-z])\.', '\n\n\\1.', text, flags = re.MULTILINE)

# fix linebreaks (Art 10)
text = re.sub(r'\nEsta renuncia s\u00f3lo producir\u00e1', ' Esta renuncia s\u00f3lo producir\u00e1', text, flags = re.MULTILINE)

# fix linebreaks (Art 17)
text = re.sub(r' Los que hubieren perdido la ciudadan\u00eda', '\n\nLos que hubieren perdido la ciudadan\u00eda', text, flags = re.MULTILINE)

# fix 'GOBIERNO Presidente de la República' (Cap 3)
text = re.sub(r'GOBIERNO Presidente de la Rep\u00fablica', 'GOBIERNO\n\n#### Presidente de la Rep\u00fablica', text, flags = re.MULTILINE)

# fix '21. Disponer, mediante decreto supremo fundado' (Art 32)
text = re.sub(r'\n21\. Disponer, mediante decreto supremo fundado', '21. Disponer, mediante decreto supremo fundado', text, flags = re.MULTILINE)

# fix 'Ministros de Estado' (Art 33)
text = re.sub(r'^Ministros de Estado\n', '\n#### Ministros de Estado\n', text, flags = re.MULTILINE)

# fix 'Bases generales de la Administración del Estado' (Art 38)
text = re.sub(r'^Bases generales de la Administraci\u00f3n del Estado\n', '\n#### Bases generales de la Administraci\u00f3n del Estado\n', text, flags = re.MULTILINE)

# fix 'c. Un ex Contralor o Subcontralor de la Contraloría General de la República.' (Art 38 bis)
text = re.sub(r'\nc\. Un ex Contralor o Subcontralor de la Contralor\u00eda General de la Rep\u00fablica\.', 'c. Un ex Contralor o Subcontralor de la Contralor\u00eda General de la Rep\u00fablica.', text, flags = re.MULTILINE)

# fix number/letter list (Art 19)
# (an in-sentence reference to numbers 1 to 6 was turned into list items by the rules above)
text = re.sub(r'en los n\u00fameros \n\n1\. a \n\n6\.', 'en los n\u00fameros 1. a 6.', text, flags = re.MULTILINE)

# remove degree symbols (Art 19)
# (applies to the whole text: every remaining U+00B0 is dropped)
text = re.sub(r'\u00b0', '', text, flags = re.MULTILINE)

# Article-specific repairs: section titles that ended up inside a paragraph are promoted to
# headings, and in-sentence references such as 'letras b., c.' that the list rules broke
# into separate lines are joined again. Each pattern is literal text from the document.

# fix title (Art 39)
text = re.sub(r' Estados de excepci\u00f3n constitucional', '\n\n#### Estados de excepci\u00f3n constitucional', text, flags = re.MULTILINE)

# fix title (Art 47)
text = re.sub(r'^Composici\u00f3n y generaci\u00f3n de la C\u00e1mara de Diputados y del Senado$', '\n\n#### Composici\u00f3n y generaci\u00f3n de la C\u00e1mara de Diputados y del Senado', text, flags = re.MULTILINE)

# fix title (Art 52)
# NOTE(review): not anchored, so every occurrence of the phrase is turned into a heading
text = re.sub(r'Atribuciones exclusivas de la C\u00e1mara de Diputados', '\n\n#### Atribuciones exclusivas de la C\u00e1mara de Diputados', text, flags = re.MULTILINE)

# PENDING fix linebreak (Art 52)
text = re.sub(r'letras \n\n\nb\., \n\n\nc\., \n\n\nd\. y \n\n\ne\.', 'letras b., c., d. y e.', text, flags = re.MULTILINE)

# fix title (Art 53)
text = re.sub(r'^Atribuciones exclusivas del Senado', '\n\n#### Atribuciones exclusivas del Senado', text, flags = re.MULTILINE)

# fix number/letter list (Art 53)
# (presumably the leading '1' of item 10 was split off by the single-digit 'N)' rule - confirm)
text = re.sub(r'^0\. Dar su dictamen', '10. Dar su dictamen', text, flags = re.MULTILINE)

# fix title (Art 54)
text = re.sub(r'^Atribuciones exclusivas del Congreso', '\n\n#### Atribuciones exclusivas del Congreso', text, flags = re.MULTILINE)

# fix line break (Art 55)
text = re.sub(r'^Funcionamiento del Congreso', '\n\n#### Funcionamiento del Congreso', text, flags = re.MULTILINE)

# fix title (Art 57)
text = re.sub(r'^Normas comunes para los diputados y senadores$', '\n\n#### Normas comunes para los diputados y senadores', text, flags = re.MULTILINE)

# fix number/letter list (Art 57)
text = re.sub(r'^0\. Los Comandantes en Jefe', '10. Los Comandantes en Jefe', text, flags = re.MULTILINE)
text = re.sub(r'en los n\u00fameros \n\n7\. y \n\n8\.,', 'en los n\u00fameros 7. y 8.,', text, flags = re.MULTILINE)
text = re.sub(r'en el n\u00famero \n\n9\.', 'en el n\u00famero 9.', text, flags = re.MULTILINE)

# fix title (Art 63)

text = re.sub(r'^Materias de Ley$', '\n\n#### Materias de Ley', text, flags = re.MULTILINE)

# fix line break (Art 63)
# (re-attach the first digit of items 10-19 and of item 20 to the list number)
text = re.sub(r'; 1\n\n([0-9])\.', '\n\n1\\1.', text, flags = re.MULTILINE)
text = re.sub(r'y 2\n\n0\.', '\n\n20.', text, flags = re.MULTILINE)

# fix line break (Art 65)
text = re.sub(r'Formaci\u00f3n de la ley', '\n\n#### Formaci\u00f3n de la ley', text, flags = re.MULTILINE)

# fix title (Art 92)
text = re.sub(r'Cap\u00edtulo VIII TRIBUNAL CONSTITUCIONAL', '\n\n## CAP\u00cdTULO VIII - TRIBUNAL CONSTITUCIONAL', text, flags = re.MULTILINE)

# fix linebreak (Art 93)
text = re.sub(r'n\u00famero \n\n7\.', 'n\u00famero 7.', text, flags = re.MULTILINE)

# fix title (Art 127)
text = re.sub(r' Reforma de la Constituci\u00f3n', '\n\n#### Reforma de la Constituci\u00f3n', text, flags = re.MULTILINE)

# fix title (Art 144)
text = re.sub(r'^DEL NUEVO PROCEDIMIENTO PARA ELABORAR', '\n\n### DEL NUEVO PROCEDIMIENTO PARA ELABORAR', text, flags = re.MULTILINE)
text = re.sub(r' Del Consejo Constitucional', '\n\n#### Del Consejo Constitucional', text, flags = re.MULTILINE)

# fix line break + number/letter list (Art 144)
text = re.sub(r' 1\. A la elecci\u00f3n de los integrantes', '\n\n1. A la elecci\u00f3n de los integrantes', text, flags = re.MULTILINE)
text = re.sub(r'letra \n\n\na\.', 'letra a.', text, flags = re.MULTILINE)
text = re.sub(r'letra a\.\.', 'letra a.', text, flags = re.MULTILINE)
text = re.sub(r'letra \n\n\nb\.', 'letra b.', text, flags = re.MULTILINE)
text = re.sub(r'letra \n\n\nd\.', 'letra d.', text, flags = re.MULTILINE)
text = re.sub(r' i\. Se determinar\u00e1', '\n\ni. Se determinar\u00e1', text, flags = re.MULTILINE)
text = re.sub(r'^ii\. Se ordenar\u00e1', '\nii. Se ordenar\u00e1', text, flags = re.MULTILINE)
text = re.sub(r'^iii\. A continuaci', '\niii. A continuaci', text, flags = re.MULTILINE)
text = re.sub(r'^iv\. El candidato(.*)', '\niv. El candidato\\1\n', text, flags = re.MULTILINE)

# fix title (Art 145)
text = re.sub(r'^De la Comisi\u00f3n Experta', '\n#### De la Comisi\u00f3n Experta', text, flags = re.MULTILINE)

# fix title (Art 152)
text = re.sub(r'^Del procedimiento$', '\n#### Del procedimiento', text, flags = re.MULTILINE)

# fix enumerated lists (Art 154)
text = re.sub(r' 1\. Chile es una Rep\u00fablica democr\u00e1tica', '\n\n1. Chile es una Rep\u00fablica democr\u00e1tica', text, flags = re.MULTILINE)

# fix title (Art 155)
text = re.sub(r'^Del requerimiento ante el Comit\u00e9 T\u00e9cnico de Admisibilidad$', '\n#### Del requerimiento ante el Comit\u00e9 T\u00e9cnico de Admisibilidad', text, flags = re.MULTILINE)

# fix title (Art 159)
text = re.sub(r'^Del plebiscito constitucional$', '\n#### Del plebiscito constitucional', text, flags = re.MULTILINE)

# fix enumerated lists (Art 161)
# NOTE(review): the '.' after '1' is not escaped here, unlike in the sibling rules
text = re.sub(r' 1. Se considerar\u00e1 como per\u00edodo', '\n\n1. Se considerar\u00e1 como per\u00edodo', text, flags = re.MULTILINE)

# fix linebreaks (Disposiciones transitorias)
text = re.sub(r'^DISPOSICIONES TRANSITORIAS ', '\n## DISPOSICIONES TRANSITORIAS\n\n', text, flags = re.MULTILINE)

# fix headers (Disposiciones transitorias)
# Each ordinal label ('PRIMERA.- ', 'TRIGÉSIMA. ', ...) becomes a level-5 heading and the
# text that followed it starts a new paragraph. Most labels are matched at the start of a
# line; SEPTIMA and DECIMOSEGUNDA are matched after a space, i.e. in the middle of a line.
# The rules with a '(.*)' group additionally append a newline after the rest of that line.
# A label is only matched together with its own punctuation ('.- ' or '. '), so e.g.
# '^VIGESIMA\.- ' leaves 'VIGESIMA PRIMERA.- ' for the rule that follows it.
text = re.sub(r'^PRIMERA\.- ', '##### PRIMERA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^SEGUNDA\.- ', '\n##### SEGUNDA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^TERCERA\.- ', '\n##### TERCERA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^CUARTA\.- ', '\n##### CUARTA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^QUINTA\.- ', '\n##### QUINTA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^SEXTA\.- ', '\n##### SEXTA\n\n', text, flags = re.MULTILINE)
text = re.sub(r' SEPTIMA\.- ', '\n\n##### SEPTIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^OCTAVA\.- ', '\n##### OCTAVA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^NOVENA\.- ', '\n##### NOVENA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^DECIMA\.- ', '\n##### DECIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^DECIMO([A-Z]+)\.- (.*)', '\n##### DECIMO\\1\n\n\\2\n', text, flags = re.MULTILINE)
text = re.sub(r' DECIMOSEGUNDA\.- ', '\n\n##### DECIMOSEGUNDA\n\n', text, flags = re.MULTILINE)
# the two VIGESIMA rules also add the accent that is missing in the source label
text = re.sub(r'^VIGESIMA\.- ', '\n##### VIG\u00c9SIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^VIGESIMA PRIMERA\.- ', '\n##### VIG\u00c9SIMA PRIMERA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^VIGESIMO([A-Z]+)\.- (.*)', '\n##### VIGESIMO\\1\n\n\\2\n', text, flags = re.MULTILINE)
text = re.sub(r'^VIG\u00c9SIMO([A-Z]+)\.- (.*)', '\n##### VIG\u00c9SIMO\\1\n\n\\2\n', text, flags = re.MULTILINE)
# [A-Z] does not include accented capitals, hence the dedicated rule for VIGESIMOSÉPTIMA
text = re.sub(r'^VIGESIMOS\u00c9PTIMA\.- ', '\n##### VIGESIMOS\u00c9PTIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^VIG\u00c9SIMO OCTAVA\.- ', '\n##### VIG\u00c9SIMO OCTAVA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^VIG\u00c9SIMO NOVENA\. ', '\n##### VIG\u00c9SIMO NOVENA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^TRIG\u00c9SIMA\. ', '\n##### TRIG\u00c9SIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^TRIG\u00c9SIMA PRIMERA\. ', '\n##### TRIG\u00c9SIMA PRIMERA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^TRIG\u00c9SIMA ([A-Z]+)\. (.*)', '\n##### TRIG\u00c9SIMA \\1\n\n\\2\n', text, flags = re.MULTILINE)
text = re.sub(r'^TRIG\u00c9SIMA ([A-Z]+)\.- (.*)', '\n##### TRIG\u00c9SIMA \\1\n\n\\2\n', text, flags = re.MULTILINE)
text = re.sub(r'^TRIG\u00c9SIMA S\u00c9PTIMA\.- ', '\n##### TRIG\u00c9SIMA S\u00c9PTIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^CUADRAG\u00c9SIMA\. ', '\n##### CUADRAG\u00c9SIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^CUADRAG\u00c9SIMA ([A-Z]+)\. (.*)', '\n##### CUADRAG\u00c9SIMA \\1\n\n\\2\n', text, flags = re.MULTILINE)

# fix linebreaks (Disposiciones transitorias)
# In-sentence references to letters ('letras a. y b.') were broken into list items by the
# generic list rules; the patterns below join them again. The first rule now keeps the
# word 'letras ' in its replacement, like all of its siblings (it used to drop it).
text = re.sub(r'letras \n\n\na\. y \n\n\nb\.', 'letras a. y b.', text, flags = re.MULTILINE)
text = re.sub(r'letra \n\n\nc\. del', 'letra c. del', text, flags = re.MULTILINE)
text = re.sub(r'letras \n\n\na\., \n\n\nb\., \n\n\nc\. y \n\n\nd.', 'letras a., b., c. y d.', text, flags = re.MULTILINE)
# the subtitle step also prefixed 'Gobierno y Administración ...' where it occurs inside a
# sentence; undo the heading there
text = re.sub(r'org\u00e1nica constitucional de \n#### Gobierno y Administraci\u00f3n', 'org\u00e1nica constitucional de Gobierno y Administraci\u00f3n', text, flags = re.MULTILINE)
text = re.sub(r'org\u00e1nica constitucional sobre \n#### Gobierno y Administraci\u00f3n', 'org\u00e1nica constitucional sobre Gobierno y Administraci\u00f3n', text, flags = re.MULTILINE)
text = re.sub(r'letras \n\n\na\. a la \n\n\ne\.', 'letras a. a la e.', text, flags = re.MULTILINE)
text = re.sub(r' - El 50 por ciento', '\n\n- El 50 por ciento', text, flags = re.MULTILINE)
text = re.sub(r'literales \n\n\n\nd\., \n\n\ne\., \n\n\ng\. y \n\n\nj\.', 'literales d., e., g. y j.', text, flags = re.MULTILINE)
text = re.sub(r'\n\nDe la participaci\u00f3n de los pueblos ind\u00edgenas en la elecci\u00f3n de convencionales constituyentes.$', ' - De la participaci\u00f3n de los pueblos ind\u00edgenas en la elecci\u00f3n de convencionales constituyentes.', text, flags = re.MULTILINE)
text = re.sub(r'letras \n\n\na\., \n\n\nc\., \n\n\nd\., \n\n\ne\. y \n\n\nf\.', 'letras a., c., d., e. y f.', text, flags = re.MULTILINE)
# remaining transitory headings
text = re.sub(r'^CUADRAG\u00c9SIMA S\u00c9PTIMA\. ', '\n\n##### CUADRAG\u00c9SIMA S\u00c9PTIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^QUINCUAG\u00c9SIMA\. ', '\n\n##### QUINCUAG\u00c9SIMA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^QUINCUAG\u00c9SIMA PRIMERA\. ', '\n\n##### QUINCUAG\u00c9SIMA PRIMERA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^QUINCUAG\u00c9SIMA SEGUNDA\. ', '\n\n##### QUINCUAG\u00c9SIMA SEGUNDA\n\n', text, flags = re.MULTILINE)
text = re.sub(r'^QUINCUAG\u00c9SIMA TERCERA\. ', '\n\n##### QUINCUAG\u00c9SIMA TERCERA\n\n', text, flags = re.MULTILINE)
text = re.sub(r' orgánica constitucional de \n\n#### Gobierno', ' orgánica constitucional de Gobierno', text, flags = re.MULTILINE)
text = re.sub(r'literales \n\n\nc\., \n\n\nd\. y \n\n\nf\.', 'literales c., d. y f.', text, flags = re.MULTILINE)

# remove 'Anótese, tómese razón y publíquese' and what follows it
# NOTE(review): re.DOTALL is not set, so '.*' stops at the end of the line - only the rest
# of that line is removed, not the rest of the text
text = re.sub(r'An\u00f3tese, t\u00f3mese raz\u00f3n y publ\u00edquese.*', '', text, flags = re.MULTILINE)

# remove 'Lo que transcribo a Ud.' and what follows it (same line-level limitation as above)
text = re.sub(r'Lo que transcribo a Ud.*', '', text, flags = re.MULTILINE)

# convert all multiple linebreaks to single linebreaks
# (two or more consecutive newlines collapse to exactly one blank line)
text = re.sub(r'\n\n+', '\n\n', text, flags = re.MULTILINE)

# add linebreaks at the start of new paragraphs
# (every line starting with an ASCII capital A-Z; accented capitals are not matched)
text = re.sub(r'^([A-Z])', '\n\n\\1', text, flags = re.MULTILINE)

# convert consecutive whitelines to single whitelines
text = re.sub(r'\n\s*\n', '\n\n', text, flags = re.MULTILINE)

# save as markdown in utf-8
with open(os.path.join('2005', '00-CPR-2005.md'), 'w', encoding = 'utf-8') as file:
    file.write(text)

In [22]:
# split text into a list of chapters (every '## ' heading starts a new element) and
# remove the surrounding whitespace of each element
parts = [part.strip() for part in re.split(r'^## ', text, flags = re.MULTILINE)]

# everything in front of the first '## ' heading becomes the top-level key of the JSON;
# it is captured here, before the list is turned into a dict (the name `chapters0` was
# previously used without ever being defined, which raised a NameError)
chapters0 = parts[0]

# every other element is '<heading>\n\n<body>': put back the '## ' prefix consumed by the
# split, use the heading as key and keep the body as a one-element list.
# partition() yields an empty body for a heading without text instead of failing.
chapters = {}
for part in parts[1:]:
    heading, _, body = ('## ' + part).partition('\n\n')
    chapters[heading] = [body]
chapters = {chapters0: chapters}

# save chapters as json
with open(os.path.join('2005', '00-CPR-2005.json'), 'w', encoding = 'utf-8') as file:
    json.dump(chapters, file, ensure_ascii = False, indent = 4)

## 2016 Constitutional Process

Download the links from https://www.unaconstitucionparachile.cl/

In [None]:
# make sure the output folder exists
os.makedirs('2016', exist_ok = True)

# local copy of the landing page; it is fetched only when the copy is missing
main_html = os.path.join('2016', '00-index.html')

index_is_cached = os.path.exists(main_html)
if not index_is_cached:
    response = requests.get('https://www.unaconstitucionparachile.cl/', headers = headers)
    # stop here on an HTTP error status so that no file is written
    response.raise_for_status()
    with open(main_html, 'wb') as out:
        out.write(response.content)

In [None]:
# find all the links
# The page is read as bytes so that BeautifulSoup works out the encoding itself instead of
# depending on the platform's default text encoding.
with open(main_html, 'rb') as file:
    soup = bs4.BeautifulSoup(file.read(), 'html.parser')
links = soup.select('a[href^="https://www.unaconstitucionparachile.cl/wp-content/uploads/"]')

# download the files
for link in links:
    href = link.get('href')
    # last path component of the URL; it is empty when the URL ends with '/', in which
    # case there is no file name to save under and the link is skipped
    filename = href.rsplit('/', 1)[-1]
    if not filename:
        continue
    target = os.path.join('2016', filename)
    # already downloaded in a previous run
    if os.path.exists(target):
        continue
    res = requests.get(href, headers = headers)
    res.raise_for_status()
    with open(target, 'wb') as out:
        for chunk in res.iter_content(100000):
            out.write(chunk)

Some of the links above were saved as HTML pages instead of PDFs. Now download the PDFs that those pages point to.

In [None]:
# read the html files in 2016/ and download the pdfs they link to into the same folder
for html_name in os.listdir('2016'):
    # The saved index page is '00-index.html', which does not end with '.htm', so it is
    # skipped by this test alone. (The former extra comparison with `main_html` set a bare
    # file name against a joined path and was therefore always true.)
    if not html_name.endswith('.htm'):
        continue
    # read as bytes and let BeautifulSoup detect the encoding
    with open(os.path.join('2016', html_name), 'rb') as html_file:
        soup = bs4.BeautifulSoup(html_file.read(), 'html.parser')
    for link in soup.select('a[href^="https://web.archive.org/web/"]'):
        href = link.get('href')
        # last path component of the URL (empty when the URL ends with '/')
        filename = href.rsplit('/', 1)[-1]
        if not filename.endswith('.pdf'):
            continue
        if os.path.exists(os.path.join('2016', filename)):
            continue
        # this one is linked incorrectly in the html and is fetched separately below
        if filename == 'memoria_proceso_constituyente.pdf':
            continue
        print(href)
        print(filename)
        res = requests.get(href, headers = headers)
        res.raise_for_status()
        with open(os.path.join('2016', filename), 'wb') as out:
            for chunk in res.iter_content(100000):
                out.write(chunk)


# the html contains the wrong link to the pdf, this is the correct one (found on Google)
# download https://web.archive.org/web/20180904061952if_/https://www.unaconstitucionparachile.cl/memoria_proceso_constituyente.pdf as 2016/memoria_proceso_constituyente.pdf
memoria_path = os.path.join('2016', 'memoria_proceso_constituyente.pdf')
if not os.path.exists(memoria_path):
    res = requests.get('https://web.archive.org/web/20180904061952if_/https://www.unaconstitucionparachile.cl/memoria_proceso_constituyente.pdf', headers = headers)
    res.raise_for_status()
    with open(memoria_path, 'wb') as out:
        for chunk in res.iter_content(100000):
            out.write(chunk)

## 2022 Constitutional Process

Download the final text of the proposed constitution (rejected in the 2022 plebiscite)

In [None]:
# make sure the output folder exists
os.makedirs('2022', exist_ok = True)

# fetch the final text once; later runs reuse the local copy
filename = '00-Texto-Definitivo-CPR-2022.pdf'
pdf_path = os.path.join('2022', filename)
if not os.path.exists(pdf_path):
    response = requests.get('https://www.chileconvencion.cl/wp-content/uploads/2022/07/Texto-Definitivo-CPR-2022-Tapas.pdf', headers = headers)
    # stop here on an HTTP error status so that no file is written
    response.raise_for_status()
    with open(pdf_path, 'wb') as out:
        for block in response.iter_content(100000):
            out.write(block)

Download the links from https://www.chileconvencion.cl/documentos/ (meetings' summaries, etc.)

In [None]:
# local copy of the documents page; it is fetched only when the copy is missing
main_html = os.path.join('2022', '00-index.html')

index_is_cached = os.path.exists(main_html)
if not index_is_cached:
    response = requests.get('https://www.chileconvencion.cl/documentos/', headers = headers)
    # stop here on an HTTP error status so that no file is written
    response.raise_for_status()
    with open(main_html, 'wb') as out:
        out.write(response.content)

In [None]:
# find all the links
# The page is read as bytes so that BeautifulSoup works out the encoding itself instead of
# depending on the platform's default text encoding.
with open(main_html, 'rb') as file:
    soup = bs4.BeautifulSoup(file.read(), 'html.parser')
links = soup.select('a[href^="https://www.chileconvencion.cl/wp-content/uploads/"]')

# names of the pdf files already on disk, including the ones in subfolders
# os.walk already yields bare file names, so nothing has to be cut out of a joined path
# (the former '/'-based regex found no match on paths joined with a backslash).
pdfs = [name
        for root, dirs, files in os.walk('2022')
        for name in files
        if name.endswith('.pdf')]

# download the files
for link in links:
    href = link.get('href')
    # last path component of the URL; it is empty when the URL ends with '/', in which
    # case there is no file name to save under and the link is skipped
    filename = href.rsplit('/', 1)[-1]
    if not filename:
        continue
    # if filename matches existing files in 2022/ or its subfolders, skip
    if filename in pdfs:
        continue
    res = requests.get(href, headers = headers)
    res.raise_for_status()
    with open(os.path.join('2022', filename), 'wb') as out:
        for chunk in res.iter_content(100000):
            out.write(chunk)

In [None]:
# move the files to folders organized by topic
def move_matching(pattern, folder, flags = 0):
    """Move the pdfs directly inside 2022/ whose file name matches `pattern` to 2022/<folder>/.

    pattern -- regular expression looked up in the file name with re.search
    folder  -- name of the sub-folder of 2022/ that receives the files (created if missing)
    flags   -- optional `re` flags, e.g. re.IGNORECASE
    """
    destination = os.path.join('2022', folder)
    os.makedirs(destination, exist_ok = True)
    for filename in os.listdir('2022'):
        if filename.endswith('.pdf') and re.search(pattern, filename, flags):
            os.rename(os.path.join('2022', filename), os.path.join(destination, filename))


keywords = ['acuerdo', 'acta', 'convenio', 'cronograma', 'discurso', 'informe', 'iniciativa', 'norma', 'oficio', 'propuesta', 'reglamento']

# one folder per keyword, named with the plural of the keyword; a file that matches
# several keywords ends up in the folder of the first one, because it is gone from 2022/
# by the time the later keywords are tried
for keyword in keywords:
    move_matching(keyword, keyword + 's', re.IGNORECASE)

# stems whose plural is not 'keyword + s'. They are matched case-insensitively, like the
# keywords above; the former hand-written alternations missed 'DECLARAC' (the pattern
# listed 'Declarac' twice).
move_matching(r'citaci', 'citaciones', re.IGNORECASE)
move_matching(r'comisi', 'comisiones', re.IGNORECASE)
move_matching(r'declarac', 'declaraciones', re.IGNORECASE)

# move all files starting with 'OF' to 'oficios' (case-sensitive prefix)
move_matching(r'^OF', 'oficios')

# move all files starting with 'IPC' / 'IPI' (case-sensitive prefixes) or containing
# 'iniciat' in any case to 'iniciativas'
move_matching(r'^IPC|^IPI|(?i:iniciat)', 'iniciativas')

In [None]:
# find 'Articulado de la Iniciativa Popular de Norma' inside pdfs and move them to 'iniciativas'
os.makedirs(os.path.join('2022', 'iniciativas'), exist_ok = True)
for filename in os.listdir('2022'):
    if not filename.endswith('.pdf'):
        continue
    source = os.path.join('2022', filename)
    with open(source, 'rb') as file:
        reader = PdfReader(file)
        # kept in its own variable so that the constitution text held in `text` is not overwritten
        pdf_text = ''.join((page.extract_text() or '') + '\n' for page in reader.pages)
    # the file is moved only after it has been closed: renaming a file that is still open
    # fails on Windows
    if re.search(r'Articulado de la Iniciativa Popular de Norma|PROPUESTA\s+CONSTITUYENTE\s+CACICADOS', pdf_text):
        os.rename(source, os.path.join('2022', 'iniciativas', filename))

In [None]:
# find 'INFORME DE LA COM' or 'INFORME EJECUTIVO' inside pdfs and move them to 'informes'
os.makedirs(os.path.join('2022', 'informes'), exist_ok = True)
for filename in os.listdir('2022'):
    if not filename.endswith('.pdf'):
        continue
    source = os.path.join('2022', filename)
    with open(source, 'rb') as file:
        reader = PdfReader(file)
        # kept in its own variable so that the constitution text held in `text` is not overwritten
        pdf_text = ''.join((page.extract_text() or '') + '\n' for page in reader.pages)
    # the file is moved only after it has been closed: renaming a file that is still open
    # fails on Windows
    if re.search(r'INFORME DE LA COM|INFORME EJECUTIVO', pdf_text):
        os.rename(source, os.path.join('2022', 'informes', filename))

In [None]:
# find 'CONVENIO DE COLAB' inside pdfs and move them to 'convenios'
os.makedirs(os.path.join('2022', 'convenios'), exist_ok = True)
for filename in os.listdir('2022'):
    if not filename.endswith('.pdf'):
        continue
    source = os.path.join('2022', filename)
    with open(source, 'rb') as file:
        reader = PdfReader(file)
        # kept in its own variable so that the constitution text held in `text` is not overwritten
        pdf_text = ''.join((page.extract_text() or '') + '\n' for page in reader.pages)
    # the file is moved only after it has been closed: renaming a file that is still open
    # fails on Windows
    if re.search(r'CONVENIO\s+DE\s+COLAB', pdf_text):
        os.rename(source, os.path.join('2022', 'convenios', filename))

In [None]:
# find 'Propuesta de norma' inside pdfs and move them to 'iniciativas'
# NOTE(review): this comment used to name 'propuestas' as the target while the code has
# always moved the files to 'iniciativas'; the code's behaviour is kept - confirm which
# folder is intended.
os.makedirs(os.path.join('2022', 'iniciativas'), exist_ok = True)
for filename in os.listdir('2022'):
    if not filename.endswith('.pdf'):
        continue
    source = os.path.join('2022', filename)
    with open(source, 'rb') as file:
        reader = PdfReader(file)
        # kept in its own variable so that the constitution text held in `text` is not overwritten
        pdf_text = ''.join((page.extract_text() or '') + '\n' for page in reader.pages)
    # the file is moved only after it has been closed: renaming a file that is still open
    # fails on Windows
    if re.search(r'Propuesta\s+de\s+norma', pdf_text):
        os.rename(source, os.path.join('2022', 'iniciativas', filename))