# Introduction

**This notebook reads a directory of PDF files, extracts their text using an OCR package (pytesseract), fine-tunes an OpenAI Curie model, and runs the model on a subdirectory of PDF files. <br><br>The notebook is structured as follows:**
1. [Setup](#Setup)
2. [OCR Text Extraction](#OCR-Text-Extraction)
3. [Training Data Preparation](#Training-Data-Preparation)
4. [Fine Tuning OpenAI's Curie Model](#Fine-Tuning-OpenAI's-Curie-Model)
5. [Validation Data Preparation](#Validation-Data-Preparation)
6. [Running Fine Tuned Model](#Running-Fine-Tuned-Model)

# Setup

In [1]:
from pdf2image import convert_from_bytes
import pytesseract
import os
import pandas as pd
from tqdm import tqdm
import re
import openai
import tiktoken
import random
import json
import fitz
import time
import requests
from requests.packages.urllib3.util import ssl_
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning

In [2]:
# Display the kernel's current working directory (it is changed to `root` further down).
os.getcwd()

'C:\\Users\\matia\\OneDrive - Universidad del Pacífico\\01-Medidas_emergencia_PE\\01-DATA_PERU\\03-CODE\\Base de Reporte'

**Set your own root directory.**

In [3]:
# Project root directory.
# Defaults to the original author's OneDrive folder; set the
# MEDIDAS_EMERGENCIA_ROOT environment variable to point at another copy of the
# project without having to edit this cell.
root = os.environ.get(
    'MEDIDAS_EMERGENCIA_ROOT',
    r'C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE',
)

# Make the project root the working directory; raises if the folder does not exist.
os.chdir(root)

**We also define other important directories.**

In [4]:
# Sub-directories of the project, each built by appending a fixed
# Windows-style suffix to `root`.
data_raw = rf'{root}\01-DATA_PERU\01-DATA_RAW'                  # raw data
data_pro = rf'{root}\01-DATA_PERU\02-DATA_PROCESSED'            # processed data
documentation = rf'{root}\01-DATA_PERU\04-DATA_DOCUMENTATION'   # data documentation

# PDF Document Scraping

In [5]:
# Load the OSCE main sample from the processed-data folder (Stata .dta file).
# Every reader option is passed explicitly, one per line, so the exact load
# settings are visible at a glance.
OSCE_main_sample = pd.read_stata(
    data_pro + r'\OSCE_main_sample.dta',
    convert_dates=True,
    convert_categoricals=True,
    index_col=None,
    convert_missing=False,
    preserve_dtypes=True,
    columns=None,
    order_categoricals=True,
    chunksize=None,
    iterator=False,
    compression='infer',
    storage_options=None,
)

# Bare last expression so the notebook renders the frame.
OSCE_main_sample

Unnamed: 0,codigoconvocatoria,n_cod_contrato,urlcontrato,fecha_suscripcion_contrato,year_suscripcion,n_item1,ruc_proveedor1,ruc_destinatario_pago1,n_item2,ruc_proveedor2,...,ruc_destinatario_pago64,n_item65,ruc_proveedor65,ruc_destinatario_pago65,n_item66,ruc_proveedor66,ruc_destinatario_pago66,n_item67,ruc_proveedor67,ruc_destinatario_pago67
0,590320,2008355,http://zonasegura.seace.gob.pe/documentos//srv...,2020-07-13,2020.0,223,20503300525,20503300525,,,...,,,,,,,,,,
1,590320,2006287,http://zonasegura.seace.gob.pe/documentos//srv...,2020-08-11,2020.0,2,20503300525,20503300525,,,...,,,,,,,,,,
2,590320,2003330,http://zonasegura.seace.gob.pe/documentos//srv...,2020-05-28,2020.0,192,20536390201,20536390201,338.0,20536390201,...,,,,,,,,,,
3,591706,2025400,http://zonasegura.seace.gob.pe/documentos//srv...,2020-11-10,2020.0,33,20255361695,20255361695,34.0,20255361695,...,,,,,,,,,,
4,590320,2006532,http://zonasegura.seace.gob.pe/documentos//srv...,2020-06-26,2020.0,295,20503300525,20503300525,386.0,20503300525,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,667558,2037565,https://prodapp2.seace.gob.pe/portalseace-uiwd...,2020-12-15,2020.0,5,20332970411,20332970411,,,...,,,,,,,,,,
7496,743189,2080139,https://prodapp2.seace.gob.pe/portalseace-uiwd...,2021-10-21,2021.0,1,20521691191,20521691191,,,...,,,,,,,,,,
7497,839608,2142276,https://prodapp2.seace.gob.pe/portalseace-uiwd...,2022-09-14,2022.0,1,20100096341,20100096341,,,...,,,,,,,,,,
7498,706847,2058868,https://prodapp2.seace.gob.pe/portalseace-uiwd...,2021-05-31,2021.0,1,20600144252,20600144252,,,...,,,,,,,,,,


In [6]:
# Folder that receives the downloaded contract PDFs.
save_directory = documentation + r'\Main_sample\downloaded_pdfs'

# Create the folder (and any missing parents). `exist_ok=True` makes this a
# no-op when the folder is already there, without a separate check-then-create
# step that could race with another process.
os.makedirs(save_directory, exist_ok=True)

In [7]:
# Download every contract PDF listed in OSCE_main_sample into `save_directory`
# and collect the contracts whose download failed in `failed_downloads_df`
# (columns: n_cod_contrato, urlcontrato, failed_download).

# Override the default cipher list so that DH key exchange is not offered.
# NOTE(review): urllib3 >= 2.0 removed `DEFAULT_CIPHERS` from
# `urllib3.util.ssl_`; on such versions this assignment presumably has no
# effect -- confirm against the installed urllib3 version.
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'HIGH:!DH:!aNULL'

# (connect, read) timeouts in seconds, so that a single unresponsive server
# cannot stall the whole run.
REQUEST_TIMEOUT = (10, 60)

# Failures are gathered as plain records and turned into a DataFrame once,
# after the loop, instead of growing a DataFrame with pd.concat per failure.
failed_records = []

contracts = zip(OSCE_main_sample['urlcontrato'], OSCE_main_sample['n_cod_contrato'])

# Certificate verification is switched off (verify=False), so silence the
# resulting InsecureRequestWarning -- but only while the downloads run.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=InsecureRequestWarning)

    # Download and save PDFs
    for url, contract_code in tqdm(contracts, total=OSCE_main_sample.shape[0]):
        failure_message = None
        try:
            response = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.SSLError as e:
            failure_message = f"An SSL error occurred: {e} in contract {contract_code} with url: {url}"
        except requests.exceptions.RequestException as e:
            # Connection errors, timeouts, malformed URLs, ...: record the
            # failure and keep going instead of aborting the whole loop.
            failure_message = f"A request error occurred: {e} in contract {contract_code} with url: {url}"
        else:
            if response.status_code == 200:
                # Generate a name for the PDF based on the contract's code
                filename = os.path.join(save_directory, f'pdf_{contract_code}.pdf')
                with open(filename, 'wb') as f:
                    f.write(response.content)
            else:
                failure_message = f'Failed to download PDF {contract_code} from {url}. Status code: {response.status_code}'

        if failure_message is not None:
            # Every kind of failure (bad status, SSL error, other request
            # error) is recorded, so none is missing from the summary table.
            failed_records.append({'n_cod_contrato': contract_code, 'urlcontrato': url, 'failed_download': 1})
            print(failure_message)

failed_downloads_df = pd.DataFrame(failed_records, columns=['n_cod_contrato', 'urlcontrato', 'failed_download'])

  0%|                                                                                | 10/7500 [00:00<02:34, 48.49it/s]

Failed to download PDF 2008355 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/007a7972-8634-4d85-8b0e-6f7426e90c92. Status code: 404
Failed to download PDF 2006287 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/00cd9ad5-3c62-4075-b2a6-e650697288dc. Status code: 404
Failed to download PDF 2003330 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/01750b3f-9674-45f8-a7a4-644ea8e30daf. Status code: 404
Failed to download PDF 2025400 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/01cf55b1-0fe7-4033-9bed-1f9faab2e062. Status code: 404
Failed to download PDF 2006532 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/01e63e7a-ae05-49ee-a61e-f3cec69cd26a. Status code: 404
Failed to download PDF 2017187 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/022e262b-af91-4591-9e4a-237149d4afe8. Status code: 404
Failed to download PDF 2033785 from http://zonasegura.seace.gob.pe/documentos//srv

  0%|▏                                                                               | 22/7500 [00:00<02:22, 52.60it/s]

Failed to download PDF 2016315 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/04192377-860e-4e28-82b3-2b45668c9800. Status code: 404
Failed to download PDF 2011169 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/046379c9-51ea-4cdf-83a4-5934675c50a0. Status code: 404
Failed to download PDF 2014882 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/068d2a89-de53-41ab-a843-9836948ad200. Status code: 404
Failed to download PDF 2011181 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/06e0264c-8ae0-45b1-a181-c902db5bc8b5. Status code: 404
Failed to download PDF 2039549 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/07430560-ece4-424f-9290-16aa85ac530c. Status code: 404
Failed to download PDF 2004538 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/07a3f16a-f337-45e3-b595-8ac068c7b401. Status code: 404
Failed to download PDF 2005534 from http://zonasegura.seace.gob.pe/documentos//srv

  0%|▎                                                                               | 35/7500 [00:00<02:12, 56.16it/s]

Failed to download PDF 2168639 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/09be878e-ee91-4e68-aee1-ca4523d99fea. Status code: 404
Failed to download PDF 2008302 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0aa825bd-1bc2-4d1c-8f7a-68cdde81e6b7. Status code: 404
Failed to download PDF 2017223 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0ab7ce96-4358-47ac-8e68-75f5012031e0. Status code: 404
Failed to download PDF 2004611 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0b3514c6-8a64-441c-9070-73f9dae983cb. Status code: 404
Failed to download PDF 2002722 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0ccb448f-bf23-42ce-9dac-dfd200205d07. Status code: 404
Failed to download PDF 2047917 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0d7d7ed1-1fd6-486d-a98d-ccc0e25887c7. Status code: 404
Failed to download PDF 2025169 from http://zonasegura.seace.gob.pe/documentos//srv

  1%|▌                                                                               | 47/7500 [00:00<02:16, 54.69it/s]

Failed to download PDF 2006984 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0f51c018-8083-4be5-a054-4f988d640cf8. Status code: 404
Failed to download PDF 2006842 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0feef6e3-5d0f-427e-b996-0c489f0f39f1. Status code: 404
Failed to download PDF 2016732 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/0fef59f5-28bc-4cbc-8f71-df9eaf63e20e. Status code: 404
Failed to download PDF 2014563 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/1019c9c5-0e62-40a3-b4c7-3ce8687c0889. Status code: 404
Failed to download PDF 2012122 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/113c95a2-8f05-4066-9dfb-9cfa887dabbe. Status code: 404
Failed to download PDF 2015512 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/1253a8e2-e177-4641-89a2-ee24f50ee565. Status code: 404
Failed to download PDF 2016673 from http://zonasegura.seace.gob.pe/documentos//srv

  1%|▋                                                                               | 60/7500 [00:01<02:11, 56.51it/s]

Failed to download PDF 2008547 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/13ec12c6-bf68-45f9-b2d2-532bc72cb783. Status code: 404
Failed to download PDF 2005200 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/145fd019-9130-4fcc-b166-b0825c2175cc. Status code: 404
Failed to download PDF 2006265 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/148e415b-64fe-41c4-99cb-480b7d55c88e. Status code: 404
Failed to download PDF 2012578 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/14a540f7-3b14-4793-96c9-cdaecf5e86f8. Status code: 404
Failed to download PDF 2007646 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/14df29c9-8382-43c9-ac9f-34bed262637c. Status code: 404
Failed to download PDF 2014176 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/1540d6e5-1c6b-4914-8950-e0b82f13acf6. Status code: 404
Failed to download PDF 2022960 from http://zonasegura.seace.gob.pe/documentos//srv

  1%|▊                                                                               | 72/7500 [00:01<02:09, 57.28it/s]

Failed to download PDF 2015718 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/1622b6b9-7fe4-4676-87ed-231af7e48d41. Status code: 404
Failed to download PDF 2006234 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/16325632-1312-42aa-bdf4-df56303c22c7. Status code: 404
Failed to download PDF 2007787 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/163ee66b-ddff-4860-a0d6-d5e6fc359882. Status code: 404
Failed to download PDF 2012601 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/16941fba-a42f-4e23-b20d-c6122bd29be1. Status code: 404
Failed to download PDF 2005273 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/16f93def-f0e7-423a-9044-d13aeea1c831. Status code: 404
Failed to download PDF 2016300 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/171ed191-961a-4b9d-b7b4-c9c8d03df2f3. Status code: 404
Failed to download PDF 2008273 from http://zonasegura.seace.gob.pe/documentos//srv


  1%|▊                                                                               | 78/7500 [00:01<02:08, 57.89it/s]

Failed to download PDF 2010334 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/18d3e516-471d-4723-aed1-bb7f593054c8. Status code: 404
Failed to download PDF 2011465 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/19241e0e-1c73-444d-b205-f4f3c3de5954. Status code: 404
Failed to download PDF 2005743 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/19f249a2-9ec0-48eb-932d-d6e38928618f. Status code: 404
Failed to download PDF 2019673 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/1a28a351-3a5e-47a8-911a-fb44cd54c02c. Status code: 404
Failed to download PDF 2011126 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/1a73002b-4be0-4441-9656-eaffc74472d2. Status code: 404
Failed to download PDF 2006177 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/1a8cf639-8bad-43fe-9b4e-23788c8fa943. Status code: 404
Failed to download PDF 2007817 from http://zonasegura.seace.gob.pe/documentos//srv

  1%|▉                                                                               | 92/7500 [00:01<02:02, 60.37it/s]

Failed to download PDF 2002628 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/203a2274-8cf9-4a9e-80cd-5dbe5eb72487. Status code: 404
Failed to download PDF 2029005 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/204bc3d7-cd20-4c90-b982-0504508ef3ec. Status code: 404
Failed to download PDF 2016621 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/20c7d692-b9aa-4f1e-a461-75138cbbdc23. Status code: 404
Failed to download PDF 2017259 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/211b214e-d001-439b-ae36-a334fd42b12e. Status code: 404
Failed to download PDF 2012185 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/2146afe7-9bf8-4d9a-9d12-02a4b24470fb. Status code: 404
Failed to download PDF 2016288 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/2259b2b9-d55d-493b-a985-68daa1e2dc94. Status code: 404
Failed to download PDF 2022430 from http://zonasegura.seace.gob.pe/documentos//srv

  2%|█▏                                                                             | 113/7500 [00:01<01:55, 63.91it/s]

Failed to download PDF 2008292 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/26750752-f4c6-4951-950b-57ec0be414f8. Status code: 404
Failed to download PDF 2011108 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/272763ce-d1e8-4fb9-9c68-ef8aed739a06. Status code: 404
Failed to download PDF 2029713 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/277396e9-f519-4385-8630-b90d8a4235c3. Status code: 404
Failed to download PDF 2044306 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/2854f153-6304-4f31-ba90-66fe4a89b3f0. Status code: 404
Failed to download PDF 2008254 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/2a34a702-b491-4021-a0a3-1a89a9def4cd. Status code: 404
Failed to download PDF 2028182 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/2a4711a3-aff6-4187-a565-840f9c1cbb0e. Status code: 404
Failed to download PDF 2013549 from http://zonasegura.seace.gob.pe/documentos//srv


  2%|█▎                                                                             | 120/7500 [00:02<01:55, 64.07it/s]

Failed to download PDF 2029923 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/30098ff3-b7f6-438a-8c48-6df78c38e3cd. Status code: 404
Failed to download PDF 2037013 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/30d3b0db-3ce8-4295-a5ab-c12e1b5f6c02. Status code: 404
Failed to download PDF 2011990 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/30dd4066-730e-4cfe-b1f8-76bfe9135c1d. Status code: 404
Failed to download PDF 2039516 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/315f2163-9255-41e9-9085-cf57bca12751. Status code: 404
Failed to download PDF 2012647 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/318066a8-2be1-4e51-84ad-c2f5c77f7193. Status code: 404
Failed to download PDF 2008165 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/31858d8b-c63c-42f0-9998-35688def0b67. Status code: 404
Failed to download PDF 2012078 from http://zonasegura.seace.gob.pe/documentos//srv

  2%|█▍                                                                             | 134/7500 [00:02<02:00, 61.28it/s]

Failed to download PDF 2007118 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/33babb5e-408d-45cf-bf72-c5024832d8fb. Status code: 404
Failed to download PDF 2008776 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/345d955d-72bc-499c-a1bc-0f7eb6833b2e. Status code: 404
Failed to download PDF 2037271 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/3498b192-42c6-4f07-a6a4-fe50c858c6af. Status code: 404
Failed to download PDF 2014718 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/34b014be-9144-49fb-9008-efa82687672d. Status code: 404
Failed to download PDF 2052184 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/34c0b7b6-100d-4ecf-bb2f-75df272b02be. Status code: 404
Failed to download PDF 2006592 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/34de9f47-6553-4edb-a8c6-17ab3bc84b68. Status code: 404
Failed to download PDF 2003276 from http://zonasegura.seace.gob.pe/documentos//srv

  2%|█▌                                                                             | 148/7500 [00:02<01:57, 62.76it/s]

Failed to download PDF 2005810 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/3bf86cef-e628-4f75-b837-b6198ea3baff. Status code: 404
Failed to download PDF 2012595 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/3ccc4904-7a8f-407f-9387-a5f19ba11ff8. Status code: 404
Failed to download PDF 2018458 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/3ceb5723-771b-4aab-996f-94049d7c781d. Status code: 404
Failed to download PDF 2020104 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/3dd63896-aa8a-425e-8806-ebb016d64541. Status code: 404
Failed to download PDF 2015580 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/3e19d543-7c23-4478-a098-04a6313c2a75. Status code: 404
Failed to download PDF 2003215 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/3e88e6e9-f621-455d-9533-2dc1d6c6c021. Status code: 404
Failed to download PDF 2023030 from http://zonasegura.seace.gob.pe/documentos//srv

  2%|█▋                                                                             | 162/7500 [00:02<02:00, 60.91it/s]

Failed to download PDF 2010708 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/44ab2bfc-b8c6-439e-b47e-535f9fcee7a6. Status code: 404
Failed to download PDF 2008352 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/44b10135-5fd3-401d-b8de-3fd8ed6bc75a. Status code: 404
Failed to download PDF 2054741 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4508d699-195e-4973-bc65-0f1b5806981e. Status code: 404
Failed to download PDF 2008360 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4566521b-981a-4071-b287-3fb116cc7e4c. Status code: 404
Failed to download PDF 2006043 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4576d1e2-ab52-4cc8-ac70-c067e2e9b468. Status code: 404
Failed to download PDF 2035033 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/45847467-7a61-4024-9ed9-ecfcab3fbe14. Status code: 404
Failed to download PDF 2003202 from http://zonasegura.seace.gob.pe/documentos//srv

  2%|█▊                                                                             | 175/7500 [00:02<02:05, 58.30it/s]

Failed to download PDF 2034594 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/485bcc96-164d-441f-9794-7d54341d8b31. Status code: 404
Failed to download PDF 2013201 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/48b0aec4-cc4a-4c85-891a-7fbfe087738a. Status code: 404
Failed to download PDF 2002995 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4912747a-a003-4bd2-9f82-09159f0d0ce2. Status code: 404
Failed to download PDF 2012087 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/49502421-76b6-4a0d-9edc-8c83090070d6. Status code: 404
Failed to download PDF 2011342 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4a143581-8538-4913-babd-27ef611be0a1. Status code: 404
Failed to download PDF 2015627 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4ae0bff3-2bac-457d-9632-0ba6a54cef7c. Status code: 404
Failed to download PDF 2028518 from http://zonasegura.seace.gob.pe/documentos//srv

  2%|█▉                                                                             | 187/7500 [00:03<02:10, 55.98it/s]

Failed to download PDF 2014378 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4ea49652-a432-4bcd-8249-60801bc23a89. Status code: 404
Failed to download PDF 2026851 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4f3d705c-30e3-44e5-8421-b45cd0de5785. Status code: 404
Failed to download PDF 2020227 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4f7c349f-704f-4aeb-bc87-789e63885b62. Status code: 404
Failed to download PDF 2006369 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4f982ef3-92ab-4034-90a3-373489b1d26a. Status code: 404
Failed to download PDF 2005982 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4fde3931-cd29-4199-a4fd-30fe69cd38d7. Status code: 404
Failed to download PDF 2010970 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/4fe70c7a-1b13-4339-b4bb-05124e2d0772. Status code: 404
Failed to download PDF 2003050 from http://zonasegura.seace.gob.pe/documentos//srv

  3%|██                                                                             | 200/7500 [00:03<02:06, 57.72it/s]

Failed to download PDF 2011136 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/50ddca24-5e3b-461b-b302-126c9d8ec4bb. Status code: 404
Failed to download PDF 2010845 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/52519bbf-1c67-4791-9311-8dd4cea8bbda. Status code: 404
Failed to download PDF 2002823 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/525ae75f-eafd-4c00-a4f3-0751ee0ae8aa. Status code: 404
Failed to download PDF 2007795 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/531708fc-41ce-4e12-b136-8238ebc36923. Status code: 404
Failed to download PDF 2006428 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5370a494-a8d8-45df-82b5-10478bd4700c. Status code: 404
Failed to download PDF 2015955 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/537d22a0-7cd2-4b1c-a90b-5b28fa5373fb. Status code: 404
Failed to download PDF 2006867 from http://zonasegura.seace.gob.pe/documentos//srv

  3%|██▎                                                                            | 214/7500 [00:03<01:58, 61.35it/s]

Failed to download PDF 2044299 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/570199ea-7f42-41c1-9bb1-8d190e44f7a5. Status code: 404
Failed to download PDF 2049658 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/578dbf25-24c8-49c7-a461-d9679cc2c7ab. Status code: 404
Failed to download PDF 2019401 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/58bec086-b2b9-4d0e-ae09-ba1b62984e8f. Status code: 404
Failed to download PDF 2016738 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/58fbc150-e463-48c3-8c64-e8c7e6f917f8. Status code: 404
Failed to download PDF 2014203 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5931e8b4-acc1-4b9e-b39a-c41b8d763ed3. Status code: 404
Failed to download PDF 2003285 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5a06cba1-f8cd-4ce6-981b-1dd5e07cceac. Status code: 404
Failed to download PDF 2007344 from http://zonasegura.seace.gob.pe/documentos//srv

  3%|██▍                                                                            | 228/7500 [00:03<02:01, 60.01it/s]

Failed to download PDF 2010542 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5f7c2a0c-b1e8-4862-a74d-8b87412ba714. Status code: 404
Failed to download PDF 2015725 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5fb66e4f-0e54-443a-b15e-09852cd9c857. Status code: 404
Failed to download PDF 2008622 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5fd478e7-a77e-49e5-8b41-5c171590ec9b. Status code: 404
Failed to download PDF 2039580 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5fdb366a-3d69-4e53-ae95-3b0a47e5db11. Status code: 404
Failed to download PDF 2017183 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/5fdb487c-568d-4ff9-bc4f-a5b34d415213. Status code: 404
Failed to download PDF 2011379 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/60a5f15c-08a9-433c-a39b-cbb329d173c3. Status code: 404
Failed to download PDF 2016115 from http://zonasegura.seace.gob.pe/documentos//srv

  3%|██▌                                                                            | 241/7500 [00:04<02:05, 57.83it/s]

Failed to download PDF 2019757 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/63e131e3-5d6f-439a-8ed4-6ad98c7c13a0. Status code: 404
Failed to download PDF 2006853 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6411839f-38e8-4f3a-88dd-9a2a2f6f85ce. Status code: 404
Failed to download PDF 2005428 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/647cda39-0748-42fa-80e3-00f2c5ef8458. Status code: 404
Failed to download PDF 2012974 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/64a80bf7-9902-42e4-b3b3-a72766b4c1fc. Status code: 404
Failed to download PDF 2008950 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/64d3a13c-2c32-45df-bfcc-65d1ebabf03c. Status code: 404
Failed to download PDF 2005313 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/654b842c-29f8-4236-8e44-2b745ab79d96. Status code: 404
Failed to download PDF 2037902 from http://zonasegura.seace.gob.pe/documentos//srv

  3%|██▋                                                                            | 254/7500 [00:04<02:02, 59.14it/s]

Failed to download PDF 2016052 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/667069a3-534e-4d13-9edf-079fd0d6e5d3. Status code: 404
Failed to download PDF 2017218 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/66e213a1-d925-4bfd-a28d-4f1b5cdb30e3. Status code: 404
Failed to download PDF 2016001 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/676c93a8-6529-46bf-9efc-017938306f92. Status code: 404
Failed to download PDF 2005288 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/67e5fad4-6f7d-48ee-8f23-d2a4f8bd9be5. Status code: 404
Failed to download PDF 2007250 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/685d8852-db1e-4937-ab54-6ceecdd1fd2e. Status code: 404
Failed to download PDF 2016847 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6926b2aa-eed8-455c-a932-d610053d0494. Status code: 404
Failed to download PDF 2016855 from http://zonasegura.seace.gob.pe/documentos//srv

  4%|██▊                                                                            | 267/7500 [00:04<02:02, 58.86it/s]

Failed to download PDF 2012194 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6c892538-579b-4122-a7f4-c924dbdd59cb. Status code: 404
Failed to download PDF 2018624 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6cc42982-241e-4d06-a672-617ab7170302. Status code: 404
Failed to download PDF 2010034 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6cd3d606-7a02-4a24-9d4a-2197bf125662. Status code: 404
Failed to download PDF 2002717 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6d1a9e64-8e43-47f0-9c03-303d5ec66be8. Status code: 404
Failed to download PDF 2016191 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6d625a73-85f9-4e5a-b75d-fc080f800bcd. Status code: 404
Failed to download PDF 2012300 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/6e5571a3-2d8d-4b25-b231-c9b9035e8e05. Status code: 404
Failed to download PDF 2008214 from http://zonasegura.seace.gob.pe/documentos//srv

  4%|██▉                                                                            | 279/7500 [00:04<02:09, 55.63it/s]

Failed to download PDF 2047422 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/702da2d6-79ce-4cc6-a8e4-90000f741224. Status code: 404
Failed to download PDF 2005281 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/70588da1-a209-48f0-9e2d-4916920eca80. Status code: 404
Failed to download PDF 2014197 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/708da2b2-eb95-4f9f-b631-2d9c2210a291. Status code: 404
Failed to download PDF 2009579 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/71a5d94a-f3a3-490e-abb9-b9e251b71426. Status code: 404
Failed to download PDF 2013882 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/71d73487-dc17-47f8-aab4-e93ad0c533c4. Status code: 404
Failed to download PDF 2004528 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7220d415-550f-4a40-b0b9-5d2796e252f4. Status code: 404
Failed to download PDF 2008594 from http://zonasegura.seace.gob.pe/documentos//srv

  4%|███                                                                            | 291/7500 [00:05<02:15, 53.37it/s]

Failed to download PDF 2012269 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/750b1097-f5e2-4838-a716-e64884662919. Status code: 404
Failed to download PDF 2013162 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/75617b27-265f-41f2-9b61-fddaa610c1b2. Status code: 404
Failed to download PDF 2011133 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/757ffe93-0fc2-4c77-9077-2f30bb42e4c0. Status code: 404
Failed to download PDF 2006351 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/76648910-47c2-460c-87e3-38bdc00915a0. Status code: 404
Failed to download PDF 2022156 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/767a5b1b-eb35-4bb0-84ab-a1ca4ea58b88. Status code: 404
Failed to download PDF 2025163 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/76b0ff46-6446-498e-b9db-fd4ad108f010. Status code: 404
Failed to download PDF 2038853 from http://zonasegura.seace.gob.pe/documentos//srv

  4%|███▏                                                                           | 303/7500 [00:05<02:19, 51.51it/s]

Failed to download PDF 2006833 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/790dada1-e8a7-4fe5-8267-4858f7e3a1f7. Status code: 404
Failed to download PDF 2018319 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/79abbaf6-65c9-4f2b-b2e8-41fcbeead96b. Status code: 404
Failed to download PDF 2017801 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7a3eaf39-88c8-4d31-a009-f3b5c9deb2d6. Status code: 404
Failed to download PDF 2007336 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7a61ad06-ce56-4904-ba1b-665175a9e5d7. Status code: 404
Failed to download PDF 2013715 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7b34cc9a-6d2b-47b4-9c64-c6d8cb2f143f. Status code: 404
Failed to download PDF 2012150 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7b70fb66-f2e8-4285-9dcf-aec0eb86fae3. Status code: 404
Failed to download PDF 2014468 from http://zonasegura.seace.gob.pe/documentos//srv


  4%|███▎                                                                           | 309/7500 [00:05<02:21, 50.97it/s]

Failed to download PDF 2023909 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7e16e123-675e-4472-9f9c-cc1a8f73b383. Status code: 404
Failed to download PDF 2017769 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7e348d5a-dc19-4333-8171-0d9544dd925d. Status code: 404
Failed to download PDF 2041995 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7e6db6c8-c5e6-40c3-868c-0e5d5240d0b3. Status code: 404
Failed to download PDF 2051644 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7eb163a3-d5b7-4a34-a694-22f221df25a9. Status code: 404
Failed to download PDF 2018387 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7f4b3009-5fd6-48e7-b26b-a86e54bf48b5. Status code: 404
Failed to download PDF 2024791 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/7f6c815a-da25-4e14-9836-af8f8498a62b. Status code: 404
Failed to download PDF 2035541 from http://zonasegura.seace.gob.pe/documentos//srv

  4%|███▍                                                                           | 321/7500 [00:05<02:20, 50.92it/s]

Failed to download PDF 2030499 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/84054651-3292-435d-a1ad-9a1f6330f256. Status code: 404
Failed to download PDF 2016709 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/8453ee1f-08c1-49d0-aca7-e9d581687f51. Status code: 404
Failed to download PDF 2014331 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/854444f4-d1c1-47e1-846f-0eb6f53d3126. Status code: 404
Failed to download PDF 2008343 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/856edc52-8fad-48dc-9c99-1798368ee5e4. Status code: 404
Failed to download PDF 2021475 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/85d360f5-5017-4e39-892b-782baced4964. Status code: 404
Failed to download PDF 2011188 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/85e64a83-628c-49b1-9078-9d1216906693. Status code: 404
Failed to download PDF 2015701 from http://zonasegura.seace.gob.pe/documentos//srv

  4%|███▌                                                                           | 333/7500 [00:05<02:17, 52.16it/s]

Failed to download PDF 2024542 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/8a56b4c3-c42e-43d3-b84f-e7a60f963d53. Status code: 404
Failed to download PDF 2006604 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/8a645a78-10ca-40ff-889a-d2ed9a6e6fa3. Status code: 404
Failed to download PDF 2012180 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/8aec04be-315c-4efc-b404-7ad6e0ce28b4. Status code: 404
Failed to download PDF 2012314 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/8b179e46-6c35-42b8-affe-89f14eae8c05. Status code: 404
Failed to download PDF 2020028 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/8b2a2bdf-b263-4b3a-8e21-6c76e945a540. Status code: 404
Failed to download PDF 2003113 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/8b716e5f-23de-4b94-9094-b843e7e87525. Status code: 404
Failed to download PDF 2029204 from http://zonasegura.seace.gob.pe/documentos//srv

  5%|███▋                                                                           | 345/7500 [00:06<02:13, 53.79it/s]

Failed to download PDF 2043205 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/90f271fb-c4ad-4498-b309-2840f14af90b. Status code: 404
Failed to download PDF 2005644 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/911f42fc-ba35-479d-8731-7546de403a43. Status code: 404
Failed to download PDF 2013766 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9143a566-efeb-4545-bdee-592367e03337. Status code: 404
Failed to download PDF 2011080 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9146b470-e87c-4b46-b5b3-c12badf02c15. Status code: 404
Failed to download PDF 2028619 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/91a9bdd4-22bb-4e3f-afcd-3145b0e49c82. Status code: 404
Failed to download PDF 2011228 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/924e1a85-ce30-4573-9772-4541d96a0fcc. Status code: 404
Failed to download PDF 2016201 from http://zonasegura.seace.gob.pe/documentos//srv

  5%|███▊                                                                           | 358/7500 [00:06<02:04, 57.35it/s]

Failed to download PDF 2014115 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/95a74d33-e522-4652-b705-3ec08fa28483. Status code: 404
Failed to download PDF 2012367 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9667b28f-61d6-4172-b6ae-5a99d09dce08. Status code: 404
Failed to download PDF 2004169 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/96e8bdeb-8023-49ef-baaa-1fb84997607a. Status code: 404
Failed to download PDF 2008340 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/97465866-9ffe-41fd-8d1d-c29f2d4d8370. Status code: 404
Failed to download PDF 2013785 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/985b1eaf-136b-4e33-ae97-bb1562d4b882. Status code: 404
Failed to download PDF 2014753 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/98df0aec-3717-428f-8c47-7c1f8f9e81e0. Status code: 404
Failed to download PDF 2047019 from http://zonasegura.seace.gob.pe/documentos//srv

  5%|███▉                                                                           | 372/7500 [00:06<01:59, 59.87it/s]

Failed to download PDF 2016263 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9c79d983-9a39-4fc1-8761-58480c1eb139. Status code: 404
Failed to download PDF 2028847 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9c85fc7d-9372-4e44-bbbb-56df26ea0e7a. Status code: 404
Failed to download PDF 2009653 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9ccea327-6736-41f4-9df2-4cd971c36989. Status code: 404
Failed to download PDF 2010080 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9cf33419-8bb6-429c-8750-a956aa04dc0f. Status code: 404
Failed to download PDF 2003458 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9d398b34-822d-484a-ab27-a1cbe312c6d9. Status code: 404
Failed to download PDF 2012143 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/9da31fa2-9d50-46ed-a1ed-dd91f1c445eb. Status code: 404
Failed to download PDF 2025195 from http://zonasegura.seace.gob.pe/documentos//srv

  5%|████                                                                           | 384/7500 [00:06<02:04, 57.14it/s]

Failed to download PDF 2007639 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a271370e-4977-4982-ad8b-96613e6518f2. Status code: 404
Failed to download PDF 2034950 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a2f7d7dd-5aa4-4a6a-a28d-769a5affd95f. Status code: 404
Failed to download PDF 2012431 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a3ddaeab-0ce6-42b5-b5de-9bdb52fd9e07. Status code: 404
Failed to download PDF 2004004 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a4160560-67bd-4da4-8d09-8317d3d0e5c4. Status code: 404
Failed to download PDF 2023246 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a42697ad-da42-4da9-972f-d89679869167. Status code: 404
Failed to download PDF 2039532 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a4daf48e-46f5-4eac-97f3-854afd6ef39e. Status code: 404
Failed to download PDF 2016301 from http://zonasegura.seace.gob.pe/documentos//srv

  5%|████▏                                                                          | 396/7500 [00:06<02:07, 55.87it/s]

Failed to download PDF 2003403 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a874bc97-e2a4-438c-aebf-6d0714298f23. Status code: 404
Failed to download PDF 2011929 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a8c24c54-4fb4-4bf7-890b-7d95895d52d5. Status code: 404
Failed to download PDF 2011948 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a8e07b22-b346-4e5c-82f4-42ff14b4cd62. Status code: 404
Failed to download PDF 2007451 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a91e5710-b222-489e-b08a-285a3f73dcfb. Status code: 404
Failed to download PDF 2024726 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a9fa9806-c217-4b67-b74b-f1bb130449ea. Status code: 404
Failed to download PDF 2008569 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/a9fef9d4-f679-4f86-9c1d-5a824ca11906. Status code: 404
Failed to download PDF 2009634 from http://zonasegura.seace.gob.pe/documentos//srv

  5%|████▎                                                                          | 410/7500 [00:07<01:56, 61.04it/s]

Failed to download PDF 2006304 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/adc06826-f2d8-41a3-a36c-73c3b85ff69f. Status code: 404
Failed to download PDF 2012171 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ae3094dc-6a61-44a0-af5e-42c1e8f5c9f5. Status code: 404
Failed to download PDF 2008729 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/aec81a9d-db58-4ca9-a2dc-6e1f64f8add7. Status code: 404
Failed to download PDF 2014218 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/af5f96cb-267f-4dd7-8aeb-66e8f54f31d1. Status code: 404
Failed to download PDF 2038676 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/af7b69eb-3526-4e42-81c1-3c2044c231bf. Status code: 404
Failed to download PDF 2031887 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/b012d79f-d50e-4795-9337-d6b627fd3e87. Status code: 404
Failed to download PDF 2040593 from http://zonasegura.seace.gob.pe/documentos//srv

  6%|████▍                                                                          | 424/7500 [00:07<01:57, 60.48it/s]

Failed to download PDF 2015120 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/b4a37749-dc63-494f-b9a0-e3c25b319358. Status code: 404
Failed to download PDF 2014729 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/b553dec8-589f-4923-a916-a2231fbe080a. Status code: 404
Failed to download PDF 2009117 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/b5e5ee86-159c-499f-88eb-b7804c70a338. Status code: 404
Failed to download PDF 2006411 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/b66c39f9-9356-4396-b6a2-8b2306e3c847. Status code: 404
Failed to download PDF 2037832 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/b6850c94-1387-42b0-8874-4ee8471aff2c. Status code: 404
Failed to download PDF 2014798 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/b6e96e9c-d5c1-478b-aa67-1d0903379f48. Status code: 404
Failed to download PDF 2007907 from http://zonasegura.seace.gob.pe/documentos//srv

  6%|████▌                                                                          | 438/7500 [00:07<01:56, 60.64it/s]

Failed to download PDF 2005987 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/badda260-c571-4e39-970a-4676b3dac271. Status code: 404
Failed to download PDF 2047641 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/baf9da7e-c779-42a0-98d8-15fa5a2ea8e8. Status code: 404
Failed to download PDF 2008953 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bbafd277-6d9e-40b6-acce-96e5457b37c8. Status code: 404
Failed to download PDF 2023837 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bc88cf8f-fc59-4a67-b121-73695b763318. Status code: 404
Failed to download PDF 2013155 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bcce7b6f-1724-4dda-a576-68e9df6e3778. Status code: 404
Failed to download PDF 2043075 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bd0de192-b087-492f-a2cc-26d144d73b64. Status code: 404
Failed to download PDF 2018365 from http://zonasegura.seace.gob.pe/documentos//srv

  6%|████▊                                                                          | 452/7500 [00:07<01:55, 61.27it/s]

Failed to download PDF 2006338 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bee6b879-b926-4d2d-9f49-5b175b2c901b. Status code: 404
Failed to download PDF 2037044 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bf5aefb1-1da0-4b5b-ab52-99195137ba64. Status code: 404
Failed to download PDF 2016239 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bfb154b7-23d3-4f83-b7e2-ca014ce591e2. Status code: 404
Failed to download PDF 2036020 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/bfbe7320-5763-4fc0-a9d8-d6c10f0ed6ba. Status code: 404
Failed to download PDF 2040873 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c0e70e40-af5c-4b72-a69d-243e3e2f15fa. Status code: 404
Failed to download PDF 2031914 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c14577fb-d0b7-4e05-a9df-b2b12eab6a0d. Status code: 404
Failed to download PDF 2024772 from http://zonasegura.seace.gob.pe/documentos//srv

  6%|████▉                                                                          | 466/7500 [00:08<01:56, 60.35it/s]

Failed to download PDF 2004069 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c3d98951-11d9-43bd-9714-0ef8978668f4. Status code: 404
Failed to download PDF 2016297 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c3eb5373-7e39-4b45-9f6f-645a6076a56b. Status code: 404
Failed to download PDF 2017735 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c44580d0-935d-488b-87c6-1b6715bb1f89. Status code: 404
Failed to download PDF 2005427 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c460b126-2b55-4b16-9c85-b7bd549c3e75. Status code: 404
Failed to download PDF 2007254 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c506948b-9ceb-456b-8e26-5811d8034332. Status code: 404
Failed to download PDF 2025728 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c58f749d-338d-40e1-b16d-a0708c42ef40. Status code: 404
Failed to download PDF 2019958 from http://zonasegura.seace.gob.pe/documentos//srv

  6%|█████                                                                          | 480/7500 [00:08<01:59, 58.96it/s]

Failed to download PDF 2012172 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c7e148e0-7d48-4895-9d43-f15153a8367d. Status code: 404
Failed to download PDF 2015105 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c7fc3877-c664-45e6-8b0e-3b61c82317a8. Status code: 404
Failed to download PDF 2017147 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c8f9ac95-8c68-4bc6-9441-f6e6d4f947db. Status code: 404
Failed to download PDF 2014963 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c93d453e-6fea-45cc-adbc-38cf416d3b37. Status code: 404
Failed to download PDF 2004854 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c9683714-564a-4b55-b24a-ee65cbb8cd4e. Status code: 404
Failed to download PDF 2011375 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/c99c312e-261a-460a-9b8d-77000efac1cf. Status code: 404
Failed to download PDF 2016765 from http://zonasegura.seace.gob.pe/documentos//srv


  6%|█████                                                                          | 486/7500 [00:08<01:59, 58.82it/s]

Failed to download PDF 2015074 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/cd461a7f-9d20-4b57-945f-79ee3ff9a508. Status code: 404
Failed to download PDF 2014741 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/cd8b257f-e382-4671-9b2a-90b5b0de4237. Status code: 404
Failed to download PDF 2016428 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/cdc3a28c-6a5d-4501-8fbb-85730500511a. Status code: 404
Failed to download PDF 2036092 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ce499c4d-77de-4379-a996-8cb10e299af4. Status code: 404
Failed to download PDF 2007346 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ce6e8eb6-f347-44a9-8f4d-61c29101d63d. Status code: 404
Failed to download PDF 2010822 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ce8d2f53-9dc5-4346-b6ac-437cee7788e9. Status code: 404
Failed to download PDF 2065888 from http://zonasegura.seace.gob.pe/documentos//srv

  7%|█████▎                                                                         | 500/7500 [00:08<01:55, 60.46it/s]

Failed to download PDF 2041812 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d01e6c08-ac8d-4404-bd7b-7deb58b09eed. Status code: 404
Failed to download PDF 2006506 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d04483eb-981c-4e28-9d33-1127a284f7d0. Status code: 404
Failed to download PDF 2016730 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d07952ce-3805-4a2e-b1c1-8ab0115b044f. Status code: 404
Failed to download PDF 2051477 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d0c1cf12-421a-4631-8879-cb3d03a584f9. Status code: 404
Failed to download PDF 2013933 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d0e7af8a-e691-4db7-b117-fc3531f9232f. Status code: 404
Failed to download PDF 2003396 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d19107ad-cb37-4dc7-95d8-0237ae3822ca. Status code: 404
Failed to download PDF 2006821 from http://zonasegura.seace.gob.pe/documentos//srv

  7%|█████▍                                                                         | 513/7500 [00:08<02:01, 57.55it/s]

Failed to download PDF 2003235 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d6b379a8-8e95-47a7-80ca-34c544b8881c. Status code: 404
Failed to download PDF 2003521 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d731de14-766d-4d47-8f93-c29276e0e23b. Status code: 404
Failed to download PDF 2008375 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d732a173-a52d-4934-9bfb-53b5d5a65c1c. Status code: 404
Failed to download PDF 2014794 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d738b5f0-bf66-4d28-9e63-74e03e382753. Status code: 404
Failed to download PDF 2013611 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d7ef29be-3f57-45ee-9566-a26ed48161e9. Status code: 404
Failed to download PDF 2010073 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/d7f4801c-c1f1-4138-86be-d3a91d6b194d. Status code: 404
Failed to download PDF 2005740 from http://zonasegura.seace.gob.pe/documentos//srv

  7%|█████▌                                                                         | 525/7500 [00:09<02:03, 56.29it/s]

Failed to download PDF 2035507 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/db86bc7b-0cbc-4314-906b-4c32978d1c86. Status code: 404
Failed to download PDF 2024502 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/dbdec785-74cc-40b0-93e8-df779c59f107. Status code: 404
Failed to download PDF 2012154 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/dd308422-673c-474f-af90-adef84406441. Status code: 404
Failed to download PDF 2008370 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/dd776a69-7d58-4413-8ec0-f92bdae9990d. Status code: 404
Failed to download PDF 2060083 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ddeaa16c-f2f4-483e-886b-98e4a88a21e3. Status code: 404
Failed to download PDF 2022698 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/dee239d6-4dae-4651-93f5-1618a139a3f3. Status code: 404
Failed to download PDF 2036296 from http://zonasegura.seace.gob.pe/documentos//srv

  7%|█████▋                                                                         | 537/7500 [00:09<02:02, 56.63it/s]

Failed to download PDF 2015866 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e1b75b8b-e764-4d70-9847-d423e39162c8. Status code: 404
Failed to download PDF 2010495 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e1e04f12-4fac-4426-baf2-d244de497ff5. Status code: 404
Failed to download PDF 2023729 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e214242b-153d-48c5-95f9-27932582522c. Status code: 404
Failed to download PDF 2003253 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e22ce8d7-1525-4179-9c5f-9c6196883bc7. Status code: 404
Failed to download PDF 2012900 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e2e54ab9-d79d-4b5d-8b19-cff6336ef7d4. Status code: 404
Failed to download PDF 2014149 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e5d0c540-5740-4ea2-9a9f-3957e6157a86. Status code: 404
Failed to download PDF 2003942 from http://zonasegura.seace.gob.pe/documentos//srv

  7%|█████▊                                                                         | 549/7500 [00:09<02:03, 56.23it/s]

Failed to download PDF 2055793 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e76ecc43-67c3-4b86-86ea-2d58e2f81f61. Status code: 404
Failed to download PDF 2014582 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e831a548-2ef3-468a-8860-1ace543ff10e. Status code: 404
Failed to download PDF 2020693 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e8b47855-e188-4ca6-b819-0b9a75dc165f. Status code: 404
Failed to download PDF 2007265 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e8f058c3-c65b-4878-ab9e-bde8b005f5ca. Status code: 404
Failed to download PDF 2019676 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e8f696b8-053f-4b56-aa2e-bcfc534d7dba. Status code: 404
Failed to download PDF 2012542 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/e9522dc9-7242-43cb-8717-2d8b3a289ba8. Status code: 404
Failed to download PDF 2038640 from http://zonasegura.seace.gob.pe/documentos//srv

  7%|█████▉                                                                         | 562/7500 [00:09<01:57, 58.98it/s]

Failed to download PDF 2010490 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/eae08505-dceb-4497-8701-42d9b4967ac3. Status code: 404
Failed to download PDF 2015377 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/eb2298da-3668-48e1-8dfb-07a5b7f8ddfa. Status code: 404
Failed to download PDF 2008563 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/eb2e0ca9-ab95-410c-8bfc-288e93cad408. Status code: 404
Failed to download PDF 2016959 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ebbd0b1f-d45e-419c-845b-8b4f1273cb40. Status code: 404
Failed to download PDF 2015697 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ec28c98d-3d31-4f79-b38a-9da29f979967. Status code: 404
Failed to download PDF 2042848 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/ec459f4a-331f-4838-b169-08c9968375ee. Status code: 404
Failed to download PDF 2014089 from http://zonasegura.seace.gob.pe/documentos//srv

  8%|██████                                                                         | 576/7500 [00:09<01:51, 61.93it/s]

Failed to download PDF 2025409 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/efab42cc-7b67-4040-b33f-14dfacd546b0. Status code: 404
Failed to download PDF 2018669 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f033917f-16a9-49d4-a808-ba73d2ccf678. Status code: 404
Failed to download PDF 2029741 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f054c80d-5f67-4d01-bee5-d76120dcc25b. Status code: 404
Failed to download PDF 2069226 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f09cda99-5b3d-4938-bad3-27fc0ad92e6d. Status code: 404
Failed to download PDF 2016256 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f168b27f-2391-478b-bb5d-d1c8b9cd3ca9. Status code: 404
Failed to download PDF 2023068 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f16b90a1-22b1-455a-b7ec-63ebfe2d83c1. Status code: 404
Failed to download PDF 2009961 from http://zonasegura.seace.gob.pe/documentos//srv

  8%|██████▏                                                                        | 590/7500 [00:10<01:50, 62.39it/s]

Failed to download PDF 2009895 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f613a4d7-8f24-4265-868b-a1dff272beae. Status code: 404
Failed to download PDF 2015321 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f6a69647-e67e-42f7-8c52-b523fefb54c4. Status code: 404
Failed to download PDF 2015803 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f7296917-01ad-4e93-8ef8-ac1ea199560d. Status code: 404
Failed to download PDF 2018489 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f77502af-5fba-4132-8aa5-61660509b937. Status code: 404
Failed to download PDF 2012282 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f7f77e74-095b-4578-a665-3bb95824bf47. Status code: 404
Failed to download PDF 2017739 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/f82b1f75-1fe6-40c0-8056-9b8a5d2d5c17. Status code: 404
Failed to download PDF 2008453 from http://zonasegura.seace.gob.pe/documentos//srv

  8%|██████▎                                                                        | 604/7500 [00:10<01:52, 61.16it/s]

Failed to download PDF 2006497 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/fbcaec85-a467-4172-b8f5-30723d9ca626. Status code: 404
Failed to download PDF 2020057 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/fbd21ec0-df5d-46bb-9cf3-f06c68529b2d. Status code: 404
Failed to download PDF 2035811 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/fcb0fa3e-dd6a-4a8d-b715-59a3263f75d3. Status code: 404
Failed to download PDF 2049171 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/fd14f4bc-4a95-48f1-8670-a796fd28144f. Status code: 404
Failed to download PDF 2006033 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/fd189823-dfb6-446f-a5ab-46ebbb6e01e1. Status code: 404
Failed to download PDF 2018176 from http://zonasegura.seace.gob.pe/documentos//srv/nfs4/contratos/fd215f12-32a9-4abb-a990-16d26ebf012f. Status code: 404
Failed to download PDF 2047204 from http://zonasegura.seace.gob.pe/documentos//srv

 46%|██████████████████████████████████▎                                       | 3479/7500 [1:49:20<4:27:10,  3.99s/it]

Failed to download PDF 1281362 from http://zonasegura.seace.gob.pe/documentos/mon\docs\contratos\2020\10007\363697220052020154518.pdf. Status code: 404


 49%|█████████████████████████████████████▌                                      | 3706/7500 [1:56:36<36:57,  1.71it/s]

Failed to download PDF 1291996 from http://zonasegura.seace.gob.pe/documentos/mon\docs\contratos\2020\10215\363773214082020135731.pdf. Status code: 404


 51%|█████████████████████████████████████▉                                    | 3847/7500 [1:59:46<1:44:42,  1.72s/it]

Failed to download PDF 1300298 from http://zonasegura.seace.gob.pe/documentos/mon\docs\contratos\2020\10486\363773209092020181459.pdf. Status code: 404


 56%|█████████████████████████████████████████▍                                | 4205/7500 [2:13:05<1:15:42,  1.38s/it]

Failed to download PDF 1291636 from http://zonasegura.seace.gob.pe/documentos/mon\docs\contratos\2020\22\364466711082020163110.pdf. Status code: 404


100%|████████████████████████████████████████████████████████████████████████████| 7500/7500 [3:17:55<00:00,  1.58s/it]


In [8]:
# Remove the 'urlcontrato' column from failed_downloads_df in place.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
failed_downloads_df.drop(columns='urlcontrato', inplace=True, errors='ignore')

# OCR Text Extraction

**OCR extraction with Tesseract on Windows requires the program to be installed from this [link](https://github.com/UB-Mannheim/tesseract/wiki).<br> After installing, we need to specify the location of the exe file as below.**

In [9]:
# Tell pytesseract where the Tesseract executable lives.
# The location is machine-specific, so it can be overridden by setting the
# TESSERACT_CMD environment variable (a convention of this notebook only);
# when the variable is unset, the previous hard-coded install path is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

**We first define the directory where the pdfs are located.**

In [10]:
# Directory holding the downloaded contract PDFs, located under the
# documentation folder defined earlier in the notebook.
pdfs = rf'{documentation}\Main_sample\downloaded_pdfs'

**We walk through the files of the base pdf directory and its subdirectories, storing the file path of each document.**

In [11]:
# Collect the full path of every PDF under `pdfs`, including subdirectories.
# The extension check is case-insensitive, so '.PDF' files are kept as well.
filenames = [
    os.path.join(base, filename)
    for base, _dirs, files in os.walk(pdfs)
    for filename in files
    if filename.lower().endswith('.pdf')
]

# Preview up to five distinct paths. random.sample draws without replacement,
# and capping k at len(filenames) avoids an error when fewer than five PDFs
# (or none at all) were found.
random.sample(filenames, k=min(5, len(filenames)))

['C:\\Users\\matia\\OneDrive - Universidad del Pacífico\\01-Medidas_emergencia_PE\\01-DATA_PERU\\04-DATA_DOCUMENTATION\\Main_sample\\downloaded_pdfs\\pdf_1280466.pdf',
 'C:\\Users\\matia\\OneDrive - Universidad del Pacífico\\01-Medidas_emergencia_PE\\01-DATA_PERU\\04-DATA_DOCUMENTATION\\Main_sample\\downloaded_pdfs\\pdf_2151431.pdf',
 'C:\\Users\\matia\\OneDrive - Universidad del Pacífico\\01-Medidas_emergencia_PE\\01-DATA_PERU\\04-DATA_DOCUMENTATION\\Main_sample\\downloaded_pdfs\\pdf_2116665.pdf',
 'C:\\Users\\matia\\OneDrive - Universidad del Pacífico\\01-Medidas_emergencia_PE\\01-DATA_PERU\\04-DATA_DOCUMENTATION\\Main_sample\\downloaded_pdfs\\pdf_2133613.pdf',
 'C:\\Users\\matia\\OneDrive - Universidad del Pacífico\\01-Medidas_emergencia_PE\\01-DATA_PERU\\04-DATA_DOCUMENTATION\\Main_sample\\downloaded_pdfs\\pdf_2112068.pdf']

**We open each path and extract the text using the fitz package (a simple pdf reader). <br>Then we identify whether the pdf is a scanned document (an image without *"highlight-able"* text) by setting a length threshold. <br>Only if the extracted text's length is below the threshold do we re-extract the text using the Tesseract OCR package. <br>Each text is stored in a dataframe containing its filename, text, extraction type and file id.**

In [12]:
# Minimum number of ASCII letters the embedded text layer must contain for the
# PDF to be treated as a text PDF; below this the file is re-read with OCR.
MIN_EMBEDDED_LETTERS = 1000


def _clean_text(raw_text):
    """Normalise extracted text into a single line restricted to a small character set.

    The substitutions run in this exact order:
    runs of '$' -> one space, runs of newlines -> one space, runs of '.' -> one '.',
    runs of ',' -> one ',', ';' -> space, runs of spaces -> one space, and finally
    every character that is not a letter (ASCII or À-ÿ), a digit, a space or one
    of , . / : is removed.
    """
    text = re.sub(r'\$+', ' ', raw_text)   # Collapse runs of $ into one space
    text = re.sub('\n+', ' ', text)        # Collapse runs of newlines into one space
    text = re.sub(r'\.+', '.', text)       # Collapse runs of periods into one period
    text = re.sub(r'\,+', ',', text)       # Collapse runs of commas into one comma
    text = text.replace(';', ' ')          # Replace semicolons with spaces
    text = re.sub(' +', ' ', text)         # Collapse runs of spaces into one space
    return re.sub(r'[^a-zA-ZÀ-ÿ0-9 \,\.\/\:]', '', text)


def _contract_code(path):
    """Return the contract code encoded in a 'pdf_<code>.pdf' file name.

    Works from the file's base name, so it also handles files found in
    subdirectories of the download folder.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    return stem[len('pdf_'):] if stem.startswith('pdf_') else stem


def _extract_pdf_text(path):
    """Extract the text of one PDF and report how it was obtained.

    Returns a tuple (text, extraction_type) where extraction_type is
    'PDF_Reader' when the embedded text layer was used, or 'OCR' when that
    layer held fewer than MIN_EMBEDDED_LETTERS ASCII letters and the pages were
    rasterised and passed through Tesseract instead. Exceptions raised by
    fitz, pdf2image or pytesseract propagate to the caller.
    """
    reader = fitz.open(path)
    try:
        # Each page's text is followed by one space, as a page separator.
        text = ''.join(page.get_text() + ' ' for page in reader)
    finally:
        # Always release the document handle, even if a page fails to read.
        reader.close()

    if len(re.sub(r'[^a-zA-Z]', '', text)) >= MIN_EMBEDDED_LETTERS:
        return text, 'PDF_Reader'

    # Too little embedded text: treat the file as a scan and OCR every page.
    with open(path, 'rb') as file:
        pdf_bytes = file.read()
    images = convert_from_bytes(pdf_bytes)
    text = ''.join(pytesseract.image_to_string(image) + ' ' for image in images)
    return text, 'OCR'


# Rows are accumulated in plain lists and turned into dataframes once at the
# end, instead of growing the dataframes one row at a time inside the loop.
text_records = []    # one dict per successfully processed PDF
text_index = []      # position of each processed PDF within `filenames`
broken_records = []  # one dict per PDF that raised during processing

# Loop through every file collected from the directory
for position, filename in enumerate(tqdm(filenames)):
    if not filename.lower().endswith('.pdf'):
        continue

    try:
        pdf_text, extraction_type = _extract_pdf_text(filename)
        clean_text = _clean_text(pdf_text)
    except Exception as e:
        # Any failure marks the file as broken; only the printed message
        # distinguishes fitz's "broken document" error from everything else.
        if 'cannot open broken document' in str(e):
            print(f"A FileDataError occurred: {e} in {filename}")
        else:
            print(f"An unspecified error occurred: {e} in {filename}")
        broken_records.append({'n_cod_contrato': _contract_code(filename), 'broken_pdf': 1})
        continue

    # A row is recorded only after the whole file succeeded, so failed files
    # never leave a partially filled row behind.
    text_records.append({
        'filename': filename[:-4],  # full path without the '.pdf' suffix
        'text': clean_text,
        'extraction_type': extraction_type,
    })
    text_index.append(position)

# Dataframe with the text of each PDF, indexed by the file's position in `filenames`
pdf_texts_df = pd.DataFrame(
    text_records,
    columns=['filename', 'text', 'extraction_type'],
    index=text_index,
)
# Dataframe listing the contract codes whose PDF could not be processed
broken_pdfs_df = pd.DataFrame(broken_records, columns=['n_cod_contrato', 'broken_pdf'])


  0%|                                                                                         | 0/6887 [00:00<?, ?it/s]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1146041.pdf


  0%|                                                                              | 6/6887 [01:12<30:06:44, 15.75s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1149185.pdf


  1%|▌                                                                            | 50/6887 [12:17<22:08:07, 11.66s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1159482.pdf


  1%|█                                                                            | 91/6887 [27:49<40:06:38, 21.25s/it]

An unspecified error occurred: cannot open empty document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1163559.pdf


  1%|█                                                                            | 99/6887 [31:20<52:10:35, 27.67s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1163987.pdf


  2%|█▎                                                                          | 122/6887 [36:55<28:28:44, 15.16s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1166393.pdf
A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1166398.pdf


  2%|█▋                                                                          | 155/6887 [50:18<59:38:26, 31.89s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1168385.pdf


  3%|█▉                                                                          | 178/6887 [59:48<53:45:43, 28.85s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1170592.pdf


  3%|█▉                                                                        | 186/6887 [1:02:50<59:01:45, 31.71s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1171333.pdf


  3%|██▍                                                                       | 224/6887 [1:18:06<47:54:05, 25.88s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1174730.pdf


  4%|██▊                                                                       | 264/6887 [1:38:16<61:20:07, 33.34s/it]

A FileDataError occurred: cannot open broken document in C:\Users\matia\OneDrive - Universidad del Pacífico\01-Medidas_emergencia_PE\01-DATA_PERU\04-DATA_DOCUMENTATION\Main_sample\downloaded_pdfs\pdf_1177078.pdf


  5%|███▊                                                                      | 353/6887 [2:21:34<43:40:38, 24.06s/it]


KeyboardInterrupt: 

In [None]:
# Derive the contract code by removing the `pdfs + '\pdf_'` prefix from each stored path,
# then discard the path column itself
contract_codes = pdf_texts_df['filename'].str.replace(pdfs+'\\pdf_', '', regex=False)
pdf_texts_df['n_cod_contrato'] = contract_codes
pdf_texts_df.drop(columns='filename', inplace=True)

In [None]:
# Folder that receives the dataframes written by this notebook
main_analysis_dfs = data_pro + r'\main_analysis_dfs'

# Create the directory (and any missing parents); exist_ok=True makes this a no-op
# when it is already there, without a separate existence check
os.makedirs(main_analysis_dfs, exist_ok=True)

In [None]:
# Show the failed downloads, the unreadable PDFs and the extracted texts, in that order
display(failed_downloads_df, broken_pdfs_df, pdf_texts_df)

In [None]:
# Write each result table to its own workbook inside the analysis folder
for workbook_name, results_frame in (
    (r'\pdf_texts.xlsx', pdf_texts_df),
    (r'\failed_downloads.xlsx', failed_downloads_df),
    (r'\broken_pdfs.xlsx', broken_pdfs_df),
):
    results_frame.to_excel(main_analysis_dfs + workbook_name, index = False)

# Preparing Data for Unit Price Extraction

In [17]:
# Reload the texts previously exported to pdf_texts.xlsx
pdf_texts_path = data_pro + r'\main_analysis_dfs\pdf_texts.xlsx'
extraction_df = pd.read_excel(pdf_texts_path)

In [24]:
# Preview the reloaded texts (rich display of the dataframe)
extraction_df

Unnamed: 0,text,extraction_type,n_cod_contrato
0,ee rey ela par ee res contrato n0012018hh se...,OCR,1147865.0
1,4 see . hospital peru ministerio. de salud ca...,OCR,1148734.0
2,es ieee a x : eae . be ap pe peru ministerio...,OCR,1149147.0
3,fox hospital . 7m peru ministerio de salud...,OCR,1149733.0
4,ey : ea we : hospital ee peru ministerio:de...,OCR,1149901.0
...,...,...,...
3935,nes peru ministeriode salud 8 licitacion publi...,OCR,2168959.0
3936,"patricia , piedra z on errod 0 i minsterio s...",OCR,2169593.0
3937,"see vi sli eea aes cit , iceministerio de rede...",OCR,2170324.0
3938,marina de guerra del peru nt direcion ejecutiv...,OCR,2170926.0


In [30]:
# Normalise the contract texts: fill missing values, lowercase, collapse whitespace and
# collapse repeated '.' / ',' characters.
normalised_text = (
    extraction_df['text']
    # The filled result is assigned back to the column below. A chained
    # `fillna(..., inplace=True)` on a column selection is deprecated and does not
    # update the dataframe when pandas Copy-on-Write is enabled.
    .fillna('NA')
    .astype(str)  # cells that Excel stored as numbers would otherwise break the string steps
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)  # any run of whitespace -> one space
    .str.strip()
    .str.replace(r'(\.|\,)\1+', r'\1', regex=True)  # '...' -> '.', ',,' -> ','
    .str.strip()
)
extraction_df['text'] = normalised_text

# errors='ignore' keeps the cell re-runnable once the column has already been dropped
extraction_df = extraction_df.drop(columns='extraction_type', errors='ignore')

In [31]:
# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def create_chunks(text, n, tokenizer):
    """Yield successive token chunks of roughly `n` tokens from `text`.

    Each chunk preferably ends where the decoded text ends with "." or a newline,
    searching backwards from 1.1 * n tokens down to (but excluding) 0.9 * n tokens.
    When no such position exists in that window, the chunk is exactly `n` tokens long
    (or whatever is left at the end of the text).

    Parameters
    ----------
    text : str
        Text to split.
    n : int
        Target chunk size in tokens; must be at least 1.
    tokenizer
        Object exposing `encode(text) -> list of tokens` and `decode(tokens) -> str`.

    Yields
    ------
    list
        Consecutive slices of the token list; concatenated they reproduce `tokenizer.encode(text)`.

    Raises
    ------
    ValueError
        If `n` is smaller than 1 (raised when iteration starts). Such a value would
        never advance through the tokens and the generator would not terminate.
    """
    if n < 1:
        raise ValueError("n must be at least 1 token")

    tokens = tokenizer.encode(text)
    shortest = int(0.9 * n)  # chunks shorter than or equal to this are not considered a sentence cut
    longest = int(1.1 * n)   # upper bound of the sentence-end search window

    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.9 * n and 1.1 * n tokens
        j = min(i + longest, len(tokens))
        while j > i + shortest:
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith((".", "\n")):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == i + shortest:
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j

In [32]:
# Set up the tokenizer used for gpt-3.5-turbo
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

prompt = []

for row_label in tqdm(range(len(extraction_df['text']))):
    contract_text = extraction_df.loc[row_label, 'text']
    decoded_chunks = [tokenizer.decode(piece) for piece in create_chunks(contract_text, 1500, tokenizer)]
    # Texts split into two or more chunks keep only the first and the last one;
    # shorter texts are used whole
    if len(decoded_chunks) < 2:
        kept_chunks = decoded_chunks
    else:
        kept_chunks = [decoded_chunks[0], decoded_chunks[-1]]
    prompt.append(' '.join(kept_chunks))
extraction_df['text'] = prompt

100%|██████████████████████████████████████████████████████████████████████████████| 3940/3940 [01:31<00:00, 43.11it/s]


In [33]:
# Terminate every prompt with the '---->' marker.
# Rows that already end with it are skipped, so re-running this cell does not stack
# a second marker onto the prompts.
PROMPT_END_MARKER = '---->'
lacks_marker = ~extraction_df['text'].str.endswith(PROMPT_END_MARKER, na=False)
extraction_df.loc[lacks_marker, 'text'] = extraction_df.loc[lacks_marker, 'text'] + PROMPT_END_MARKER

In [34]:
# Keep a key that is already exported in the environment; only fall back to the
# (empty) placeholder when the variable is missing. Assigning unconditionally would
# wipe a correctly configured key. Do not commit a real key in this cell.
os.environ.setdefault('OPENAI_API_KEY', "")

In [35]:
# Hand the key to the OpenAI client and fail fast when none is configured,
# instead of discovering it one failed request at a time
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is empty; set it in the environment before querying the model.")

In [36]:
# Ask the fine-tuned model for the unit prices of every contract text and store the
# reply in the 'gpt_unit_prices' column of `extraction_df`.
MODEL_NAME = "ft:gpt-3.5-turbo-0613:personal::7wfSDkB5"
SYSTEM_PROMPT = "Dado el texto extraido de un contrato, extrae los precios unitarios de los bienes comprados."
MAX_ATTEMPTS = 5  # total tries per contract before the last error is re-raised


def _request_unit_prices(user_content):
    """Send one contract text to the fine-tuned model and return the generated message."""
    completion = openai.ChatCompletion.create(
        model=MODEL_NAME,
        request_timeout = 100,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content}
        ]
    )
    return completion.choices[0].message['content']


def _retry_notice(error):
    """Return the log sentence describing `error`, most specific exception type first."""
    if isinstance(error, openai.error.RateLimitError):
        return "Rate limit error."
    if isinstance(error, openai.error.ServiceUnavailableError):
        return "Service Unavailable error."
    if isinstance(error, openai.error.APIError):
        return "API error occurred."
    # requests.Timeout derives from OSError, so it has to be tested before OSError
    if isinstance(error, requests.Timeout):
        return f"Timeout error occurred: {error}."
    if isinstance(error, OSError):
        return f"Connection error occurred: {error}."
    return f"An unexpected error occurred: {error}."


def _request_with_retries(user_content, max_attempts=MAX_ATTEMPTS):
    """Call `_request_unit_prices`, retrying failed requests with a growing pause.

    The pause is the error's `retry_after` attribute when it has one, otherwise
    1, 2, 4, ... seconds. The error of the last attempt is re-raised.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return _request_unit_prices(user_content)
        except Exception as e:
            if attempt == max_attempts:
                raise
            retry_time = getattr(e, 'retry_after', None)
            if retry_time is None:
                retry_time = 2 ** (attempt - 1)
            print(f"{_retry_notice(e)} Retrying in {retry_time} seconds...")
            time.sleep(retry_time)


# Create the result column up front so rows answered in an earlier (interrupted) run
# can be recognised and skipped instead of being requested again.
if 'gpt_unit_prices' not in extraction_df.columns:
    extraction_df['gpt_unit_prices'] = None

# Iterate through each row in DataFrame
for index, row in tqdm(extraction_df.iterrows(), total=extraction_df.shape[0]):
    if pd.notna(extraction_df.at[index, 'gpt_unit_prices']):
        continue  # already answered; drop the column to force a fresh run

    time.sleep(2)  # pause between consecutive requests
    extraction_df.at[index, 'gpt_unit_prices'] = _request_with_retries(row['text'])

# Print the updated DataFrame to check if it worked
extraction_df

 68%|███████████████████████████████████████████████████▊                        | 2683/3940 [2:05:35<55:01,  2.63s/it]

API error occurred. Retrying in 1 seconds...


100%|████████████████████████████████████████████████████████████████████████████| 3940/3940 [3:02:26<00:00,  2.78s/it]


Unnamed: 0,text,n_cod_contrato,gpt_unit_prices
0,ee rey ela par ee res contrato n0012018hh serv...,1147865.0,8.0; 16.0; 16.0; 8.0; 16.2; 7.5; 7.5; 16.2; 7...
1,4 see . hospital peru ministerio. de salud cay...,1148734.0,0.28; 0.25; 0.22; 0.12; 0.12; 0.35; 0.35; 0.3...
2,es ieee a x : eae . be ap pe peru ministerio i...,1149147.0,5276.66; 3650.0 \n\n###\n\n
3,fox hospital . 7m peru ministerio de salud cay...,1149733.0,4.667 \n\n###\n\n
4,ey : ea we : hospital ee peru ministerio:de sa...,1149901.0,0.3; 4.0; 1.0; 1.0; 1.0; 3.35; 2.3; 2.5; 1.0;...
...,...,...,...
3935,nes peru ministeriode salud 8 licitacion publi...,2168959.0,\n\n###\n\n
3936,"patricia , piedra z on errod 0 i minsterio ss ...",2169593.0,6.726; 6.785 \n\n###\n\n
3937,"see vi sli eea aes cit , iceministerio de rede...",2170324.0,135.0; 48.0; 67.0; 75.0; 67.0 \n\n###\n\n
3938,marina de guerra del peru nt direcion ejecutiv...,2170926.0,0.08; 2.42; 0.38; 2.2; 0.84 \n\n###\n\n


In [37]:
# Save the texts together with the model's unit prices. The path is built from `data_pro`,
# like the path used to reload pdf_texts.xlsx, so this cell does not depend on
# `main_analysis_dfs` having been defined by the directory-creation cell.
extraction_df.to_excel(data_pro + r'\main_analysis_dfs\extraction_df.xlsx', index = False)