In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.diputados.gob.mx/LeyesBiblio/index.htm"
response = requests.get(url)

# Check if request was successful

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
else:
    print('Failed to retrieve the webpage')


In [2]:
# finding all the tr elements in the soup and printing its length

tres = soup.find_all('tr')
print(len(tres))

316


In [3]:
from bs4 import BeautifulSoup

tr = tres[0]

def get_info(html_element):
    info_dict = {}
    
    # Get the index value
    index = html_element.find('font', color="#595843").get_text(strip=True)
    info_dict['Index'] = index
    
    # Get the title and its link
    title_tag = html_element.find('a')
    if title_tag:  # Check if the title tag exists
        title = title_tag.get_text(strip=True)
        title_link = title_tag['href']
    else:
        title = None
        title_link = None

    info_dict['Title'] = title
    info_dict['Title Link'] = title_link
    
    # Get the publication date
    pub_dates = [p.get_text(strip=True) for p in html_element.find_all('p') if 'DOF' in p.get_text()]
    if pub_dates:
        publication_date = pub_dates[0].split()[-1]
    else:
        publication_date = None

    info_dict['Publication Date'] = publication_date
    
    # Get the links for the PDF, MOV PDF, and WORD formats
    links = html_element.find_all('a')
    link_dict = {}
    for link in links:
        link_text = link.get_text(strip=True)
        if "PDF" == link_text:  # Desktop PDF
            link_dict['PDF Desktop'] = link['href']
        elif "PDF Android/iOS" in link_text:  # Mobile PDF
            link_dict['PDF Mobile'] = link['href']
        elif "WORD" in link_text:  # Word link
            link_dict['WORD'] = link['href']
            
    info_dict['Links'] = link_dict
    
    return info_dict

# Test the function with your HTML
html_str = tr.prettify()  # Insert your HTML string here
soup = BeautifulSoup(html_str, 'html.parser')
info_dict = get_info(soup)
print(info_dict)


{'Index': 'No.', 'Title': None, 'Title Link': None, 'Publication Date': None, 'Links': {}}


In [5]:
all_info = []
failed = []

for tr in tres:
    try:
        info_dict = get_info(tr)
        all_info.append(info_dict)
    except:
        failed.append(tr)
        continue
    

In [6]:
notitle = 0
for doc in all_info:
    title = doc['Title']
    if title:
        print(title)
    else:
        notitle += 1
print("Witout title: ---- \n")
print(notitle)

CONSTITUCIÓN Política de los Estados 
			Unidos Mexicanos
CÓDIGO Civil Federal
CÓDIGO de 
			Comercio
CÓDIGO de Justicia Militar
CÓDIGO Federal de Procedimientos Civiles
CÓDIGO Fiscal de la Federación
CÓDIGOMilitar de Procedimientos Penales
CÓDIGO Nacional de Procedimientos Civiles y Familiares
CÓDIGO Nacional 
			de Procedimientos Penales
CÓDIGO Penal Federal
ESTATUTO de Gobierno del Distrito Federal
IMPUESTO sobre Servicios Expresamente Declarados de Interés Público 
			por Ley, en los que Intervengan Empresas Concesionarias de Bienes 
			del Dominio Directo de la Nación(LEY que establece, reforma y 
			adiciona las disposiciones relativas a diversos impuestos)
LEY Aduanera
LEY Agraria
LEY de Adquisiciones, Arrendamientos 
			y Servicios del Sector Público
LEY de Aeropuertos
LEY de Aguas Nacionales
LEY de Ahorro y Crédito Popular
LEY de Amnistía
LEY de Amnistía
LEY de Amparo, 
			Reglamentaria de los artículos 103 y 107 de la Constitución Política 
			de los Estados Unidos Mexicanos


In [7]:
wl = 0
non = 0

for info in all_info:
    if info['Links']:
        print(info['Links']['PDF Desktop'])
        wl += 1
    else:
        non += 1

print("With links: ---- \n")
print(wl)
print("Without links: ---- \n")
print(non)

pdf/CPEUM.pdf
pdf/2_110121.pdf
pdf/CCom.pdf
pdf/CJM.pdf
pdf/CFPC.pdf
pdf/CFF.pdf
pdf/CMPP.pdf
pdf/CNPCF.pdf
pdf/CNPP.pdf
pdf/CPF.pdf
pdf/10_270614.pdf
pdf/79.pdf
pdf/LAdua.pdf
pdf/LAgra.pdf
pdf/14_200521.pdf
pdf/LAero.pdf
pdf/LAN.pdf
pdf/17_200521.pdf
pdf/LAmn_220420.pdf
pdf/19.pdf
pdf/LAmp.pdf
pdf/21.pdf
pdf/LAREFAM.pdf
pdf/LASoc.pdf
pdf/LAPP_150618.pdf
pdf/24_171215.pdf
pdf/LAC.pdf
pdf/LAAT.pdf
pdf/LBOGM.pdf
pdf/LCEC_120419.pdf
pdf/27_011220.pdf
pdf/238.pdf
pdf/LCJPJF.pdf
pdf/28.pdf
pdf/LCM.pdf
pdf/30.pdf
pdf/LCID_061120.pdf
pdf/31_300118.pdf
pdf/235_030621.pdf
pdf/LDSCA.pdf
pdf/33.pdf
pdf/LDFEFM.pdf
pdf/252.pdf
pdf/LEMEFAM.pdf
pdf/LEN.pdf
pdf/LEG_110814.pdf
pdf/246.pdf
pdf/35.pdf
pdf/36_200521.pdf
pdf/LFEA_200521.pdf
pdf/LFRCF_200521.pdf
pdf/LFCC_200120.pdf
pdf/LFIV_230518.pdf
pdf/LFLL_190118.pdf
pdf/LFAAR.pdf
pdf/69_200521.pdf
pdf/LHidro_200521.pdf
pdf/LICal_010720.pdf
pdf/LIF_2023.pdf
pdf/LIH.pdf
pdf/LIC.pdf
pdf/LISF.pdf
pdf/44_150618.pdf
pdf/LANSI.pdf
pdf/LCMM.pdf
pdf/LCFE.pdf
pd

In [9]:
# Downloading the PDFs

import os
import requests

base = "https://www.diputados.gob.mx/LeyesBiblio/"

# Creating the downloads folder if it doesn't exist

if not os.path.exists('downloads'):
    os.makedirs('downloads')

for inf in all_info:
    if inf['Links']:
        link = inf['Links']['PDF Desktop']
        download_url = base + link
        file_name = link.split('/')[-1]
        # If the file exists within the downloads folder, skip it
        if os.path.exists(os.path.join('downloads', file_name)):
            print("File already exists:%s" % file_name)
            continue
        print("Downloading file:%s" % file_name)
        r = requests.get(download_url)
        # store the file within the downloads folder in this directory
        with open(os.path.join('downloads', file_name), 'wb') as f:
            f.write(r.content)
    else:
        print("No link found for %s" % inf['Title'])
        continue

No link found for None
Downloading file:CPEUM.pdf
Downloading file:2_110121.pdf
Downloading file:CCom.pdf
Downloading file:CJM.pdf
Downloading file:CFPC.pdf
Downloading file:CFF.pdf
Downloading file:CMPP.pdf
Downloading file:CNPCF.pdf
Downloading file:CNPP.pdf
Downloading file:CPF.pdf
Downloading file:10_270614.pdf
Downloading file:79.pdf
Downloading file:LAdua.pdf
Downloading file:LAgra.pdf
Downloading file:14_200521.pdf
Downloading file:LAero.pdf
Downloading file:LAN.pdf
Downloading file:17_200521.pdf
Downloading file:LAmn_220420.pdf
Downloading file:19.pdf
Downloading file:LAmp.pdf
Downloading file:21.pdf
Downloading file:LAREFAM.pdf
Downloading file:LASoc.pdf
Downloading file:LAPP_150618.pdf
Downloading file:24_171215.pdf
Downloading file:LAC.pdf
Downloading file:LAAT.pdf
Downloading file:LBOGM.pdf
Downloading file:LCEC_120419.pdf
Downloading file:27_011220.pdf
Downloading file:238.pdf
Downloading file:LCJPJF.pdf
Downloading file:28.pdf
Downloading file:LCM.pdf
Downloading file:30.

In [11]:
# Creating a list object to store the path to each file
# in the downloads folder

# Importing the os module
import os

# Creating a list object to store the path to each file
# in the downloads folder
file_paths = []

# Looping through the files in the downloads folder
for file in os.listdir('downloads'):
    # Appending the path of each file to the file_paths list
    file_paths.append(os.path.join('downloads', file))

# Printing the file_paths list
print(file_paths)

['downloads/LOCFCRL_060120.pdf', 'downloads/LFAR.pdf', 'downloads/LISSSTE.pdf', 'downloads/LOSCM.pdf', 'downloads/LFRPE_200521.pdf', 'downloads/CNPCF.pdf', 'downloads/LFFPMN_130420.pdf', 'downloads/28.pdf', 'downloads/LOAM.pdf', 'downloads/LHidro_200521.pdf', 'downloads/19.pdf', 'downloads/LAmn_220420.pdf', 'downloads/216_200521.pdf', 'downloads/LGT.pdf', 'downloads/LIMJ.pdf', 'downloads/LIGIEx.pdf', 'downloads/LFVV.pdf', 'downloads/LNSIJPA.pdf', 'downloads/LAero.pdf', 'downloads/LICal_010720.pdf', 'downloads/LOPGJDF.pdf', 'downloads/64_090318.pdf', 'downloads/LRArt6_MDR_300518.pdf', 'downloads/31_300118.pdf', 'downloads/LGDFS.pdf', 'downloads/LGS.pdf', 'downloads/LNMASCMP_200521.pdf', 'downloads/LFDA.pdf', 'downloads/LFRM.pdf', 'downloads/105.pdf', 'downloads/LSS.pdf', 'downloads/44_150618.pdf', 'downloads/LFTAIP.pdf', 'downloads/LIC.pdf', 'downloads/LRASCAP_200521.pdf', 'downloads/LORCME_200521.pdf', 'downloads/268.pdf', 'downloads/LFTSE.pdf', 'downloads/202.pdf', 'downloads/LSNIEG_2

In [22]:
import pdfplumber
n = 10
my_pdf_path = file_paths[n]
my_pdf = pdfplumber.open(my_pdf_path)

In [23]:
my_pdf

<pdfplumber.pdf.PDF at 0x7fea2f857310>

In [24]:
info = my_pdf.doc.info[0]  # Access the first item in the list

decoded_info = {}
for key, value in info.items():
    if isinstance(value, bytes):
        try:
            decoded_info[key] = value.decode('utf-8')  # Try decoding with utf-8
        except UnicodeDecodeError:
            decoded_info[key] = value.decode('latin-1')  # Fallback to latin-1 if utf-8 fails
    else:
        decoded_info[key] = value

print(decoded_info)


{'Author': 'Cámara de Diputados del H. Congreso de la Unión', 'CreationDate': "D:20180919124412-05'00'", 'Creator': 'Microsoft® Office Word 2007', 'ModDate': "D:20180919124431-05'00'", 'Producer': 'Microsoft® Office Word 2007', 'Title': 'Ley de Amnistía'}


In [28]:
import pdfplumber

def extract_pdf_info(pdf):
    pdf_info = {
        "author": pdf.metadata.get("Author", ""),
        "title": pdf.metadata.get("Title", ""),
        "subject": pdf.metadata.get("Subject", ""),
        "keywords": pdf.metadata.get("Keywords", ""),
        "producer": pdf.metadata.get("Producer", ""),
        "created_date": pdf.metadata.get("CreationDate", ""),
        "modified_date": pdf.metadata.get("ModDate", ""),
        "page_count": len(pdf.pages),
    }
    
    page_texts = []
    
    for page_number in range(len(pdf.pages)):
        page = pdf.pages[page_number]
        text = page.extract_text()
        page_texts.append(text)
    
    pdf_info["page_texts"] = page_texts
    
    return pdf_info

# Example usage:
pdf_path = "codigocivilfederal.pdf"

with pdfplumber.open(pdf_path) as pdf:
    pdf_info = extract_pdf_info(pdf)

# Access the PDF information and page texts
# print("PDF Information:")
# for key, value in pdf_info.items():
#     print(f"{key}: {value}")

# print("Text Content for Each Page:")
# for page_number, page_text in enumerate(pdf_info["page_texts"]):
#     print(f"Page {page_number + 1} text:")
#     print(page_text)


In [30]:
size = 0
path = 'downloads/'

for file in os.listdir(path):
    size += os.path.getsize(os.path.join(path, file))

print('Size of downloads folder: ' + str(size / 1000000) + ' MB')

Size of downloads folder: 252.816344 MB
