In [1]:
#pip install requests
#pip install beautifulsoup4
#pip install pandas

In [2]:
import requests
import pandas as pd
import time
import os
import numpy as np

In [3]:
# Location of the CSV export that lists the API endpoints
file_path = "orgaos_interlegis(2025) - api_url.csv"

# Parse the whole file into a DataFrame
api_url = pd.read_csv(file_path)

# The endpoints are expected in the first column (change the position if the
# sheet layout differs); empty cells are skipped
api_urls = (
    api_url
    .iloc[:, 0]
    .dropna()
    .to_list()
)


In [4]:
def fetch_data(api_url, timeout=30):
    """Download one API page and return its decoded JSON body.

    Parameters
    ----------
    api_url : str
        URL to request with HTTP GET.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a single unresponsive server
        would block the crawl indefinitely.

    Returns
    -------
    object or None
        The parsed JSON payload, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        The failure reason is printed rather than raised so that callers can
        carry on with the next URL.
    """
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx answers into exceptions
        return response.json()
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller decide
        print(f"Failed to fetch data from {api_url}: {e}")
        return None

In [5]:
def save_data(data, api_url, page_number):
    """Append one page of results to the per-city CSV file.

    Parameters
    ----------
    data : list of dict or None
        Records of the page. ``None`` or an empty list means there is nothing
        to write; a message is printed and the function returns.
    api_url : str
        URL the records came from. The city name is the second dot-separated
        component of the URL (``pimenteiras`` for
        ``https://sapl.pimenteiras.pi.leg.br/...``).
    page_number : int
        Page the records belong to; only used in the "nothing to save" message.

    Notes
    -----
    The file is ``SAPL-Cidades_2025/<city>_data.csv``. The folder is created
    when missing. The header row is written only when the file does not exist
    yet or is empty, so repeated calls build one well-formed CSV.
    """
    if data is None or len(data) == 0:
        print(f"No data to save for {api_url}, page {page_number}.")
        return

    city_name = api_url.split(".")[1]
    output_dir = "SAPL-Cidades_2025"
    os.makedirs(output_dir, exist_ok=True)  # first call on a fresh checkout
    city_file = f"{output_dir}/{city_name}_data.csv"

    city_df = pd.DataFrame(data)
    # A zero-byte file has no header yet, so treat it like a missing file
    has_rows = os.path.exists(city_file) and os.path.getsize(city_file) > 0
    city_df.to_csv(
        city_file,
        mode="a" if has_rows else "w",
        header=not has_rows,
        index=False,
    )

In [6]:
def fetch_all_data(api_url, start_page=1, progress_file=None, url_index=None):
    """Follow the pagination links of one endpoint and collect every record.

    Parameters
    ----------
    api_url : str
        URL of the first page to request (on resume this may already be a
        ``?page=N`` URL).
    start_page : int, optional
        Number shown for the first requested page; later pages count up
        from it.
    progress_file : str, optional
        File that receives the resume checkpoint. When omitted, the notebook
        variable ``progress_file`` is used if it exists; otherwise no
        checkpoint is written.
    url_index : int, optional
        Position of the endpoint in the URL list, stored in the checkpoint.
        When omitted, the notebook loop variable ``i`` is used if it exists.

    Returns
    -------
    list
        All records gathered from the ``"results"`` entry of each page, in
        page order. Collection stops early (keeping what was gathered) when a
        page cannot be fetched or does not have the expected layout.

    Notes
    -----
    Every fetched page, including the last one, is handed to ``save_data``
    with its own page number. After a page that has a successor, the
    checkpoint is written as three lines: URL index, next page number,
    next page URL.
    """
    # Backward compatibility: earlier versions read these two values straight
    # from notebook globals, so fall back to them when not passed explicitly.
    if progress_file is None:
        progress_file = globals().get("progress_file")
    if url_index is None:
        url_index = globals().get("i")

    all_data = []
    next_url = api_url
    page_number = start_page
    while next_url:
        print(f"Processing page {page_number} of {api_url}...")
        data = fetch_data(next_url)
        if not data:  # request failed or empty payload: stop paging
            break

        results = data.get("results") if isinstance(data, dict) else None
        if results is None:
            print(f"Unexpected response layout from {next_url}; stopping pagination.")
            break

        all_data.extend(results)
        # Persist the page right away so an interruption loses at most one page
        save_data(results, api_url, page_number)

        links = (data.get("pagination") or {}).get("links") or {}
        next_url = links.get("next")
        if next_url:
            page_number += 1
            if progress_file is not None and url_index is not None:
                with open(progress_file, "w") as file:
                    file.write(f"{url_index}\n")  # which endpoint
                    file.write(f"{page_number}\n")  # page to request next
                    file.write(next_url)  # URL of that page
            time.sleep(0.2)  # short pause between requests to respect API limits
    return all_data

In [7]:
# Folder that receives one CSV per city (created on first run)
output_dir = "SAPL-Cidades_2025"
os.makedirs(output_dir, exist_ok=True)

# Combined result of this run; the per-city frames are collected in a list and
# concatenated once at the end instead of re-copying the frame on every city
final_df = pd.DataFrame()
city_frames = []

# Resume state: position in api_urls, page number, and the URL to request next.
# The checkpoint file holds these three values, one per line.
progress_file = "progress.txt"
last_processed_index = 0
last_processed_page = 1
last_processed_next_url = api_urls[0] if api_urls else None
if os.path.exists(progress_file):
    with open(progress_file, "r") as file:
        progress_data = [line.strip() for line in file]
    if len(progress_data) >= 3 and progress_data[2]:
        last_processed_index = int(progress_data[0])
        last_processed_page = int(progress_data[1])
        last_processed_next_url = progress_data[2]
    else:
        # A truncated checkpoint cannot be trusted; restart from the first URL
        print(f"Ignoring incomplete progress file {progress_file}; starting from the beginning.")

# Loop through each API URL starting from the checkpoint.
# NOTE: the loop variable must stay named `i` and the checkpoint path must stay
# in `progress_file`, because fetch_all_data reads both notebook variables.
for i in range(last_processed_index, len(api_urls)):
    api_url = api_urls[i]
    print(f"Processing {api_url}...")
    city_name = api_url.split(".")[1]  # second dot-separated URL component
    city_file = f"{output_dir}/{city_name}_data.csv"

    # When the checkpoint points at a later page of this city, the pages
    # fetched before the interruption exist only in the per-city file.
    # Snapshot them now so the final write below does not discard them.
    resuming_mid_city = last_processed_next_url != api_url
    previous_rows = None
    if resuming_mid_city and os.path.exists(city_file):
        try:
            previous_rows = pd.read_csv(city_file, dtype=str)
        except pd.errors.EmptyDataError:
            previous_rows = None

    city_data = fetch_all_data(last_processed_next_url, start_page=last_processed_page)

    if city_data:
        city_df = pd.DataFrame(city_data)
        if previous_rows is not None:
            city_df = pd.concat([previous_rows, city_df], ignore_index=True)
        city_df["City"] = city_name
        city_df["API_URL"] = api_url
        # Replaces the page-by-page file with the complete, labelled table
        city_df.to_csv(city_file, index=False)
        city_frames.append(city_df)

    # Move the resume state (in memory and on disk) to the next city
    if i < len(api_urls) - 1:
        last_processed_next_url = api_urls[i + 1]
        last_processed_page = 1
        with open(progress_file, "w") as file:
            file.write(f"{i + 1}\n")
            file.write(f"{last_processed_page}\n")
            file.write(last_processed_next_url)

if city_frames:
    final_df = pd.concat(city_frames, ignore_index=True)


Processing https://sapl.vilanovadosmartirios.ma.leg.br/api/materia/materialegislativa/...
Processing page 10 of https://sapl.vilanovadosmartirios.ma.leg.br/api/materia/materialegislativa/?page=10...
Processing https://sapl.pimenteiras.pi.leg.br/api/materia/materialegislativa/...
Processing page 1 of https://sapl.pimenteiras.pi.leg.br/api/materia/materialegislativa/...
Processing https://sapl.chapadadonorte.mg.leg.br/api/materia/materialegislativa/...
Processing page 1 of https://sapl.chapadadonorte.mg.leg.br/api/materia/materialegislativa/...
Processing https://sapl.borrazopolis.pr.leg.br/api/materia/materialegislativa/...
Processing page 1 of https://sapl.borrazopolis.pr.leg.br/api/materia/materialegislativa/...
Processing https://sapl.santaterezadotocantins.to.leg.br/api/materia/materialegislativa/...
Processing page 1 of https://sapl.santaterezadotocantins.to.leg.br/api/materia/materialegislativa/...
Processing https://sapl.carangola.mg.leg.br/api/materia/materialegislativa/...
Proc

In [8]:
# Merge every per-city CSV into one combined file
# List to store DataFrames from each CSV file
allcities = []

# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the merged file identical from run to run
for file_name in sorted(os.listdir('SAPL-Cidades_2025')):
    if not file_name.endswith(".csv"):  # ignore anything that is not a CSV
        continue
    file_path = os.path.join('SAPL-Cidades_2025', file_name)
    try:
        # dtype=str keeps every value exactly as written (no numeric coercion)
        onecity = pd.read_csv(file_path, dtype=str)
    except pd.errors.EmptyDataError:
        print(f"Skipping empty file {file_path}")
        continue
    allcities.append(onecity)

if allcities:
    # Concatenate all DataFrames into a single one and write it out
    merged_df = pd.concat(allcities, ignore_index=True)
    merged_df.to_csv("ProjCidades2025.csv", index = False)
else:
    # pd.concat raises on an empty list; report the situation instead
    merged_df = pd.DataFrame()
    print("No city CSV files found in SAPL-Cidades_2025; nothing to merge.")