# Brazilian congress scraper

This notebook downloads data on the votes held in the Brazilian Chamber of Deputies in 2024 and saves it to the `total_data_2024.csv` file.

It uses the Dados Abertos API provided by the Chamber of Deputies (`https://dadosabertos.camara.leg.br/api/v2`).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime
import calendar
import os

In [3]:
# Function to get voting data from the Chamber API by month
def get_votes_by_month(year=2024, month=1, timeout=60):
    """Fetch every voting session registered in one calendar month.

    The API is queried once per day (``dataInicio`` equal to ``dataFim``),
    from the first through the last day of the month inclusive, and the
    ``dados`` lists of all successful responses are concatenated.

    Args:
        year: Year to query.
        month: Month number, 1-12.
        timeout: Seconds to wait for each HTTP response before giving up on
            that day.

    Returns:
        list: The voting records (as returned in the ``dados`` field of each
        response) for the whole month, in the order the days were queried.
        Days whose request fails or returns a non-200 status are reported on
        stdout and skipped.
    """
    # monthrange returns (weekday_of_first_day, number_of_days_in_month)
    last_day = calendar.monthrange(year, month)[1]
    dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, last_day + 1)]
    data = []

    # Every date is queried, including the last day of the month.
    for date in dates:
        url = f"https://dadosabertos.camara.leg.br/api/v2/votacoes?dataInicio={date}&dataFim={date}&ordem=DESC&ordenarPor=dataHoraRegistro"

        print(f"Querying API: {url}")
        try:
            # Without a timeout a stalled connection would block the scrape forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error getting data for {date}: {exc}")
            continue

        if response.status_code == 200:
            temp_data = response.json()['dados']
            print(f"Found {len(temp_data)} votes for {date}")
            data += temp_data
        else:
            print(f"Error getting data for {calendar.month_name[month]}/{year}: {response.status_code}")
    return data

In [19]:
# Function to get details of a specific vote
def get_votes(vote_id, timeout=60):
    """Fetch the individual votes cast in one voting session.

    Args:
        vote_id: Identifier of the voting session, interpolated into the
            ``/votacoes/{vote_id}/votos`` endpoint URL.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        list: The ``dados`` field of the JSON response on HTTP 200. An empty
        list (after printing the error) on any other status code or when the
        request itself fails.
    """
    url = f"https://dadosabertos.camara.leg.br/api/v2/votacoes/{vote_id}/votos"
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a bad status code: report and return no votes.
        print(f"Error getting votes for voting {vote_id}: {exc}")
        return []

    if response.status_code == 200:
        return response.json()['dados']

    print(f"Error getting votes for voting {vote_id}: {response.status_code}")
    return []

In [24]:
def fetch_voting_data(year, month, output_dir='/chamber_results_2024/'):
    """Download one month of votes and save them as a CSV file.

    For every voting session returned by ``get_votes_by_month`` the individual
    votes are fetched with ``get_votes`` and flattened into one row per vote.
    The rows are written to
    ``<output_dir>/<year>_<MM>_<MonthName>/complete_data.csv``.

    Args:
        year: Year to download.
        month: Month number, 1-12.
        output_dir: Folder under which the per-month folder is created.

    Returns:
        None. When no votes are found a message is printed and nothing is
        written.

    Columns written: ``id_votacao``, ``data``, ``descricao``, ``deputado_id``,
    ``deputado_nome``, ``partidoSigla``, ``uf``, ``voto``.
    """
    voting_data = get_votes_by_month(year, month)
    all_votes = []

    for i, voting in enumerate(voting_data):
        vote_id = voting['id']
        # Chained with `or` so that a key that is present but null/empty still
        # falls through to the next candidate.
        # NOTE(review): assumes 'proposicaoObjeto' may be null in the API payload - confirm.
        description = (
            voting.get('proposicaoObjeto')
            or voting.get('descricao')
            or 'No description'
        )
        date = voting.get('data', 'Date not available')

        print(f"  [{i+1}/{len(voting_data)}] {date}")

        votes = get_votes(vote_id)
        if not votes:
            continue

        for vote in votes:
            # A vote without deputy information yields empty deputy columns
            # instead of raising on a missing/null 'deputado_' entry.
            deputy = vote.get('deputado_') or {}
            all_votes.append({
                'id_votacao': vote_id,
                'data': date,
                'descricao': description,
                'deputado_id': deputy.get('id'),
                'deputado_nome': deputy.get('nome'),
                'partidoSigla': deputy.get('siglaPartido'),
                'uf': deputy.get('siglaUf'),
                'voto': vote.get('tipoVoto')
            })

    df_votes = pd.DataFrame(all_votes)
    # If there are no votes, end the function
    if df_votes.empty:
        print(f"No votes recorded for {calendar.month_name[month]}/{year}")
        return None

    # Folder to save the month's results
    month_folder = os.path.join(output_dir, f'{year}_{month:02d}_{calendar.month_name[month]}')
    os.makedirs(month_folder, exist_ok=True)

    # Save raw data
    df_votes.to_csv(os.path.join(month_folder, 'complete_data.csv'), index=False)

    return None


In [25]:
# Download and store each month of 2024 in turn: 1 = January ... 12 = December
for month in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12):
    fetch_voting_data(year=2024, month=month)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  [804/958] 2024-06-19
  [805/958] 2024-06-19
  [806/958] 2024-06-19
  [807/958] 2024-06-19
  [808/958] 2024-06-19
  [809/958] 2024-06-19
  [810/958] 2024-06-19
  [811/958] 2024-06-19
  [812/958] 2024-06-19
  [813/958] 2024-06-19
  [814/958] 2024-06-19
  [815/958] 2024-06-19
  [816/958] 2024-06-19
  [817/958] 2024-06-19
  [818/958] 2024-06-19
  [819/958] 2024-06-19
  [820/958] 2024-06-19
  [821/958] 2024-06-19
  [822/958] 2024-06-19
  [823/958] 2024-06-19
  [824/958] 2024-06-19
  [825/958] 2024-06-19
  [826/958] 2024-06-19
  [827/958] 2024-06-19
  [828/958] 2024-06-25
  [829/958] 2024-06-25
  [830/958] 2024-06-25
  [831/958] 2024-06-25
  [832/958] 2024-06-25
  [833/958] 2024-06-25
  [834/958] 2024-06-25
  [835/958] 2024-06-25
  [836/958] 2024-06-25
  [837/958] 2024-06-25
  [838/958] 2024-06-25
  [839/958] 2024-06-25
  [840/958] 2024-06-25
  [841/958] 2024-06-25
  [842/958] 2024-06-25
  [843/958] 2024-06-25
  [844/958] 202

In [28]:
#load monthly data
def load_data_month(month, year=2024, base_dir='/chamber_results_2024/'):
    """Load the CSV that was saved for one month.

    The path is built the same way ``fetch_voting_data`` writes it:
    ``<base_dir>/<year>_<MM>_<MonthName>/complete_data.csv``.

    Args:
        month: Month number, 1-12.
        year: Year used in the folder name.
        base_dir: Folder that contains the per-month folders.

    Returns:
        pandas.DataFrame: Contents of that month's ``complete_data.csv``.

    Raises:
        FileNotFoundError: If no file exists for the requested month.
    """
    # Derive the folder from the arguments rather than relying on a variable
    # left over from another cell.
    month_folder = os.path.join(base_dir, f'{year}_{month:02d}_{calendar.month_name[month]}')
    return pd.read_csv(os.path.join(month_folder, 'complete_data.csv'))
#joining monthly data
def total_data():
    """Concatenate the monthly CSV files of months 1-12 into one DataFrame.

    Months whose file is missing are reported on stdout and skipped.

    Returns:
        pandas.DataFrame: All available months stacked in month order with a
        fresh 0..n-1 index, or an empty DataFrame when no month could be
        loaded.
    """
    monthly_frames = []
    for month in range(1, 13):
        try:
            monthly_frames.append(load_data_month(month))
        except FileNotFoundError:
            print(f"File not found for the month {month}")

    # pd.concat raises on an empty list, so keep the "nothing loaded" case explicit.
    if not monthly_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(monthly_frames, ignore_index=True)


#saving total_data
data = total_data()
# NOTE(review): the monthly files are written under '/chamber_results_2024/'
# while this combined file goes to '/chamber_results_new_2024' - confirm that
# the different folder is intended.
# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before writing.
os.makedirs('/chamber_results_new_2024', exist_ok=True)
data.to_csv(os.path.join('/chamber_results_new_2024','total_data_2024.csv'), index=False)

File not found for the month 1
