We will use the [API](https://dadosabertos.camara.leg.br/swagger/api.html) to retrieve information about the deputies and their speeches. Then we can merge everything into a single dataset.

In [0]:
import requests
import pandas as pd
import numpy as np

import sys

## Retrieving the speeches

First, we must retrieve the ids of all deputies from the 52nd to the 56th legislature (the only legislatures with speeches currently available).

In [0]:
# Legislature numbers to download, 52 through 56 inclusive
legislatures = list(range(52, 57))

In [0]:
# Map each legislature number to the decoded JSON listing of its deputies
deputies_legislatures = dict()
for legislature_id in legislatures:

  # Ask the API for the deputies of this legislature
  response = requests.get(
      'https://dadosabertos.camara.leg.br/api/v2/deputados',
      params={'idLegislatura': legislature_id},
      timeout=60,  # do not hang forever if the server stops answering
  )
  # Fail here, with the HTTP status, instead of storing an error payload
  # that would only break later when its 'dados' key is read
  response.raise_for_status()

  # Keep the whole decoded payload; the deputy records are read from its
  # 'dados' key further down the notebook
  deputies_legislatures[legislature_id] = response.json()

In [0]:
# Replace each raw API payload with a compact list of
# [deputy id, party acronym, state acronym] rows.
for legislature_id, payload in deputies_legislatures.items():
  # An entry that is already a list was flattened by a previous run of this
  # cell; skipping it keeps the cell safe to re-run.
  if isinstance(payload, list):
    continue
  # Assigning to an existing key does not resize the dict, so it is safe
  # to do while iterating over .items()
  deputies_legislatures[legislature_id] = [
      [deputy['id'], deputy['siglaPartido'], deputy['siglaUf']]
      for deputy in payload['dados']
  ]

In [0]:
# Query window (start date, end date) for each year, listed newest first.
# 2019 is the only partial year: its window ends on 11 June.
years = {"2019": ('2019-01-01', '2019-06-11')}
years.update(
    (str(full_year), ('%d-01-01' % full_year, '%d-12-31' % full_year))
    for full_year in range(2018, 2002, -1)
)

---
Now that we have the ids of all deputies, we can query the speeches given between 1 January 2003 and 11 June 2019 (the end date set in `years`), one year at a time.

In [0]:
def _legislature_for_year(year):
  """Return the legislature number associated with ``year``.

  Legislature 52 is used for 2003-2006 and each following legislature for
  the next block of four years (53 for 2007-2010, and so on).
  """
  return 52 + (int(year) - 2003) // 4


def _fetch_speeches(deputy_id, start_date, end_date, timeout=60):
  """Download the speech records of one deputy for a date window.

  Parameters
  ----------
  deputy_id : deputy id as returned by the deputies listing.
  start_date, end_date : 'YYYY-MM-DD' strings sent as dataInicio / dataFim.
  timeout : seconds to wait for each HTTP request.

  Returns
  -------
  list of the records found under the 'dados' key of every page retrieved.

  Raises
  ------
  requests.HTTPError if the API answers with an error status.
  """
  url = 'https://dadosabertos.camara.leg.br/api/v2/deputados/' + str(deputy_id) + '/discursos'
  params = {
      'dataInicio': start_date,
      'dataFim': end_date,
      'ordenarPor': 'dataHoraInicio',
      'ordem': 'ASC',
  }
  records = []
  visited = set()
  while url and url not in visited:
    visited.add(url)  # guards against a "next" link that loops back
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    records.extend(payload['dados'])
    # NOTE(review): the API is expected to paginate and to expose the following
    # page as a link with rel == "next"; when no such link is present only
    # this page is read. Confirm against the API reference.
    url = next(
        (link['href'] for link in payload.get('links', []) if link.get('rel') == 'next'),
        None,
    )
    # Presumably the "next" href already carries the full query string,
    # so the parameters are not sent again -- TODO confirm.
    params = None
  return records


def _clean_speeches(raw):
  """Strip the leading header from each transcript and drop empty speeches.

  The text after the second " - " separator is kept. Rows where that rule
  yields nothing fall back to the text after the first blank line
  ("\\r\\n\\r\\n"). Rows still empty or null afterwards are dropped.
  The returned frame keeps the original row labels, with the rows handled
  by the first rule placed before the fallback rows.
  """
  # .str.get returns NaN when a row has too few pieces, so this also works
  # when no row at all contains the separator (or the frame is empty)
  primary = raw.assign(speech=raw['speech'].str.split(" - ", n=2).str.get(2))
  missing = primary['speech'].isna()
  print("There are %i null speeches" % int(missing.sum()))

  # Second chance for the rows the first rule could not handle
  unresolved = raw.loc[missing]
  fallback = unresolved.assign(
      speech=unresolved['speech'].str.split("\r\n\r\n", n=1).str.get(1)
  )

  # Only the speech column decides whether a row is kept
  cleaned = pd.concat([primary.loc[~missing], fallback])
  cleaned['speech'] = cleaned['speech'].replace('', np.nan)
  return cleaned.dropna(subset=['speech'])


# Download, clean and save the speeches of every year, one CSV file per year
for year, intervals in years.items():
  start_date, end_date = intervals
  # Years outside the known legislatures simply have no deputies to query
  deputies = deputies_legislatures.get(_legislature_for_year(year), [])
  total = len(deputies)

  # Rows of [deputy id, transcript, party, state]
  speeches = []
  for position, dep in enumerate(deputies, start=1):
    sys.stdout.write("\rGetting %i of %i" % (position, total))
    sys.stdout.flush()

    dep_id, party, state = dep
    speeches += [
        [dep_id, speech['transcricao'], party, state]
        for speech in _fetch_speeches(dep_id, start_date, end_date)
    ]

  sys.stdout.write("\rExtraction complete! %i speeches retrieved.\n" % len(speeches))

  # Create the dataset and clean the transcripts
  dataset = _clean_speeches(
      pd.DataFrame(speeches, columns=['id', 'speech', 'party', 'state'])
  )

  # Save dataset (the row labels are written as the first, unnamed column)
  dataset.to_csv('speeches'+str(year)+'.csv',  sep=';')

In [10]:
# Preview the file saved for 2004, indexed by the deputy id column
# (the second column of the file, after the unnamed row-label column)
pd.read_csv('speeches2004.csv', sep=';', index_col='id')

Unnamed: 0_level_0,Unnamed: 0,speech,party,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
73764,1,"Deputado Abelardo Lupion, recebo o pedido do P...",PFL,PR
73764,4,"77% da sua produção são exportados. Em 2003, r...",PFL,PR
73764,5,Também quero dar meu fraterno abraço ao querid...,PFL,PR
73764,7,e a piscicultura está nas mãos dos pequenos pr...,PFL,PR
73764,8,"Deputado Abelardo Lupion, a Presidência indaga...",PFL,PR
73764,9,"Fiscal, no valor de 44 milhões e 80 mil reais,...",PFL,PR
73764,10,Está claro.,PFL,PR
73764,11,"Obrigado, Deputado Abelardo Lupion.",PFL,PR
73764,13,quando vive o dia-a-dia no seu Estado - relati...,PFL,PR
73764,14,"e entendemos sua posição -, gostaríamos de dei...",PFL,PR
