In [1]:
import http.client
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import numpy as np


## Getting the main table

In [5]:
conn = http.client.HTTPSConnection("www.congreso.es")

payload = "_diputadomodule_idLegislatura=-1&_diputadomodule_genero=0&_diputadomodule_grupo=all&_diputadomodule_tipo=0&_diputadomodule_nombre=&_diputadomodule_apellidos=&_diputadomodule_formacion=all&_diputadomodule_filtroProvincias=%5B%5D&_diputadomodule_nombreCircunscripcion="

headers = {
    'accept': "application/json, text/javascript, */*; q=0.01",
    'accept-language': "en-US,en;q=0.6",
    'content-type': "application/x-www-form-urlencoded; charset=UTF-8",
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36",
    'x-requested-with': "XMLHttpRequest"
    }
conn.request("POST", "/busqueda-de-diputados?p_p_id=diputadomodule&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchDiputados&p_p_cacheability=cacheLevelPage", payload, headers)

res = conn.getresponse()

data = res.read()

info = data.decode("utf-8")

responseObject = json.loads(info)['data']


## Adding features to the table

In [None]:
# 1h 30m

codParlamentario = [x['codParlamentario'] for x in responseObject] # lista de codigos de los parlamentarios

for idx,codParlamentario in enumerate(codParlamentario):
    row = {}

    url = f'https://www.congreso.es/busqueda-de-diputados?p_p_id=diputadomodule&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_diputadomodule_mostrarFicha=true&codParlamentario={codParlamentario}&idLegislatura=XIV&mostrarAgenda=false'
    
    response = requests.get(url)
    
    soup = BeautifulSoup(response.text, 'html.parser')

    # adding item to dictionary: date of birth
    try:
        responseObject[idx]['dateOfBirth'] = soup.find_all('p')[1].text.strip()[10:39]
    except:
        responseObject[idx]['dateOfBirth'] = None

    # adding item to dictionary: legislatures
    try:
        if soup.find_all('p')[2].text.strip()[-1] == 's':
            responseObject[idx]['legislaturas'] = [x for x in soup.find_all('p')[2].text.strip()[15:-23].replace('y','').replace(',','').split(' ') if x != '']
        else:
            responseObject[idx]['legislaturas'] = [x for x in soup.find_all('p')[2].text.strip()[15:-12
            ].replace('y','').replace(',','').split(' ') if x != '']
    except: 
        responseObject[idx]['legislaturas'] = None

    # adding item to dictionary: email
    try:
        responseObject[idx]['mailto'] = soup.find('div',{'class':'email-dip'}).find('a')['href'][7:]
    except:
        responseObject[idx]['mailto'] = None
    
    # adding item to dictionary: imagen url
    try:
        responseObject[idx]['img'] = 'www.congreso.es' + soup.find('img',{'class':'card-img-top'})['src']
    except:
        responseObject[idx]['img'] = None

    # adding items to dictionary: social media
    try:
        social_media = soup.find('div',{'class':'rrss-dip'})
        for tag in social_media:
            if tag.find('img') != -1:
                responseObject[idx][tag.find('img')['alt']] = tag['href']
    except:
        pass

            


In [None]:
data_profile = pd.DataFrame(responseObject)

date = datetime.now().strftime("%d-%m-%Y %H%M%S") # get local time as string

filePath = f"../Diputados - Historical Data/Congreso Español {date} RAW DATA- Webscrape.csv"

data_profile.to_csv(filePath, index=False)

## Cleaning the table

In [None]:
data_profile = pd.DataFrame(responseObject)

for idx,row in enumerate(data_profile['legislaturas']):
    for legislatura in row:
        responseObject[idx][str('legislatura_')+legislatura] = legislatura

data_profile = pd.DataFrame(responseObject)

columns_to_drop = ['fchBaja','idLegislatura','legislaturas']

data_profile.drop(columns_to_drop, axis=1, inplace=True)

data_profile.grupo = data_profile.grupo.apply(lambda x: x[20:])

data_profile['dateOfBirth'] = data_profile['dateOfBirth'].apply(lambda x: pd.to_datetime(x))

data_profile['years'] = data_profile.dateOfBirth.apply(
    lambda x: 
        ((pd.Timestamp.now() - x) / np.timedelta64(1, 'Y')).__floor__()
        if type(x) == pd._libs.tslibs.timestamps.Timestamp
        else x)


## SAVE DATA AS A CSV FILE

In [None]:
date = datetime.now().strftime("%d-%m-%Y %H%M%S") # get local time as string

filePath = f"../Diputados - Historical Data/Congreso Español {date} - Webscrape.csv"

data_profile.to_csv(filePath, index=False)

## ANSWER BASIC QUESTIONS
* How many deputies are there?
* What is the sex rate
* What are the main political parties distribution?
* What is the age distribution
* Do younger deputies have more social media?
* What percentage uses which social media?


In [39]:
import plotly_express as px

In [25]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
data_profile = pd.read_csv('../Diputados - Historical Data/Congreso Español 22-08-2022 141639 - Webscrape.csv')

In [31]:
print(f'There are {data_profile.shape[0]} deputies')

There are 349 deputies


In [35]:
print(f'There are {data_profile[data_profile.genero==1].shape[0]} male deputies, and {data_profile[data_profile.genero==2].shape[0]} female deputies')

There are 201 male deputies, and 148 female deputies


In [110]:
grupo_mapped = {
    'Ciudadanos':'Derecha',
    'Confederal de Unidas Podemos-En Comú Podem-Galicia en Común':'Izquierda',
    'Euskal Herria Bildu':'Izquierda',
    'Mixto':'Derecha',
    'Plural':'Izquierda',
    'Popular en el Congreso':'',
    'Republicano':'Derecha',
    'Socialista':'Izquierda',
    'Socialista del Congreso':'Izquierda',
    'VOX':'Derecha',
    'Vasco (EAJ-PNV)':'Derecha'}

data_profile['partido'] = data_profile['grupo'].map(grupo_mapped)

data_profile.head()

Unnamed: 0,apellidos,formacion,apellidosNombre,genero,fchAlta,grupo,idCircunscripcion,nombreCircunscripcion,nombre,codParlamentario,dateOfBirth,legislaturas,mailto,img,twitter,facebook,personal-web,legislatura_IX,legislatura_X,legislatura_XI,legislatura_XII,legislatura_XIII,legislatura_XIV,instagram,legislatura_V,legislatura_VI,legislatura_VII,legislatura_VIII,linkedin,youtube,legislatura_II,legislatura_III,years,@@@@@@@@@@@@@,partido
0,Ábalos Meco,PSOE,"Ábalos Meco, José Luis",1,27/11/2019,Socialista,46,Valencia/València,José Luis,267,1959-12-09,"['IX', 'X', 'XI', 'XII', 'XIII', 'XIV']",,www.congreso.es/docu/imgweb/diputados/267_14.jpg,https://twitter.com/abalosmeco,https://www.facebook.com/jose.l.meco,http://joseluisabalos.wordpress.com/,IX,X,XI,XII,XIII,XIV,,,,,,,,,,62.0,Izquierda,Izquierda
1,Abascal Conde,Vox,"Abascal Conde, Santiago",1,20/11/2019,VOX,28,Madrid,Santiago,43,1976-04-14,"['XIII', 'XIV']",,www.congreso.es/docu/imgweb/diputados/43_14.jpg,https://twitter.com/Santi_ABASCAL,,,,,,,XIII,XIV,,,,,,,,,,46.0,Derecha,Derecha
2,Aceves Galindo,PSOE,"Aceves Galindo, José Luis",1,20/11/2019,Socialista,40,Segovia,José Luis,56,1970-05-21,"['XIII', 'XIV']",joseluis.aceves@congreso.es,www.congreso.es/docu/imgweb/diputados/56_14.jpg,https://twitter.com/JLAceves,https://www.facebook.com/joseluisaceves,,,,,,XIII,XIV,,,,,,,,,,52.0,Izquierda,Izquierda
3,Agirretxea Urresti,EAJ-PNV,"Agirretxea Urresti, Joseba Andoni",1,21/11/2019,Vasco (EAJ-PNV),20,Gipuzkoa,Joseba Andoni,99,1966-08-06,"['IX', 'X', 'XI', 'XII', 'XIII', 'XIV']",,www.congreso.es/docu/imgweb/diputados/99_14.jpg,https://twitter.com/jagirretxea,https://www.facebook.com/joseba.agirretxea,,IX,X,XI,XII,XIII,XIV,,,,,,,,,,56.0,Derecha,Derecha
4,Aizcorbe Torra,Vox,"Aizcorbe Torra, Juan José",1,25/11/2019,VOX,8,Barcelona,Juan José,151,1959-07-09,['XIV'],,www.congreso.es/docu/imgweb/diputados/151_14.jpg,https://twitter.com/JuanjoAizcorbe,,,,,,,,XIV,,,,,,,,,,63.0,Derecha,Derecha


In [111]:
data_profile.groupby(by=['grupo','formacion','partido']).count().reset_index().sort_values(['codParlamentario'], ascending = False)

Unnamed: 0,grupo,formacion,partido,apellidos,apellidosNombre,genero,fchAlta,idCircunscripcion,nombreCircunscripcion,nombre,codParlamentario,dateOfBirth,legislaturas,mailto,img,twitter,facebook,personal-web,legislatura_IX,legislatura_X,legislatura_XI,legislatura_XII,legislatura_XIII,legislatura_XIV,instagram,legislatura_V,legislatura_VI,legislatura_VII,legislatura_VIII,linkedin,youtube,legislatura_II,legislatura_III,years,@@@@@@@@@@@@@
24,Socialista,PSOE,Izquierda,95,95,95,95,95,95,95,95,95,95,89,95,82,31,8,7,13,20,23,73,95,15,0,0,0,4,1,2,0,0,95,95
19,Popular en el Congreso,PP,,85,85,85,85,85,85,85,85,84,85,74,85,73,28,3,15,29,28,37,51,85,25,1,4,8,15,7,1,0,0,84,85
27,VOX,Vox,Derecha,52,52,52,52,52,52,52,52,52,52,38,52,42,4,1,2,2,0,0,21,52,4,1,1,1,2,1,1,1,1,52,52
3,Confederal de Unidas Podemos-En Comú Podem-Gal...,UP,Izquierda,24,24,24,24,24,24,24,24,24,24,20,24,23,11,1,0,1,10,11,21,24,8,0,0,0,0,0,0,0,0,24,24
21,Republicano,ERC-S,Derecha,13,13,13,13,13,13,13,13,13,13,10,13,13,6,0,0,0,4,6,11,13,2,0,0,0,0,0,0,0,0,13,13
22,Socialista,PSC-PSOE,Izquierda,12,12,12,12,12,12,12,12,12,12,11,12,11,6,1,2,3,6,6,11,12,4,0,0,0,1,0,0,0,0,12,12
25,Socialista,PsdeG-PSOE,Izquierda,10,10,10,10,10,10,10,10,10,10,10,10,4,3,0,0,1,1,1,6,10,2,0,0,0,0,0,0,0,0,10,10
0,Ciudadanos,Cs,Derecha,9,9,9,9,9,9,9,9,9,9,8,9,9,2,0,0,0,1,2,9,9,0,0,0,0,0,0,0,0,0,9,9
2,Confederal de Unidas Podemos-En Comú Podem-Gal...,ECP-GUAYEM EL CANVI,Izquierda,7,7,7,7,7,7,7,7,7,7,7,7,7,4,0,0,0,3,3,6,7,1,0,0,0,0,0,0,0,0,7,7
28,Vasco (EAJ-PNV),EAJ-PNV,Derecha,6,6,6,6,6,6,6,6,6,6,0,6,5,4,0,2,2,4,5,6,6,0,0,0,0,1,0,0,0,0,6,6


In [115]:
fig = px.bar(
    data_profile.groupby(by=['grupo','formacion']).count().reset_index(),
    x = 'codParlamentario',
    y = 'grupo',
    color = 'partido'
    )

fig.update_layout(yaxis={'categoryorder':'total descending'})
fig.show()

In [70]:
fig = px.bar(
    data_profile.groupby(by=['formacion','grupo']).count().reset_index(),
    x = 'codParlamentario',
    y = 'formacion',
    color = 'grupo')

fig.update_layout(yaxis={'categoryorder':'total descending'})
fig.show()

## ASSUMPTIONS