In [69]:
import pandas as pd
import numpy as np
import json
import os
import requests, zipfile, io

## Download the data and extract it into the data folder

In [70]:
# URL of the archive downloaded by the next cell; its file name ends in ".json.zip".
link = "http://data.assemblee-nationale.fr/static/openData/repository/15/amo/deputes_actifs_mandats_actifs_organes/AMO10_deputes_actifs_mandats_actifs_organes_XV.json.zip"

In [71]:
# Download the archive. A timeout keeps the cell from hanging forever on a dead
# connection, and raise_for_status() stops on an HTTP error instead of passing
# an error page to zipfile (which would fail with an unrelated BadZipFile).
r = requests.get(link, timeout=60)
r.raise_for_status()

# Unpack the in-memory archive into ./data; the context manager closes it afterwards.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall(os.path.join(os.getcwd(), 'data'))

## Extract all the deputies' information from the JSON files

The next cell scans all the files containing deputy information (their names start with 'PA') and extracts the relevant data (first name, family name, date of birth, political party...).
The cell after it builds the list of political parties with their respective codes.

### Create the dataframe with the deputies information and the political parties information

In [72]:
column_names = ["code", "sex", "family name", "first name", "date of birth", "activity", "pol party"]
# Party code assigned to deputies that have no 'PARPOL' mandate.
non_declared = 'PO746314'


def parse_deputy(data, default_party=non_declared):
    """Turn one parsed 'PA*' JSON document into a row matching ``column_names``.

    Parameters
    ----------
    data : dict
        Parsed JSON document with a top-level 'acteur' key.
    default_party : str
        Party code used when no mandate has ``typeOrgane == 'PARPOL'``.

    Returns
    -------
    list
        [code, sex, family name, first name, date of birth, activity, pol party]

    Raises
    ------
    KeyError
        If one of the expected keys is missing from ``data``.
    """
    acteur = data['acteur']
    civil = acteur['etatCivil']
    ident = civil['ident']

    mandates = acteur['mandats']['mandat']
    # NOTE(review): presumably a lone mandate may be serialized as a bare object
    # rather than a one-element list; wrap it so iteration works either way.
    if isinstance(mandates, dict):
        mandates = [mandates]

    # Keep only the first PARPOL mandate so every row has exactly 7 values
    # (several PARPOL mandates would otherwise produce an over-long row).
    party = next(
        (m['organes']['organeRef'] for m in mandates if m['typeOrgane'] == 'PARPOL'),
        default_party,
    )

    return [
        acteur['uid']['#text'],
        'male' if ident['civ'] == 'M.' else 'female',
        ident['nom'],
        ident['prenom'],
        civil['infoNaissance']['dateNais'],
        acteur['profession']['socProcINSEE']['famSocPro'],
        party,
    ]


# Collect all rows first and build the frame once, instead of growing it row by row.
deputy_rows = []
for path, subdirs, files in os.walk(os.path.join(os.getcwd(), 'data', 'json')):
    for name in files:
        if name.startswith('PA'):
            # Explicit encoding: JSON is UTF-8, whatever the platform default is.
            with open(os.path.join(path, name), encoding='utf-8') as f:
                deputy_rows.append(parse_deputy(json.load(f)))

df_dep = pd.DataFrame(deputy_rows, columns=column_names)
# Entries whose length is below 5 are relabelled 'inconnu'
# (presumably empty or placeholder values - TODO confirm against the raw files).
df_dep.loc[df_dep['activity'].str.len() < 5, 'activity'] = 'inconnu'
df_dep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 573 entries, 0 to 572
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   code           573 non-null    object
 1   sex            573 non-null    object
 2   family name    573 non-null    object
 3   first name     573 non-null    object
 4   date of birth  573 non-null    object
 5   activity       573 non-null    object
 6   pol party      573 non-null    object
dtypes: object(7)
memory usage: 35.8+ KB


In [73]:
column_names = ["code", "name", "abreviated_name"]


def parse_party(data):
    """Extract one political-party row from a parsed 'PO*' JSON document.

    Parameters
    ----------
    data : dict
        Parsed JSON document with a top-level 'organe' key.

    Returns
    -------
    list or None
        [code, name, abbreviated name] when ``codeType == 'PARPOL'``, else None.

    Raises
    ------
    KeyError
        If one of the expected keys is missing from ``data``.
    """
    organe = data['organe']
    if organe['codeType'] != 'PARPOL':
        return None
    return [organe['uid'], organe['libelle'], organe['libelleAbrev']]


# Collect all rows first and build the frame once, instead of growing it row by row.
party_rows = []
for path, subdirs, files in os.walk(os.path.join(os.getcwd(), 'data', 'json')):
    for name in files:
        if name.startswith('PO'):
            # Explicit encoding: JSON is UTF-8, whatever the platform default is.
            with open(os.path.join(path, name), encoding='utf-8') as f:
                party_row = parse_party(json.load(f))
            if party_row is not None:
                party_rows.append(party_row)

df_polpar = pd.DataFrame(party_rows, columns=column_names)
df_polpar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   code             19 non-null     object
 1   name             19 non-null     object
 2   abreviated_name  19 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes


## Count the number of members per political party

In [74]:
# Parties with fewer members than this are merged into the "not declared" party.
MIN_PARTY_MEMBERS = 7


def count_party_members(deputies):
    """Count deputies per value of the 'pol party' column.

    Returns a one-column frame indexed by party code, with the count stored in
    a column named 'pol party'. The column and index names are set explicitly
    because the default naming of ``value_counts()`` differs between pandas
    versions (the counts column is called 'count' from pandas 2.0 on).
    """
    counts = deputies['pol party'].value_counts()
    return counts.rename_axis(None).to_frame('pol party')


# count the members per party
df_members = count_party_members(df_dep)
# extract the parties with fewer than MIN_PARTY_MEMBERS members
df_parties_to_replace = df_members.loc[df_members['pol party'] < MIN_PARTY_MEMBERS]
# aggregate these parties into the not declared party (code = PO746314)
df_dep.loc[df_dep['pol party'].isin(df_parties_to_replace.index), 'pol party'] = non_declared

# recount the number of members after the aggregation
df_members = count_party_members(df_dep)
# keep only the parties that still reach the threshold; .copy() so that adding
# a column below does not write into a view of the original frame
kept_parties = df_members.loc[df_members['pol party'] >= MIN_PARTY_MEMBERS].index
df_polpar = df_polpar.loc[df_polpar['code'].isin(kept_parties)].copy()
# add the number of members; every remaining code is present in df_members,
# so the lookup yields no missing value and the integer cast is safe
df_polpar['members'] = df_polpar['code'].map(df_members['pol party']).astype(int)

Let's assign a color to each political party, to be used when we display them

In [75]:
# One colour name per row of df_polpar, in row order; the list length must
# equal the number of rows for the assignment to succeed.
party_colors = [
    'tab:red',
    'tab:olive',
    'tab:green',
    'tab:orange',
    'tab:blue',
    'tab:cyan',
    'tab:brown',
    'tab:purple',
    'tab:pink',
    'tab:grey',
]
df_polpar['color'] = party_colors

Replace the political party code with its abbreviated name (the `abreviated_name` column) in df_dep

In [76]:
# [code, abbreviated name] pairs for every party kept in df_polpar.
update_list = df_polpar[['code', 'abreviated_name']].values.tolist()
# Overwrite each party code in df_dep with the matching abbreviation.
for party_code, party_abbrev in update_list:
    has_code = df_dep['pol party'] == party_code
    df_dep.loc[has_code, 'pol party'] = party_abbrev

### Arrange the activity categories

In [77]:
# List the distinct activity labels before they are shortened.
df_dep['activity'].unique()

array(['Cadres et professions intellectuelles supérieures',
       "Artisans, commerçants et chefs d'entreprise",
       'Agriculteurs exploitants', 'Ouvriers', 'Retraités',
       'Professions Intermédiaires', 'Employés',
       'Sans profession déclarée',
       'Autres personnes sans activité professionnelle', 'inconnu'],
      dtype=object)

In [78]:
# Original activity label -> short label, applied in this order.
activity_labels = {
    'Cadres et professions intellectuelles supérieures': 'Cadres',
    "Artisans, commerçants et chefs d'entreprise": 'Entrepreneurs',
    'Professions Intermédiaires': 'Prof. inter.',
    'Agriculteurs exploitants': 'Agriculteurs',
    'Sans profession déclarée': 'Non déclaré',
    'Autres personnes sans activité professionnelle': 'Non déclaré',
    'inconnu': 'Non déclaré',
}
for long_label, short_label in activity_labels.items():
    matches = df_dep['activity'] == long_label
    df_dep.loc[matches, 'activity'] = short_label

In [79]:
# Check the distinct activity labels after the relabelling.
df_dep['activity'].unique()

array(['Cadres', 'Entrepreneurs', 'Agriculteurs', 'Ouvriers', 'Retraités',
       'Prof. inter.', 'Employés', 'Non déclaré'], dtype=object)

Save the deputy and political party dataframes as CSV files

In [80]:
# Summary of df_dep (columns, non-null counts, dtypes) before it is saved.
df_dep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 573 entries, 0 to 572
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   code           573 non-null    object
 1   sex            573 non-null    object
 2   family name    573 non-null    object
 3   first name     573 non-null    object
 4   date of birth  573 non-null    object
 5   activity       573 non-null    object
 6   pol party      573 non-null    object
dtypes: object(7)
memory usage: 35.8+ KB


In [84]:
# Show 20 randomly chosen deputies; no random_state is passed, so the rows differ on each run.
df_dep.sample(20)

Unnamed: 0,code,sex,family name,first name,date of birth,activity,pol party
106,PA607395,male,Travert,Stéphane,1969-10-12,Cadres,LAREM
23,PA266797,male,Gosselin,Philippe,1966-10-23,Cadres,REP
157,PA718728,female,Vanceunebrock,Laurence,1970-05-06,Employés,LAREM
98,PA606098,male,Marleix,Olivier,1971-02-06,Cadres,REP
355,PA720560,female,Goulet,Perrine,1978-03-19,Cadres,MODEM
200,PA719146,male,Acquaviva,Jean-Félix,1973-03-19,Cadres,RPS
314,PA720178,male,Garcia,Laurent,1970-05-31,Cadres,MODEM
523,PA722102,female,Vidal,Annie,1956-09-17,Cadres,LAREM
409,PA721046,female,Bassire,Nathalie,1968-01-22,Retraités,UDRL
15,PA223837,male,Brochand,Bernard,1938-06-05,Retraités,REP


In [81]:
# Summary of df_polpar (columns, non-null counts, dtypes) before it is saved.
df_polpar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 18
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   code             10 non-null     object
 1   name             10 non-null     object
 2   abreviated_name  10 non-null     object
 3   members          10 non-null     int32 
 4   color            10 non-null     object
dtypes: int32(1), object(4)
memory usage: 440.0+ bytes


In [86]:
# Show 10 randomly chosen parties; no random_state is passed, so the row order differs on each run.
df_polpar.sample(10)

Unnamed: 0,code,name,abreviated_name,members,color
15,PO744864,"Union des démocrates, radicaux et libéraux",UDRL,24,tab:purple
3,PO684936,Europe Écologie Les Verts,EELV,13,tab:green
18,PO761294,La République en Marche,LAREM,272,tab:grey
9,PO710396,Les Républicains,REP,104,tab:blue
0,PO684926,Parti communiste français,PCF,15,tab:red
1,PO684932,Parti socialiste,PS,25,tab:olive
4,PO684938,Mouvement Démocrate,MODEM,66,tab:orange
12,PO744858,Régions et peuples solidaires,RPS,11,tab:brown
11,PO744856,La France Insoumise,FI,17,tab:cyan
16,PO746314,Non déclaré(s),ND,26,tab:pink


In [82]:
# Build the path with os.path.join: this picks the right separator on every OS
# and avoids a backslash in a string literal ('\d' is not a valid escape sequence).
df_dep.to_csv(os.path.join(os.getcwd(), 'data', 'df_dep.csv'), index=False)

In [83]:
# Build the path with os.path.join: this picks the right separator on every OS
# and avoids a backslash in a string literal ('\d' is not a valid escape sequence).
df_polpar.to_csv(os.path.join(os.getcwd(), 'data', 'df_polpar.csv'), index=False)