In [9]:
import pandas as pd
import numpy as np
import json
import os
import requests, zipfile, io
import shutil

## Download the data and extract it in the data folder

In [7]:
link = "http://data.assemblee-nationale.fr/static/openData/repository/15/amo/deputes_actifs_mandats_actifs_organes/AMO10_deputes_actifs_mandats_actifs_organes_XV.json.zip"

In [8]:
r = requests.get(link)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(os.path.join(os.getcwd(), 'data'))

## Extract all the deputies infos from the csv files

The next cell scans through all the files with deputies info (they start with 'PA') and extracts the relevant data (name, family name, date of birth, political party...)
At the end we output a list of political parties with their respective code.

### Create the dataframe with the deputies information and the political parties information

In [9]:
column_names = ["code", "sex", "family name", "first name", "date of birth", "activity", "pol party", "dep", "num_dep", "circo"]
df_dep = pd.DataFrame(columns = column_names)
row_index = 0
non_declared = 'PO746314'

for path, subdirs, files in os.walk(os.path.join(os.getcwd(), 'data', 'json')):
    for name in files:
        if name[0:2] == 'PA':
            with open(os.path.join(path, name)) as f:
                data = json.load(f)
                print(data['acteur']['uid']['#text'])
                l = [data['acteur']['uid']['#text']]
                if data['acteur']['etatCivil']['ident']['civ'] == 'M.':
                    l.append('male')
                else:
                    l.append('female')
                l.append(data['acteur']['etatCivil']['ident']['nom'])
                l.append(data['acteur']['etatCivil']['ident']['prenom'])
                l.append(data['acteur']['etatCivil']['infoNaissance']['dateNais'])
                l.append(data['acteur']['profession']['socProcINSEE']['famSocPro'])
                for i in range(len(data['acteur']['mandats']['mandat'])):
                    if data['acteur']['mandats']['mandat'][i]['typeOrgane'] == 'PARPOL':
                        l.append(data['acteur']['mandats']['mandat'][i]['organes']['organeRef'])
                if(len(l) == 6):
                    l.append(non_declared)
                for i in range(len(data['acteur']['mandats']['mandat'])):
                    if((data['acteur']['mandats']['mandat'][i]['typeOrgane'] == 'ASSEMBLEE')&(data['acteur']['mandats']['mandat'][i]['infosQualite']['codeQualite'] == 'membre')):
                        l.append(data['acteur']['mandats']['mandat'][i]['election']['lieu']['departement'])
                        l.append(data['acteur']['mandats']['mandat'][i]['election']['lieu']['numDepartement'])
                        l.append(data['acteur']['mandats']['mandat'][i]['election']['lieu']['numCirco'])
                df_dep.loc[row_index] = l
                row_index = row_index + 1  
df_dep.loc[df_dep['activity'].str.len() < 5, 'activity'] = 'inconnu'
df_dep.info()

PA1008
PA1012
PA1029
PA1198
PA1206
PA1276
PA1327
PA1592
PA1630
PA1695
PA1809
PA1838
PA1874
PA2150
PA2155
PA223837
PA2377
PA2449
PA2492
PA2529
PA266776
PA266788
PA266793
PA266797
PA266808
PA267042
PA267200
PA267204
PA267260
PA267289
PA267306
PA267318
PA267324
PA267330
PA267337
PA267355
PA267378
PA267429
PA267440
PA267450
PA267527
PA267735
PA267766
PA267780
PA267785
PA267794
PA267901
PA268019
PA2796
PA2828
PA2952
PA2960
PA330008
PA330240
PA330788
PA330909
PA331582
PA331835
PA332228
PA332523
PA332614
PA332747
PA333285
PA333421
PA333818
PA334116
PA334768
PA335054
PA335612
PA335999
PA336175
PA336316
PA336439
PA337483
PA340343
PA340357
PA342196
PA342240
PA342415
PA342601
PA342935
PA343493
PA345722
PA346054
PA346876
PA405480
PA421348
PA508
PA588884
PA604
PA605036
PA605069
PA605084
PA605131
PA605518
PA605694
PA605963
PA605991
PA606098
PA606171
PA606202
PA606507
PA606639
PA607090
PA607155
PA607193
PA607395
PA607553
PA607595
PA607619
PA608016
PA608172
PA608292
PA608416
PA608641
PA608695
PA608741

In [10]:
df_dep.sample(20)

Unnamed: 0,code,sex,family name,first name,date of birth,activity,pol party,dep,num_dep,circo
230,PA719412,female,Le Feur,Sandrine,1991-03-18,Agriculteurs exploitants,PO761294,Finistère,29,4
74,PA340343,male,Saulignac,Hervé,1970-11-06,Cadres et professions intellectuelles supérieures,PO684932,Ardèche,7,1
434,PA721286,male,Ramadier,Alain,1958-07-08,Cadres et professions intellectuelles supérieures,PO710396,Seine-Saint-Denis,93,10
96,PA605963,male,Alauzet,Éric,1958-06-07,Cadres et professions intellectuelles supérieures,PO761294,Doubs,25,2
57,PA331835,female,Iborra,Monique,1945-03-08,Retraités,PO761294,Haute-Garonne,31,6
260,PA719676,female,Ressiguier,Muriel,1977-12-21,Employés,PO744856,Hérault,34,2
568,PA774960,female,Santiago,Isabelle,1965-09-20,Cadres et professions intellectuelles supérieures,PO684932,Val-de-Marne,94,9
453,PA721486,male,Naegelen,Christophe,1983-12-30,"Artisans, commerçants et chefs d'entreprise",PO744864,Vosges,88,3
499,PA721880,female,Guévenoux,Marie,1976-11-02,Cadres et professions intellectuelles supérieures,PO761294,Essonne,91,9
0,PA1008,male,David,Alain,1949-06-02,Cadres et professions intellectuelles supérieures,PO684932,Gironde,33,4


In [11]:
column_names = ["code", "name", "abreviated_name"]
df_polpar = pd.DataFrame(columns = column_names)
row_index = 0

for path, subdirs, files in os.walk(os.path.join(os.getcwd(), 'data', 'json')):
    for name in files:
        if name[0:2] == 'PO':
            with open(os.path.join(path, name)) as f:
                data = json.load(f)
                if data['organe']['codeType'] == 'PARPOL':
                    l = [data['organe']['uid']]
                    l.append(data['organe']['libelle'])
                    l.append(data['organe']['libelleAbrev'])
                    df_polpar.loc[row_index] = l
                    row_index = row_index + 1 
df_polpar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   code             19 non-null     object
 1   name             19 non-null     object
 2   abreviated_name  19 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes


## Count the number of members per political parties

In [12]:
#count the members per parties
df_members = df_dep['pol party'].value_counts().to_frame()
#extract the parties with less than 7 members
df_parties_to_replace = df_members.loc[df_members['pol party'] < 7]
#aggregate these parties into the not declared party (code = PO746314)
for party in df_parties_to_replace.index.to_list():
    df_dep.loc[df_dep['pol party'] == party, 'pol party'] = non_declared
    
#recount the number of members with the modification
df_members = df_dep['pol party'].value_counts().to_frame()
#remove the parties with few members
df_polpar = df_polpar.loc[df_polpar['code'].isin(df_members.loc[df_members['pol party'] >= 7].index.to_list())]
#add number of members in the political party dataframe
i = 0
for party in df_dep['pol party'].value_counts().to_frame().index.to_list():
    df_polpar.loc[df_polpar['code'] == party, 'members'] = int(df_dep['pol party'].value_counts().to_frame().values[i][0])
    i = i + 1
df_polpar['members'] = df_polpar['members'].astype(int)

## Add a color for each political party

Let's add a color per political party when we display them

In [13]:
df_polpar['color'] = ['tab:red', 'tab:olive', 'tab:green', 'tab:orange', 'tab:blue',
                      'tab:cyan', 'tab:brown', 'tab:purple', 'tab:pink', 'tab:grey']

Replace political party code by abreviated name in df_dep

In [14]:
update_list = df_polpar.drop(columns = ['name','members', 'color']).values.tolist()
for i in range(len(update_list)):
    df_dep.loc[df_dep['pol party'] == update_list[i][0], 'pol party'] = update_list[i][1]

## Arrange the activity categories

In [15]:
df_dep['activity'].unique()

array(['Cadres et professions intellectuelles supérieures',
       "Artisans, commerçants et chefs d'entreprise",
       'Agriculteurs exploitants', 'Ouvriers', 'Retraités',
       'Professions Intermédiaires', 'Employés',
       'Sans profession déclarée',
       'Autres personnes sans activité professionnelle', 'inconnu'],
      dtype=object)

In [16]:
df_dep.loc[df_dep['activity'] == 'Cadres et professions intellectuelles supérieures', 'activity'] = 'Cadres'
df_dep.loc[df_dep['activity'] == "Artisans, commerçants et chefs d'entreprise", 'activity'] = 'Entrepreneurs'
df_dep.loc[df_dep['activity'] == 'Professions Intermédiaires', 'activity'] = 'Prof. inter.'
df_dep.loc[df_dep['activity'] == 'Agriculteurs exploitants', 'activity'] = 'Agriculteurs'
df_dep.loc[df_dep['activity'] == 'Sans profession déclarée', 'activity'] = 'Non déclaré'
df_dep.loc[df_dep['activity'] == 'Autres personnes sans activité professionnelle', 'activity'] = 'Non déclaré'
df_dep.loc[df_dep['activity'] == 'inconnu', 'activity'] = 'Non déclaré'

In [17]:
df_dep['activity'].unique()

array(['Cadres', 'Entrepreneurs', 'Agriculteurs', 'Ouvriers', 'Retraités',
       'Prof. inter.', 'Employés', 'Non déclaré'], dtype=object)

## save deputy and political parties dataframes as cvs

In [18]:
df_dep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 573 entries, 0 to 572
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   code           573 non-null    object
 1   sex            573 non-null    object
 2   family name    573 non-null    object
 3   first name     573 non-null    object
 4   date of birth  573 non-null    object
 5   activity       573 non-null    object
 6   pol party      573 non-null    object
 7   dep            573 non-null    object
 8   num_dep        573 non-null    object
 9   circo          573 non-null    object
dtypes: object(10)
memory usage: 49.2+ KB


In [19]:
df_dep.sample(20)

Unnamed: 0,code,sex,family name,first name,date of birth,activity,pol party,dep,num_dep,circo
386,PA720830,female,Thomas,Valérie,1968-01-21,Cadres,LAREM,Puy-de-Dôme,63,1
22,PA266793,male,Bony,Jean-Yves,1955-03-11,Agriculteurs,REP,Cantal,15,2
123,PA610775,male,Gomès,Philippe,1958-10-27,Cadres,ND,Nouvelle-Calédonie,988,2
404,PA720996,male,El Guerrab,M'jid,1983-04-25,Cadres,UDRL,Français établis hors de France,99,9
303,PA720074,female,Janvier,Caroline,1982-03-09,Cadres,LAREM,Loiret,45,2
214,PA719272,female,Meunier,Frédérique,1960-12-08,Cadres,REP,Corrèze,19,2
83,PA346054,male,Maquet,Emmanuel,1968-06-02,Cadres,REP,Somme,80,3
392,PA720892,female,Panot,Mathilde,1989-01-15,Cadres,FI,Val-de-Marne,94,10
8,PA1630,male,Herbillon,Michel,1951-03-06,Cadres,REP,Val-de-Marne,94,8
425,PA721202,male,Coquerel,Éric,1958-12-30,Cadres,FI,Seine-Saint-Denis,93,1


In [20]:
df_polpar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 18
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   code             10 non-null     object
 1   name             10 non-null     object
 2   abreviated_name  10 non-null     object
 3   members          10 non-null     int32 
 4   color            10 non-null     object
dtypes: int32(1), object(4)
memory usage: 440.0+ bytes


In [21]:
df_polpar.sample(10)

Unnamed: 0,code,name,abreviated_name,members,color
18,PO761294,La République en Marche,LAREM,272,tab:grey
1,PO684932,Parti socialiste,PS,25,tab:olive
11,PO744856,La France Insoumise,FI,17,tab:cyan
3,PO684936,Europe Écologie Les Verts,EELV,13,tab:green
15,PO744864,"Union des démocrates, radicaux et libéraux",UDRL,24,tab:purple
4,PO684938,Mouvement Démocrate,MODEM,66,tab:orange
12,PO744858,Régions et peuples solidaires,RPS,11,tab:brown
0,PO684926,Parti communiste français,PCF,15,tab:red
16,PO746314,Non déclaré(s),ND,26,tab:pink
9,PO710396,Les Républicains,REP,104,tab:blue


In [22]:
df_dep.to_csv(os.path.join(os.getcwd(), 'data') + '\df_dep.csv',index=False)

In [1]:
df_polpar.to_csv(os.path.join(os.getcwd(), 'data') + '\df_polpar.csv',index=False)

NameError: name 'df_polpar' is not defined

# Load data about organs and organizations deputies belong to

In [5]:
column_names = ["code", "type", "name", "abreviated_name"]
df_organe = pd.DataFrame(columns = column_names)
row_index = 0

for path, subdirs, files in os.walk(os.path.join(os.getcwd(), 'data', 'json')):
    for name in files:
        if name[0:2] == 'PO':
            with open(os.path.join(path, name)) as f:
                data = json.load(f)
                l = [data['organe']['uid']]
                l.append(data['organe']['codeType'])
                l.append(data['organe']['libelle'])
                l.append(data['organe']['libelleAbrev'])
                df_organe.loc[row_index] = l
                row_index = row_index + 1 

In [6]:
df_organe.sample(20)

Unnamed: 0,code,type,name,abreviated_name
249,PO732758,DELEGBUREAU,Délégation chargée de la communication et de l...,COM
594,PO770077,MISINFO,Mission d'information sur la géopolitique des ...,GEOPOLMAT
461,PO746910,GE,Entrepreneuriat au féminin,ENTRFÉMIN
112,PO60175,ORGEXTPARL,Comité consultatif national d'éthique pour les...,30
421,PO744864,PARPOL,"Union des démocrates, radicaux et libéraux",UDRL
485,PO747084,GE,Modernisation des activités agricoles et struc...,MODERNAGRI
8,PO211493,COMSENAT,Commission des affaires sociales,SOCIC
114,PO60181,ORGEXTPARL,Conseil supérieur de la forêt et du bois,37
526,PO758409,ORGEXTPARL,Comité des rémunérations du comité d'organisat...,385
62,PO418755,API,Assemblée parlementaire de l'Union pour la Méd...,APEM


In [6]:
df_organe.to_csv(os.path.join(os.getcwd(), 'data') + '\df_organs.csv',index=False)

In [9]:
column_names = ["code_organe", "code_deputy"]
df_deputies_in_organe = pd.DataFrame(columns = column_names)
row_index = 0

for path, subdirs, files in os.walk(os.path.join(os.getcwd(), 'data', 'json')):
    for name in files:
        if name[0:2] == 'PA':
            with open(os.path.join(path, name)) as f:
                data = json.load(f)
                for i in range(len(data['acteur']['mandats']['mandat'])):
                    l = [data['acteur']['mandats']['mandat'][i]['organes']['organeRef']]
                    l.append(data['acteur']['uid']['#text'])
                    df_deputies_in_organe.loc[row_index] = l
                    row_index = row_index + 1
df_deputies_in_organe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16670 entries, 0 to 16669
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   code_organe  16670 non-null  object
 1   code_deputy  16670 non-null  object
dtypes: object(2)
memory usage: 390.7+ KB


In [10]:
df_deputies_in_organe.sample(20)

Unnamed: 0,code_organe,code_deputy
3602,PO746768,PA605518
23,PO710396,PA1012
788,PO733562,PA1874
11713,PO733472,PA720764
3531,PO733072,PA605036
5501,PO733642,PA718780
8827,PO717460,PA719740
2544,PO746669,PA331582
14554,PO774834,PA721734
4237,PO746763,PA608826


In [11]:
df_deputies_in_organe.to_csv(os.path.join(os.getcwd(), 'data') + '\df_deputies_in_organs.csv',index=False)

# Load data about votes

### Download the data and extract them in the data folder

In [1]:
link = 'http://data.assemblee-nationale.fr/static/openData/repository/15/loi/scrutins/Scrutins_XV.json.zip'

In [10]:
r = requests.get(link)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(os.path.join(os.getcwd(), 'data', 'json', 'vote'))
#move the file in a parent folder
source_dir = os.path.join(os.getcwd(), 'data', 'json', 'vote', 'json')
target_dir = os.path.join(os.getcwd(), 'data', 'json', 'vote')
for file_name in os.listdir(source_dir):
    shutil.move(os.path.join(source_dir, file_name), target_dir)
#remove the source dir
os.rmdir(source_dir)