# Export huggingface dataset for frontend usage  

We want to make a static dataviz website.  
We need to have CSV and JSON files available.  

## install libs

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

## load dataset

### download

In [None]:
from datasets import load_dataset

# Load only the "train" split of the HATVP declaration archive from the Hugging Face Hub.
dataset = load_dataset("the-french-artist/hatvp_declaration_list_archive", split="train")

Downloading readme:   0%|          | 0.00/937 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12556 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['civilite', 'prenom', 'nom', 'classement', 'type_mandat', 'qualite', 'type_document', 'departement', 'date_publication', 'date_depot', 'nom_fichier', 'url_dossier', 'open_data', 'statut_publication', 'id_origine', 'url_photo'],
    num_rows: 12556
})

In [None]:
df = dataset.to_pandas()

In [None]:
df.head()

Unnamed: 0,civilite,prenom,nom,classement,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo
0,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,di,44,2021-09-27,2020-09-01,hakem-abbassia-di16146-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-di16146-commune-nantes.xml,Livrée,,
1,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,
2,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,di,76,2021-10-11,2020-10-14,chekhemani-abdel-kader-di16326-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-di16326-commune-rouen.xml,Livrée,,
3,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,dim,76,2021-10-12,2021-07-14,chekhemani-abdel-kader-dim16327-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-dim16327-commune-rouen.xml,Livrée,,
4,M.,Abdelaziz,HAMIDA,HAMIDA Abdelaziz18939,commune,Maire de Goussainville,di,28,2021-08-10,2020-10-03,hamida-abdelaziz-di15206-commune-goussainville...,/pages_nominatives/hamida-abdelaziz-18939,hamida-abdelaziz-di15206-commune-goussainville...,Livrée,,


### add gender feature

In [None]:
def get_gender(civility):
    """Map a civility title to a gender label.

    Returns 'female' when `civility` equals 'Mme'. Every other value,
    including a missing one, is labelled 'male'.
    """
    return 'female' if civility == 'Mme' else 'male'

# Derive the gender of each row from its civility title ('civilite' column)
df['gender'] = df['civilite'].apply(get_gender)

### inspect duplicates

Each row is not a unique person, just an update.  
We find that the number of unique people is about half the number of rows, meaning an average of 2 updates per person.  

To get meaningful results, we only keep the latest declaration for each unique individual.  

In [None]:
len(df)

12556

In [None]:
len(df['classement'].unique())

6284

In [None]:
import pandas as pd

# Parse date_depot into datetime values so the declarations can be ordered chronologically
df['date_depot'] = pd.to_datetime(df['date_depot'])

We sort by `date_depot` in ascending order and group by the unique declarant id `classement`, then we select the last sample to get the latest declaration for each declarant.

In [None]:
df[df['classement'] == 'hakemaaaabbassia4615']

Unnamed: 0,civilite,prenom,nom,classement,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo,gender
0,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,di,44,2021-09-27,2020-09-01,hakem-abbassia-di16146-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-di16146-commune-nantes.xml,Livrée,,,female
1,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,,female


In [None]:
# Keep the most recent declaration (by date_depot) of each declarant.
# drop_duplicates keeps whole rows, whereas groupby(...).last() takes the last
# non-null value of every column separately and can therefore blend fields
# coming from different declarations of the same person.
latest_declarations_df = (
    df.dropna(subset=['classement'])  # rows without a declarant id cannot be attributed to anyone
    .sort_values('date_depot', kind='stable')  # stable: same-day declarations keep their original order
    .drop_duplicates(subset='classement', keep='last')
    .sort_values('classement')  # one row per declarant, ordered by declarant id
    .reset_index(drop=True)
)
latest_declarations_df[latest_declarations_df['classement'] == 'hakemaaaabbassia4615']

Unnamed: 0,classement,civilite,prenom,nom,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo,gender
5400,hakemaaaabbassia4615,Mme,Abbassia,HAKEM,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,,female


## Export datasets

We export 2 versions per dataset:  
* `full` includes all lines in the dataset, including multiple updates per person  
* `latest` includes only the last update for each unique person  

### Top first names

In [None]:
def save_top_surnames(df, filename, top_n=10):
  """Write the most frequent (prenom, gender) pairs of `df` to a CSV file.

  Despite its name, the function ranks the 'prenom' column (first names),
  not the 'nom' column.

  Args:
    df: DataFrame with 'prenom' and 'gender' columns.
    filename: path of the CSV file to write.
    top_n: number of most frequent pairs to keep (default 10).

  The CSV has the columns prenom, gender, count, ordered by decreasing count.
  """
  top_counts = (
      df.groupby(['prenom', 'gender'])
      .size()
      .sort_values(ascending=False)
      .head(top_n)
      .reset_index(name='count')  # turn the counts Series into a 3-column frame
  )
  top_counts.to_csv(filename, index=False)


save_top_surnames(df, 'top_surnames_full.csv')
save_top_surnames(latest_declarations_df, 'top_surnames_latest.csv')

### Gender ratio

In [None]:
def save_gender_ratio(df, filename):
  """Write the number of rows per 'gender' value of `df` to the CSV file `filename`."""
  df['gender'].value_counts().to_csv(filename)


save_gender_ratio(df, 'total_gender_ratio_full.csv')
save_gender_ratio(latest_declarations_df, 'total_gender_ratio_latest.csv')

### Gender ratio per mandate type

In [None]:
def save_gender_ratio_per_type_mandat(df, filename):
  """Write the female and male row counts per 'type_mandat' to a CSV file.

  Args:
    df: DataFrame with 'type_mandat' and 'gender' columns, where gender is
      'female' or 'male'.
    filename: path of the CSV file to write.

  The CSV has the columns type_mandat, female_count, male_count. Only counts
  are exported; no ratio is computed.
  """
  counts_per_type_mandat = (
      df.groupby(['type_mandat', 'gender'])
      .size()
      .unstack(fill_value=0)
      # Select the gender columns by name: a gender that is absent from `df`
      # gets a zero column instead of breaking a positional column renaming.
      .reindex(columns=['female', 'male'], fill_value=0)
      .rename(columns={'female': 'female_count', 'male': 'male_count'})
      .reset_index()
  )
  counts_per_type_mandat.to_csv(filename, index=False)

save_gender_ratio_per_type_mandat(df, 'gender_ratio_per_type_mandat_full.csv')
save_gender_ratio_per_type_mandat(latest_declarations_df, 'gender_ratio_per_type_mandat_latest.csv')

### Gender ratio per departement

In [None]:
def save_gender_ratio_per_departement(df, filename):
  """Write the percentage of 'female' rows per 'departement' to a CSV file.

  Args:
    df: DataFrame with 'departement' and 'gender' columns, where gender is
      'female' or 'male'.
    filename: path of the CSV file to write.

  The CSV has the columns departement, women_perc, where women_perc is
  female / (female + male) * 100.
  """
  counts_per_dept = (
      df.groupby(['departement', 'gender'])
      .size()
      .unstack(fill_value=0)
      # Select the gender columns by name: a gender that is absent from `df`
      # gets a zero column instead of breaking a positional column renaming.
      .reindex(columns=['female', 'male'], fill_value=0)
  )
  women_perc = counts_per_dept['female'] / (counts_per_dept['female'] + counts_per_dept['male']) * 100

  women_perc.rename('women_perc').reset_index().to_csv(filename, index=False)

save_gender_ratio_per_departement(df, 'women_percentage_per_departement_full.csv')
save_gender_ratio_per_departement(latest_declarations_df, 'women_percentage_per_departement_latest.csv')

## Download all datasets

In [None]:
from glob import glob
from google.colab import files

# Trigger a browser download for every CSV file found in the working directory
for csv_path in glob('*.csv'):
  files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>