# Export huggingface dataset for frontend usage  

We want to make a static dataviz website.  
We need to have CSV and JSON files available.  

## install libs

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

## load dataset

### download

In [None]:
from datasets import load_dataset

dataset = load_dataset("the-french-artist/hatvp_declaration_list_archive", split="train")

Downloading readme:   0%|          | 0.00/937 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12556 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['civilite', 'prenom', 'nom', 'classement', 'type_mandat', 'qualite', 'type_document', 'departement', 'date_publication', 'date_depot', 'nom_fichier', 'url_dossier', 'open_data', 'statut_publication', 'id_origine', 'url_photo'],
    num_rows: 12556
})

In [None]:
df = dataset.to_pandas()

In [None]:
df.head()

Unnamed: 0,civilite,prenom,nom,classement,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo
0,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,di,44,2021-09-27,2020-09-01,hakem-abbassia-di16146-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-di16146-commune-nantes.xml,Livrée,,
1,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,
2,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,di,76,2021-10-11,2020-10-14,chekhemani-abdel-kader-di16326-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-di16326-commune-rouen.xml,Livrée,,
3,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,dim,76,2021-10-12,2021-07-14,chekhemani-abdel-kader-dim16327-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-dim16327-commune-rouen.xml,Livrée,,
4,M.,Abdelaziz,HAMIDA,HAMIDA Abdelaziz18939,commune,Maire de Goussainville,di,28,2021-08-10,2020-10-03,hamida-abdelaziz-di15206-commune-goussainville...,/pages_nominatives/hamida-abdelaziz-18939,hamida-abdelaziz-di15206-commune-goussainville...,Livrée,,


### convert dates to datetime type

In [None]:
import pandas as pd

# parse both date columns into pandas datetimes
for date_column in ('date_depot', 'date_publication'):
    df[date_column] = pd.to_datetime(df[date_column])

### add gender feature

In [None]:
def get_gender(civility):
    """Map a civility title to a gender label.

    Returns 'female' when the title is exactly 'Mme' and 'male' for every
    other value.
    """
    return 'female' if civility == 'Mme' else 'male'

# Derive the gender of each row from its 'civilite' title
df['gender'] = df['civilite'].apply(get_gender)

### posting to publication time
How long between the two dates? (in days)

In [None]:
# Days elapsed between deposit and publication. Both date columns contain missing
# values, so the column is float: rows missing either date get NaN.
df['publication_delay'] = (df['date_publication'] - df['date_depot']).dt.days
df.head()

Unnamed: 0,civilite,prenom,nom,classement,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo,gender,publication_delay
0,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,di,44,2021-09-27,2020-09-01,hakem-abbassia-di16146-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-di16146-commune-nantes.xml,Livrée,,,female,391.0
1,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,,female,57.0
2,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,di,76,2021-10-11,2020-10-14,chekhemani-abdel-kader-di16326-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-di16326-commune-rouen.xml,Livrée,,,male,362.0
3,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,dim,76,2021-10-12,2021-07-14,chekhemani-abdel-kader-dim16327-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-dim16327-commune-rouen.xml,Livrée,,,male,90.0
4,M.,Abdelaziz,HAMIDA,HAMIDA Abdelaziz18939,commune,Maire de Goussainville,di,28,2021-08-10,2020-10-03,hamida-abdelaziz-di15206-commune-goussainville...,/pages_nominatives/hamida-abdelaziz-18939,hamida-abdelaziz-di15206-commune-goussainville...,Livrée,,,male,311.0


### inspect duplicates

Each row is not a unique person, but one declaration (an update) filed by a person.  
We find that the number of unique people is about half the number of rows, meaning an average of 2 updates per person.  

To get meaningful results, we only keep the latest declaration for each unique individual.  

In [None]:
len(df)

12556

In [None]:
len(df['classement'].unique())

6284

We sort by `date_depot` in ascending order and group by the unique declarant id `classement`, then we select the last sample to get the latest declaration for each declarant.

In [None]:
df[df['classement'] == 'hakemaaaabbassia4615']

Unnamed: 0,civilite,prenom,nom,classement,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo,gender,publication_delay
0,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,di,44,2021-09-27,2020-09-01,hakem-abbassia-di16146-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-di16146-commune-nantes.xml,Livrée,,,female,391.0
1,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,,female,57.0


In [None]:
# Keep, for each declarant ('classement'), the single row with the most recent 'date_depot'.
# groupby(...).last() is not used here: it returns the last non-null value of each column
# independently, so one output row could mix fields from several declarations
# (e.g. a not-yet-published latest declaration inheriting an older publication date).
latest_declarations_df = (
    df.sort_values('date_depot', na_position='first', kind='stable')  # undated rows lose to dated ones
    .drop_duplicates(subset='classement', keep='last')                # whole latest row per declarant
    .sort_values('classement')                                        # same row order as a groupby on 'classement'
    .reset_index(drop=True)
)
latest_declarations_df[latest_declarations_df['classement'] == 'hakemaaaabbassia4615']

Unnamed: 0,classement,civilite,prenom,nom,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo,gender,publication_delay
5400,hakemaaaabbassia4615,Mme,Abbassia,HAKEM,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,,female,57.0


## Export datasets

We export 2 versions per dataset:  
* `full` includes all lines in the dataset, including multiple updates per person  
* `latest` includes only the last update for each unique person  

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12556 entries, 0 to 12555
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   civilite            12556 non-null  object        
 1   prenom              12556 non-null  object        
 2   nom                 12556 non-null  object        
 3   classement          12556 non-null  object        
 4   type_mandat         12556 non-null  object        
 5   qualite             12556 non-null  object        
 6   type_document       12556 non-null  object        
 7   departement         10645 non-null  object        
 8   date_publication    10898 non-null  datetime64[ns]
 9   date_depot          12098 non-null  datetime64[ns]
 10  nom_fichier         10974 non-null  object        
 11  url_dossier         12556 non-null  object        
 12  open_data           10895 non-null  object        
 13  statut_publication  12546 non-null  object    

In [None]:
df.head()

Unnamed: 0,civilite,prenom,nom,classement,type_mandat,qualite,type_document,departement,date_publication,date_depot,nom_fichier,url_dossier,open_data,statut_publication,id_origine,url_photo,gender,publication_delay
0,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,di,44,2021-09-27,2020-09-01,hakem-abbassia-di16146-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-di16146-commune-nantes.xml,Livrée,,,female,391.0
1,Mme,Abbassia,HAKEM,hakemaaaabbassia4615,commune,Adjointe au maire de Nantes,dim,44,2021-09-28,2021-08-02,hakem-abbassia-dim16147-commune-nantes.pdf,/pages_nominatives/hakem-abbassia,hakem-abbassia-dim16147-commune-nantes.xml,Livrée,,,female,57.0
2,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,di,76,2021-10-11,2020-10-14,chekhemani-abdel-kader-di16326-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-di16326-commune-rouen.xml,Livrée,,,male,362.0
3,M.,Abdel Kader,CHEKHEMANI,chekhemaniaaaabdelkader1679,commune,Adjoint au maire de Rouen,dim,76,2021-10-12,2021-07-14,chekhemani-abdel-kader-dim16327-commune-rouen.pdf,/pages_nominatives/chekhemani-abdel-kader,chekhemani-abdel-kader-dim16327-commune-rouen.xml,Livrée,,,male,90.0
4,M.,Abdelaziz,HAMIDA,HAMIDA Abdelaziz18939,commune,Maire de Goussainville,di,28,2021-08-10,2020-10-03,hamida-abdelaziz-di15206-commune-goussainville...,/pages_nominatives/hamida-abdelaziz-18939,hamida-abdelaziz-di15206-commune-goussainville...,Livrée,,,male,311.0


In [None]:
# Full dataset export (every declaration, with the derived 'gender' and 'publication_delay' columns).
# No index=False here, so the row index is written as an unnamed first column.
df.to_csv('export.csv')

### number of postings/publications per day.

In [None]:
df['date_depot'].min()

Timestamp('2019-05-06 00:00:00')

In [None]:
df['date_depot'].max()

Timestamp('2024-06-11 00:00:00')

In [None]:
def get_publication_per_date(df, filename):
  """Export monthly counts of deposited and published declarations to a CSV file.

  Args:
    df: DataFrame with datetime columns 'date_depot' and 'date_publication'
      (missing values are allowed and ignored).
    filename: path of the CSV to write. Its columns are 'date' (year-month),
      'date_depot_count' and 'date_publication_count'.
  """
  # one row per calendar day between the first and the last deposit
  date_range = pd.date_range(start=df['date_depot'].min(), end=df['date_depot'].max())
  date_range_df = pd.DataFrame({'date': date_range})

  # same for date_publication
  date_range_publication = pd.date_range(start=df['date_publication'].min(), end=df['date_publication'].max())
  date_range_publication_df = pd.DataFrame({'date': date_range_publication})

  # create a single date_range_df with all dates from both ranges
  date_range_df = date_range_df.merge(date_range_publication_df, how='outer', on='date')

  # count declarations per deposit day (0 for days without any deposit).
  # fillna is applied to the mapped Series and the result assigned back: calling
  # fillna(..., inplace=True) on date_range_df[...] mutates a temporary object and
  # is a no-op under pandas Copy-on-Write.
  date_range_df['date_depot_count'] = (
      date_range_df['date'].map(df['date_depot'].value_counts()).fillna(0)
  )

  # same for date_publication
  date_range_df['date_publication_count'] = (
      date_range_df['date'].map(df['date_publication'].value_counts()).fillna(0)
  )

  # sum the daily counts by year-month
  date_range_df['date_m'] = date_range_df['date'].dt.to_period('M')

  monthly_declaration_counts = (
      date_range_df
      .groupby('date_m')[['date_depot_count', 'date_publication_count']]
      .sum()
      .reset_index()
  )
  monthly_declaration_counts.columns = [
      'date',
      'date_depot_count',
      'date_publication_count'
  ]

  monthly_declaration_counts.to_csv(filename, index=False)

get_publication_per_date(df, 'publications_per_month_full.csv')
get_publication_per_date(latest_declarations_df, 'publications_per_month_latest.csv')

### average time between publication and posting

In [None]:
# get average 'publication_delay' per year of deposit
def get_average_publication_delay(df, filename):
  """Export the mean 'publication_delay' per deposit year to a CSV file.

  Args:
    df: DataFrame with a datetime 'date_depot' column and a numeric
      'publication_delay' column.
    filename: path of the CSV to write, with columns 'year' and
      'average_publication_delay'.
  """
  # Rows without a deposit date cannot be assigned to a year (groupby would drop them
  # anyway). Removing them first keeps .dt.year an integer: with NaT present it turns
  # into float and the CSV would contain years such as "2019.0".
  dated_df = df.dropna(subset=['date_depot'])
  publication_delay_df = (
      dated_df.groupby(dated_df['date_depot'].dt.year)['publication_delay']
      .mean()
      .reset_index()
  )
  publication_delay_df.columns = ['year', 'average_publication_delay']
  publication_delay_df.to_csv(filename, index=False)

get_average_publication_delay(df, 'average_publication_delay_full.csv')
get_average_publication_delay(latest_declarations_df, 'average_publication_delay_latest.csv')

## number of publications per departement  
To show how many people come from where

In [None]:
def get_publication_per_departement(df, filename):
  """Write the number of declarations per 'departement' to a CSV file.

  Counts the non-null 'classement' values in each departement group and
  writes them to `filename` with the columns 'departement' and 'count'.
  """
  counts = df.groupby('departement')['classement'].count()
  table = counts.rename('count').reset_index()
  table.to_csv(filename, index=False)

get_publication_per_departement(df, 'publications_per_departement_full.csv')
get_publication_per_departement(latest_declarations_df, 'publications_per_departement_latest.csv')

### document types

In [None]:
def get_document_type_count(df, filename):
  """Write the number of declarations per document type to a CSV file.

  Counts the non-null 'classement' values in each 'type_document' group and
  writes them to `filename` with the columns 'document_type' and 'count'.
  """
  counts = df.groupby('type_document')['classement'].count()
  table = counts.rename_axis('document_type').rename('count').reset_index()
  table.to_csv(filename, index=False)

get_document_type_count(df, 'document_type_count_full.csv')
get_document_type_count(latest_declarations_df, 'document_type_count_latest.csv')

## Download all datasets

In [None]:
from glob import glob
from google.colab import files

# hand every CSV found in the working directory to google.colab's files.download
for csv_path in glob('*.csv'):
  files.download(csv_path)