## install libs

In [None]:
%%capture
!pip install datasets
!pip install -U huggingface_hub

In [None]:
# Authenticate against the Hugging Face Hub.
# The token is read from the Colab secret named 'HF_TOKEN', so this cell only works inside Google Colab.
# NOTE(review): the final push_to_hub cell presumably requires a token with write permission — confirm.
from huggingface_hub import login
from google.colab import userdata

login(userdata.get('HF_TOKEN'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## load dataset into DF

In [None]:
from datasets import load_dataset

# Load only the 'train' split of the HATVP declarations dataset
dataset = load_dataset("the-french-artist/hatvp_declarations_xml_plus_json", split='train')

In [None]:
dataset

Dataset({
    features: ['xml_sha1', 'declaration_xml', 'declaration_json', 'extracted_text'],
    num_rows: 10944
})

In [None]:
import pandas as pd

# Convert the Hugging Face Dataset into a pandas DataFrame for the rest of the notebook
declaration_df = dataset.to_pandas()

In [None]:
# Drop the unused 'extracted_text' column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is gone no longer raises KeyError.
declaration_df = declaration_df.drop(columns=['extracted_text'], errors='ignore')

In [None]:
declaration_df.head()

Unnamed: 0,xml_sha1,declaration_xml,declaration_json
0,0a0a9f2a6772942557ab5355d76af442f8f65e01,<declaration><dateDepot>11/07/2022 15:40:13</d...,"{""declaration"": {""dateDepot"": ""11/07/2022 15:4..."
1,0a0a9f2a6772942557ab5355d76af442f8f65e01,<declaration><dateDepot>27/11/2022 18:18:23</d...,"{""declaration"": {""dateDepot"": ""27/11/2022 18:1..."
2,0a0a9f2a6772942557ab5355d76af442f8f65e01,<declaration><dateDepot>19/08/2022 10:08:23</d...,"{""declaration"": {""dateDepot"": ""19/08/2022 10:0..."
3,0a0a9f2a6772942557ab5355d76af442f8f65e01,<declaration><dateDepot>04/10/2022 17:22:07</d...,"{""declaration"": {""dateDepot"": ""04/10/2022 17:2..."
4,0a0a9f2a6772942557ab5355d76af442f8f65e01,<declaration><dateDepot>20/09/2021 13:41:36</d...,"{""declaration"": {""dateDepot"": ""20/09/2021 13:4..."


## test a single declaration

In [None]:
# raw JSON string of the very first declaration, used as a test sample below
first_test = declaration_df['declaration_json'].iloc[0]

In [None]:
import json

# Decode the raw JSON string into nested Python dicts/lists, then display the result
parsed_declaration = json.loads(first_test)
parsed_declaration

{'declaration': {'dateDepot': '11/07/2022 15:40:13',
  'uuid': '4344aaa1-874d-4e6d-9b1a-45f7725b710c',
  'origine': 'ADEL',
  'complete': 'true',
  'attachedFiles': {'attachedFiles': {'fileName': 'VUE_PDF_DU_RECEPISSE_DU_DEPOT_XML',
    'serverFileName': None,
    'base64EncodedContent': ''}},
  'declarationVersion': '20171221',
  'activConsultantDto': {'neant': 'true'},
  'activProfCinqDerniereDto': {'neant': 'true'},
  'activProfConjointDto': {'items': {'items': {'motif': {'id': 'CREATION',
      'label': None},
     'commentaire': None,
     'nomConjoint': '[Données non publiées]',
     'employeurConjoint': 'CENTRE HOSPITALIER DU HAUT-BUGEY',
     'activiteProf': 'Infirmière'}},
   'neant': 'false'},
  'fonctionBenevoleDto': {'neant': 'true'},
  'mandatElectifDto': {'items': {'items': [{'motif': {'id': 'CREATION',
       'label': None},
      'commentaire': 'REVENUS NETS IMPOSABLES        [Données non publiées]',
      'descriptionMandat': 'DEPUTE',
      'remuneration': {'brutNet':

In [None]:
declaration = parsed_declaration.get('declaration')

# Walk declaration -> participationFinanciereDto -> items -> items.
# As soon as one level is missing the walk stops and stock_data_list stays None.
stock_data_list = declaration
for level_key in ('participationFinanciereDto', 'items', 'items'):
  if stock_data_list is None:
    break
  stock_data_list = stock_data_list.get(level_key)

stock_data_list

[{'commentaire': '[Données non publiées]',
  'nomSociete': 'ORANGE',
  'evaluation': '877',
  'remuneration': 'néant',
  'capitalDetenu': None,
  'nombreParts': '83',
  'actiConseil': 'Non'},
 {'commentaire': '[Données non publiées]',
  'nomSociete': 'CREDIT AGRICOLE SA',
  'evaluation': '2910',
  'remuneration': 'néant',
  'capitalDetenu': None,
  'nombreParts': '341',
  'actiConseil': 'Non'},
 {'commentaire': '[Données non publiées]',
  'nomSociete': 'AIRBUS',
  'evaluation': '1929',
  'remuneration': 'NEANT',
  'capitalDetenu': None,
  'nombreParts': '20',
  'actiConseil': 'Non'},
 {'commentaire': '[Données non publiées]',
  'nomSociete': "L'OREAL",
  'evaluation': '6552',
  'remuneration': 'NEANT',
  'capitalDetenu': None,
  'nombreParts': '20',
  'actiConseil': 'Non'}]

In [None]:
stock_data_list[0]

{'motif': {'id': 'CREATION', 'label': None},
 'commentaire': '[Données non publiées]',
 'nomSociete': 'ORANGE',
 'evaluation': '877',
 'remuneration': 'néant',
 'capitalDetenu': None,
 'nombreParts': '83',
 'actiConseil': 'Non'}

In [None]:
# Remove the "motif" attribute that is not interesting.
# New dicts are built instead of deleting keys in place: deleting would also alter
# stock_data_list / parsed_declaration, so re-running the cells above would show
# different outputs than on the first run.
stock_data_clean = [
  {key: value for key, value in curr_stock_data.items() if key != 'motif'}
  for curr_stock_data in stock_data_list
]

stock_data_df = pd.DataFrame(stock_data_clean)
stock_data_df

Unnamed: 0,commentaire,nomSociete,evaluation,remuneration,capitalDetenu,nombreParts,actiConseil
0,[Données non publiées],ORANGE,877,néant,,83,Non
1,[Données non publiées],CREDIT AGRICOLE SA,2910,néant,,341,Non
2,[Données non publiées],AIRBUS,1929,NEANT,,20,Non
3,[Données non publiées],L'OREAL,6552,NEANT,,20,Non


### Make a function to get a DF of all stocks for a given declaration (or None if no stocks)

In [None]:
import json


def get_stock_df(string_declaration):
  """Build a DataFrame of the financial holdings listed in one declaration.

  Args:
    string_declaration: JSON string of a declaration, with the payload stored
      under the top-level 'declaration' key.

  Returns:
    A pandas DataFrame with one row per entry found under
    declaration -> participationFinanciereDto -> items -> items. The columns
    declaration_uuid, declaration_publication_date, declarant_name,
    declarant_surname and declarant_birthdate come first, then the entry's own
    attributes (without 'motif'), with 'commentaire' moved last when present.
    Returns None when the declaration lists no holdings (missing level, empty
    list, or no dict entries).

  Raises:
    json.JSONDecodeError: if string_declaration is not valid JSON.
    KeyError: if holdings exist but 'uuid', 'dateDepot' or the
      general -> declarant fields are missing.
  """
  parsed_declaration = json.loads(string_declaration)

  # safely travel down to the stock data: a missing or non-dict level means "no stocks"
  stock_data_list = parsed_declaration
  for level_key in ('declaration', 'participationFinanciereDto', 'items', 'items'):
    if not isinstance(stock_data_list, dict):
      return None
    stock_data_list = stock_data_list.get(level_key)

  # early return if no stocks are found in the current declaration
  if stock_data_list is None:
    return None

  # a declaration with a single holding stores a bare dict instead of a list
  if not isinstance(stock_data_list, list):
    stock_data_list = [stock_data_list]

  # copy every entry without the uninteresting "motif" attribute
  # (dropped even when its value is None; non-dict entries are skipped)
  stock_rows = [
    {key: value for key, value in curr_stock_data.items() if key != 'motif'}
    for curr_stock_data in stock_data_list
    if isinstance(curr_stock_data, dict)
  ]
  if not stock_rows:
    return None

  # make a dataframe from the list of dicts
  stocks_df = pd.DataFrame(stock_rows)

  # add some ID features; each one is inserted at position 0, so the last
  # inserted (declaration_uuid) ends up as the first column
  declaration = parsed_declaration['declaration']
  declarant = declaration['general']['declarant']
  id_features = [
    ('declarant_birthdate', declarant['dateNaissance']),
    # NOTE: 'declarant_surname' holds the 'prenom' field; the column name is
    # kept as-is because later cells rely on it
    ('declarant_surname', declarant['prenom']),
    ('declarant_name', declarant['nom']),
    ('declaration_publication_date', declaration['dateDepot']),
    ('declaration_uuid', declaration['uuid']),
  ]
  for column_name, column_value in id_features:
    stocks_df.insert(0, column_name, column_value)

  # move "commentaire" column to the end (not every entry carries one)
  if 'commentaire' in stocks_df.columns:
    stocks_df['commentaire'] = stocks_df.pop('commentaire')

  return stocks_df

In [None]:
get_stock_df(first_test)

Unnamed: 0,declaration_uuid,declaration_publication_date,declarant_name,declarant_surname,declarant_birthdate,nomSociete,evaluation,remuneration,capitalDetenu,nombreParts,actiConseil,commentaire
0,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,ORANGE,877,néant,,83,Non,[Données non publiées]
1,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,CREDIT AGRICOLE SA,2910,néant,,341,Non,[Données non publiées]
2,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,AIRBUS,1929,NEANT,,20,Non,[Données non publiées]
3,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,L'OREAL,6552,NEANT,,20,Non,[Données non publiées]


## Perform extraction on complete dataset  

We will collect a list of dataframes, one dataframe per declaration, and then concatenate them.  

In [None]:
# plain Python list holding the JSON string of every declaration
string_json_list = declaration_df['declaration_json'].tolist()

In [None]:
from tqdm.auto import tqdm

# one entry per declaration: a DataFrame of its holdings, or None when it has none
stock_df_list = [
  get_stock_df(curr_json_declaration)
  for curr_json_declaration in tqdm(string_json_list)
]

  0%|          | 0/10944 [00:00<?, ?it/s]

## Format and clean dataset

### Concat list of DFs into a single DF

In [None]:
# Stack the per-declaration frames into one frame with a fresh 0..n-1 index
complete_stocks_df = pd.concat(stock_df_list, ignore_index=True)

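### Convert name and first name to uppercase  

Note: the `declarant_surname` column holds the declarant's first name (`prenom` in the source JSON), while `declarant_name` holds the family name (`nom`).  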

In [None]:
# upper-case both name columns so the same person is always spelled the same way
for name_column in ('declarant_name', 'declarant_surname'):
  complete_stocks_df[name_column] = complete_stocks_df[name_column].str.upper()
complete_stocks_df.head()

Unnamed: 0,declaration_uuid,declaration_publication_date,declarant_name,declarant_surname,declarant_birthdate,nomSociete,evaluation,remuneration,capitalDetenu,nombreParts,actiConseil,commentaire,nomOrganisationConseil
0,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,ORANGE,877,néant,,83,Non,[Données non publiées],
1,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,CREDIT AGRICOLE SA,2910,néant,,341,Non,[Données non publiées],
2,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,AIRBUS,1929,NEANT,,20,Non,[Données non publiées],
3,4344aaa1-874d-4e6d-9b1a-45f7725b710c,11/07/2022 15:40:13,ABAD,DAMIEN,05/04/1980,L'OREAL,6552,NEANT,,20,Non,[Données non publiées],
4,fa8d18ec-0db9-4a39-b1f4-caba0c31329b,27/11/2022 18:18:23,ABAD,DAMIEN,05/04/1980,ORANGE,877,néant,,83,Non,[Données non publiées],
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11055,19140875-1488-43e7-95a7-63d0b7212a19,25/06/2021 13:29:08,ZUILI,NICOLAS,08/03/1965,AXA,2173,0,,100,,COMPTE TITRE CDN,
11056,19140875-1488-43e7-95a7-63d0b7212a19,25/06/2021 13:29:08,ZUILI,NICOLAS,08/03/1965,BNP PARIBAS,10820,0,,200,,COMPTE TITRES CDN,
11057,19140875-1488-43e7-95a7-63d0b7212a19,25/06/2021 13:29:08,ZUILI,NICOLAS,08/03/1965,RENAULT,7073,0,,200,,COMPTE TITRE CDN,
11058,19140875-1488-43e7-95a7-63d0b7212a19,25/06/2021 13:29:08,ZUILI,NICOLAS,08/03/1965,SAINT GOBAIN,11238,0,,200,,COMPTE TITRES CDN,


### Convert date column to datetime format

In [None]:
# publication dates are day-first strings such as '11/07/2022 15:40:13'
DEPOT_DATE_FORMAT = '%d/%m/%Y %H:%M:%S'
complete_stocks_df['declaration_publication_date'] = pd.to_datetime(
  complete_stocks_df['declaration_publication_date'],
  format=DEPOT_DATE_FORMAT,
)

### Create a unique ID for each declarant

In [None]:
# Build one identifier per declarant by joining name, surname and birthdate with '-'
id_columns = ['declarant_name', 'declarant_surname', 'declarant_birthdate']
declarant_key = complete_stocks_df[id_columns[0]]
for id_column in id_columns[1:]:
  declarant_key = declarant_key + '-' + complete_stocks_df[id_column]
complete_stocks_df['declarant_unique_id'] = declarant_key
complete_stocks_df.head()

Unnamed: 0,declaration_uuid,declaration_publication_date,declarant_name,declarant_surname,declarant_birthdate,nomSociete,evaluation,remuneration,capitalDetenu,nombreParts,actiConseil,commentaire,nomOrganisationConseil,declarant_unique_id
0,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,ORANGE,877,néant,,83,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
1,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,CREDIT AGRICOLE SA,2910,néant,,341,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
2,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,AIRBUS,1929,NEANT,,20,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
3,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,L'OREAL,6552,NEANT,,20,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
4,fa8d18ec-0db9-4a39-b1f4-caba0c31329b,2022-11-27 18:18:23,ABAD,DAMIEN,05/04/1980,ORANGE,877,néant,,83,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980


We count the distinct declaration UUIDs per declarant to see how many declarations each declarant has filed:

In [None]:
# Group rows by declarant_unique_id and count the distinct declaration_uuid values,
# i.e. the number of different declarations in which each declarant reports holdings
unique_versions_per_declarant = complete_stocks_df.groupby('declarant_unique_id')['declaration_uuid'].nunique()
# Display as a one-column frame, declarants with the most declarations first
unique_versions_per_declarant.to_frame().sort_values('declaration_uuid', ascending=False)

Unnamed: 0_level_0,declaration_uuid
declarant_unique_id,Unnamed: 1_level_1
LISNARD-DAVID-02/02/1969,21
VIAUD-JÉRÔME-13/09/1977,12
KLINKERT-BRIGITTE-22/07/1956,9
LEVY-ARIEL-27/02/1990,9
BLANC-ETIENNE-29/08/1954,9
...,...
GÉRY-FABIEN-07/06/1972,1
GYGES-CHRISTOPHER-18/04/1984,1
GUYON-SÉBASTIEN-26/12/1979,1
GUYOD-STEPHANE-22/05/1970,1


Let's check the top declarant by number of unique declarations:

In [None]:
# Select the declarant with the most declarations from the counts computed above.
# Filtering on declarant_unique_id (instead of a hard-coded family name) avoids
# mixing in other declarants who share the same name.
top_declarant_id = unique_versions_per_declarant.idxmax()
complete_stocks_df[complete_stocks_df['declarant_unique_id'] == top_declarant_id].sort_values('declaration_publication_date')

Unnamed: 0,declaration_uuid,declaration_publication_date,declarant_name,declarant_surname,declarant_birthdate,nomSociete,evaluation,remuneration,capitalDetenu,nombreParts,actiConseil,commentaire,nomOrganisationConseil,declarant_unique_id
6564,8ea4ed4c-f037-47b3-8b2d-92c6037074a7,2020-07-09 09:20:01,LISNARD,DAVID,02/02/1969,SCI [Données non publiées],990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6567,e3040ef1-99a3-4a75-8f89-06a5e4bc2e91,2020-09-10 19:34:24,LISNARD,DAVID,02/02/1969,SCI [Données non publiées] ...,990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6565,82ac415e-4933-4bae-9bdc-111b179b6277,2020-09-10 19:40:57,LISNARD,DAVID,02/02/1969,SCI [Données non publiées] [...,990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6568,40145004-6fa4-448a-b595-94df13ff73f5,2020-11-10 10:09:59,LISNARD,DAVID,02/02/1969,SCI [Données non publiées],990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6569,4f0ba7ae-46ec-4cfd-86c4-83d2f9f95739,2020-11-12 14:38:38,LISNARD,DAVID,02/02/1969,SCI [Données non publiées] ...,990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6570,92fbba6c-402d-42ce-8329-0b4f8f47cfdb,2021-01-12 16:35:03,LISNARD,DAVID,02/02/1969,SCI [Données non publiées],990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6571,8d5893fa-d322-4235-8a15-7d95703199bc,2021-01-12 16:36:39,LISNARD,DAVID,02/02/1969,SCI [Données non publiées] ...,990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6573,c1175acf-d2d0-4ffc-91c8-996b6044dac2,2021-07-28 16:36:04,LISNARD,DAVID,02/02/1969,SCI [Données non publiées],990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6572,48cb21e0-9349-4589-bbae-d7b72b58f5e4,2021-07-28 16:45:58,LISNARD,DAVID,02/02/1969,SCI [Données non publiées],990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969
6566,fc8a1b52-2045-4ad9-af0e-f72179962b68,2021-07-28 16:47:25,LISNARD,DAVID,02/02/1969,SCI [Données non publiées],990,0,100.0,99,,Monsieur Lisnard est nue propriétaire de 99 pa...,,LISNARD-DAVID-02/02/1969


### Convert numbers to number types

In [None]:
# check remaining features to clean
complete_stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11060 entries, 0 to 11059
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   declaration_uuid              11060 non-null  object        
 1   declaration_publication_date  11060 non-null  datetime64[ns]
 2   declarant_name                11060 non-null  object        
 3   declarant_surname             11060 non-null  object        
 4   declarant_birthdate           11060 non-null  object        
 5   nomSociete                    11060 non-null  object        
 6   evaluation                    11060 non-null  int64         
 7   remuneration                  11060 non-null  object        
 8   capitalDetenu                 7011 non-null   float64       
 9   nombreParts                   11059 non-null  float64       
 10  actiConseil                   2752 non-null   object        
 11  commentaire                 

In [None]:
complete_stocks_df.head(3)

Unnamed: 0,declaration_uuid,declaration_publication_date,declarant_name,declarant_surname,declarant_birthdate,nomSociete,evaluation,remuneration,capitalDetenu,nombreParts,actiConseil,commentaire,nomOrganisationConseil,declarant_unique_id
0,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,ORANGE,877,néant,,83.0,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
1,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,CREDIT AGRICOLE SA,2910,néant,,341.0,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
2,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,AIRBUS,1929,NEANT,,20.0,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980


These features can be easily converted to numeric types:

In [None]:
complete_stocks_df['evaluation'] = pd.to_numeric(complete_stocks_df['evaluation'])

In [None]:
complete_stocks_df['nombreParts'] = pd.to_numeric(complete_stocks_df['nombreParts'])

In [None]:
complete_stocks_df['capitalDetenu'] = pd.to_numeric(complete_stocks_df['capitalDetenu'])

In [None]:
# Sanity check: the three columns converted in the cells above must now be numeric
numeric_columns = ['evaluation', 'nombreParts', 'capitalDetenu']
assert all(
  pd.api.types.is_numeric_dtype(complete_stocks_df[column]) for column in numeric_columns
), complete_stocks_df[numeric_columns].dtypes

The `remuneration` feature is fuzzy.  
We leave it raw.  

In [None]:
complete_stocks_df['remuneration'].value_counts()

remuneration
0           5669
Néant        214
NEANT        154
NS            95
neant         85
            ... 
655            1
32 euros       1
481,81         1
398,05         1
762            1
Name: count, Length: 2073, dtype: int64

### Assign labels to string features  

In [None]:
# check remaining features to clean
complete_stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11060 entries, 0 to 11059
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   declaration_uuid              11060 non-null  object        
 1   declaration_publication_date  11060 non-null  datetime64[ns]
 2   declarant_name                11060 non-null  object        
 3   declarant_surname             11060 non-null  object        
 4   declarant_birthdate           11060 non-null  object        
 5   nomSociete                    11060 non-null  object        
 6   evaluation                    11060 non-null  int64         
 7   remuneration                  11060 non-null  object        
 8   capitalDetenu                 7011 non-null   float64       
 9   nombreParts                   11059 non-null  float64       
 10  actiConseil                   2752 non-null   object        
 11  commentaire                 

In [None]:
complete_stocks_df.head(3)

Unnamed: 0,declaration_uuid,declaration_publication_date,declarant_name,declarant_surname,declarant_birthdate,nomSociete,evaluation,remuneration,capitalDetenu,nombreParts,actiConseil,commentaire,nomOrganisationConseil,declarant_unique_id
0,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,ORANGE,877,néant,,83.0,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
1,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,CREDIT AGRICOLE SA,2910,néant,,341.0,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980
2,4344aaa1-874d-4e6d-9b1a-45f7725b710c,2022-07-11 15:40:13,ABAD,DAMIEN,05/04/1980,AIRBUS,1929,NEANT,,20.0,Non,[Données non publiées],,ABAD-DAMIEN-05/04/1980


`actiConseil` is already clean enough:

In [None]:
complete_stocks_df.actiConseil.value_counts()

actiConseil
Non    2670
Oui      82
Name: count, dtype: int64

`nomOrganisationConseil` is seldom used and doesn't bring valuable insights. We delete the feature.  

In [None]:
complete_stocks_df.nomOrganisationConseil.value_counts()

nomOrganisationConseil
[Données non publiées]                                                                              6
aucune                                                                                              6
Mediascop 100%                                                                                      4
0                                                                                                   4
EILIS. 100% du capital détenu en tant que dirigeant fondateur non salarié de cette SASU.            3
Selarl Pascale BORDES Avocate; je détiens 100 % des parts sociales, soit 100 % du capital social    3
AMP CONSEILS LILLE 75% DU CAPITAL                                                                   2
AMP CONSEILS AIX 60% DU CAPITAL                                                                     2
MLEXPERT 100% DU CAPITAL                                                                            2
SAS Renaissance                                            

In [None]:
# Drop the rarely filled 'nomOrganisationConseil' column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is gone no longer raises KeyError.
complete_stocks_df = complete_stocks_df.drop(columns=['nomOrganisationConseil'], errors='ignore')

## Upload dataset to the HUB

Once clean and complete, we can safely upload the dataset to the HUB for further analysis.

In [None]:
from datasets import Dataset

complete_stocks_ds = Dataset.from_pandas(complete_stocks_df)

In [None]:
complete_stocks_ds

Dataset({
    features: ['declaration_uuid', 'declaration_publication_date', 'declarant_name', 'declarant_surname', 'declarant_birthdate', 'nomSociete', 'evaluation', 'remuneration', 'capitalDetenu', 'nombreParts', 'actiConseil', 'commentaire', 'declarant_unique_id'],
    num_rows: 11060
})

In [None]:
complete_stocks_ds.push_to_hub("the-french-artist/hatvp_stock_participationFinanciereDto")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/the-french-artist/hatvp_stock_participationFinanciereDto/commit/ab607c6316d4cd75212305c9d8f3257b417b06c5', commit_message='Upload dataset', commit_description='', oid='ab607c6316d4cd75212305c9d8f3257b417b06c5', pr_url=None, pr_revision=None, pr_num=None)

# Conclusion  

## Some features are incomplete  

The following features have missing values:  
```
commentaire
actiConseil
capitalDetenu
nombreParts (a single missing value)
```

`remuneration` has no missing values, but it mixes inconsistent free-text formats (`32€`, `32 euros`, `32`, `Néant`, etc.), so it is left as raw text.

Next part here:  

https://colab.research.google.com/drive/1HNRCwOiMuZBy-e-y9pRSqUTPkRwNfpuK#scrollTo=DPgn2Ko6QhZ4