# XML to dict to DataFrame -- Worthless result

We want to see whether a large XML file can be converted to JSON (yes!) and then turned into a DataFrame (no!) to display interesting stats.  

As a result, it turns out most columns only contain specific data about a single or a few samples. If the xml attributes were more rigorously ordered and named, we could easily interpret the dataset.  

Instead, we will use the JSON version to proceed further.

## Base files source:  

https://www.hatvp.fr/consulter-les-declarations/#open-data 

In [2]:
# Mount Google Drive if the XML file is stored there.
# Every dataset path used by the cells below lives under /content/drive/MyDrive/,
# so run this cell before any cell that reads or writes those files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# first working version

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm.auto import tqdm



# Path to the XML file
xml_file_path = '/content/drive/MyDrive/AI/HATVP/datasets/declarations.xml'

# Parse the XML file (the whole tree is loaded into memory)
tree = ET.parse(xml_file_path)
root = tree.getroot()

# One flat {tag: text} dict per <declaration> element.
# NOTE: entries are keyed by bare tag name, so when a tag occurs more than once
# inside a declaration (repeated or nested elements) only the last occurrence
# in document order survives. This is a known limitation of this first version.
declaration_list = []

# Iterate over the "declaration" objects with progress bar
for declaration in tqdm(root.findall('declaration'), desc='Parsing XML'):
    # iter() yields the <declaration> element itself as well as all of its
    # descendants; the tag test leaves the container element out.
    declaration_data = {
        element.tag: element.text
        for element in declaration.iter()
        if element.tag != 'declaration'
    }
    declaration_list.append(declaration_data)

# Build the DataFrame in a single pass from the list of dicts: one row per
# declaration, one column per distinct tag, NaN where a declaration lacks a tag.
# This replaces the previous "one single-row DataFrame per declaration + concat"
# approach, which was slow for ~10k records, raised on an empty list and
# silently dropped declarations that had no child elements.
df = pd.DataFrame(declaration_list)

# Display the DataFrame
print(df)


Parsing XML:   0%|          | 0/10069 [00:00<?, ?it/s]

                 dateDepot                                  uuid origine  \
0      11/07/2022 15:40:13  4344aaa1-874d-4e6d-9b1a-45f7725b710c    ADEL   
1      27/11/2022 18:18:23  fa8d18ec-0db9-4a39-b1f4-caba0c31329b    ADEL   
2      19/08/2022 10:08:23  21916899-a643-428c-824e-3aedf9ee103e    ADEL   
3      04/10/2022 17:22:07  c40ab214-b21e-43b6-95a2-36d5860dc526    ADEL   
4      03/09/2021 10:41:48  ac693815-1521-4a79-a314-6bd830de8988    ADEL   
...                    ...                                   ...     ...   
10064  15/02/2022 11:07:33  a59cb0f9-4a44-438a-9929-0a4b41bc85ed    ADEL   
10065  17/06/2021 22:39:18  32f13c10-d64e-4697-9dca-f7a6950fdb59    ADEL   
10066  03/12/2020 23:48:29  5e7c9ce7-dc5c-4777-ae9f-39d40f3e02f4    ADEL   
10067  25/06/2021 13:29:08  19140875-1488-43e7-95a7-63d0b7212a19    ADEL   
10068  20/08/2022 08:26:16  d1129c22-a715-4df6-8c0f-cfc678f1518c    ADEL   

      complete attachedFiles                           fileName  \
0         true    \n

## save to csv

In [None]:
# Persist the tag-keyed DataFrame on Drive, next to the source XML.
# `index` is left at its pandas default, so the row index is written as the first CSV column.
df.to_csv('/content/drive/MyDrive/AI/HATVP/datasets/declarations_extract.csv')

In [None]:
# Summarise the frame: number of rows and columns, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10069 entries, 0 to 10068
Columns: 149 entries, dateDepot to resultatFiscal
dtypes: object(149)
memory usage: 11.4+ MB


# XML to data_dict

In [None]:
# Install xmltodict into the runtime; it is imported by the next cell
!pip install xmltodict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [None]:
import xmltodict
import json

# Destination of the JSON dump
file_path = '/content/drive/MyDrive/AI/HATVP/datasets/declarations_dict.json'

# Read the whole XML document into memory as UTF-8 text
with open('/content/drive/MyDrive/AI/HATVP/datasets/declarations.xml', mode='r', encoding='utf-8') as xml_file:
    xml_data = xml_file.read()

# Turn the XML text into nested dicts/lists
data_dict = xmltodict.parse(xml_data)

# Write data_dict out as JSON. The file is opened as UTF-8 and
# ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
with open(file_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_dict, json_file, ensure_ascii=False)


In [None]:
# Number of entries stored under declarations -> declaration
len(data_dict['declarations']['declaration'])

10069

## create log format text file

In [None]:
import json

# Location of the JSON dump of data_dict
file_path = '/content/drive/MyDrive/AI/HATVP/datasets/declarations_dict.json'

# Read the file back as UTF-8 text and decode it into data_dict
with open(file_path, mode='r', encoding='utf-8') as json_file:
    data_dict = json.loads(json_file.read())


In [None]:
# Write the log-format file: one JSON document per declaration, one per line
with open('/content/drive/MyDrive/AI/HATVP/datasets/declarations_log.txt', mode='w', encoding='utf-8') as file:
    for declaration in data_dict['declarations']['declaration']:
        # print() appends the '\n' line terminator after the JSON text
        print(json.dumps(declaration, ensure_ascii=False), file=file)

# flatten data_dict to declaration_list

## load data_dict

In [3]:
import json

# JSON dump of data_dict on the mounted Drive
file_path = '/content/drive/MyDrive/AI/HATVP/datasets/declarations_dict.json'

# Decode the UTF-8 JSON file back into data_dict
with open(file_path, mode='r', encoding='utf-8') as json_file:
    raw_json = json_file.read()
    data_dict = json.loads(raw_json)


In [4]:
# Install flatten_json into the runtime; `flatten` is imported from it in the next cell
!pip install flatten_json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flatten_json
  Downloading flatten_json-0.1.13.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flatten_json
  Building wheel for flatten_json (setup.py) ... [?25l[?25hdone
  Created wheel for flatten_json: filename=flatten_json-0.1.13-py3-none-any.whl size=7964 sha256=70fcc8b9e2d3f2001f1bbb7eeff55ec2b8351a02ea798ab0e55b567e3479c851
  Stored in directory: /root/.cache/pip/wheels/c7/55/89/0dbf87571194b7ed4228b018280a8312e5af2d8f5954504672
Successfully built flatten_json
Installing collected packages: flatten_json
Successfully installed flatten_json-0.1.13


In [5]:
from flatten_json import flatten
from tqdm.auto import tqdm

# Flatten every nested declaration into a single-level dict, with a tqdm
# progress bar over the declarations
declaration_list = [
    flatten(declaration)
    for declaration in tqdm(data_dict['declarations']['declaration'])
]

  0%|          | 0/10069 [00:00<?, ?it/s]

## export flattened dict to log format

The flattened JSON `declaration` objects are saved in log format: one JSON object per line in a text file, separated by EOL characters.  

In [6]:
# Write the flattened declarations in log format: one JSON object per line
with open('/content/drive/MyDrive/AI/HATVP/datasets/declarations_log_flat.txt', mode='w', encoding='utf-8') as file:
    for declaration in declaration_list:
        json_str = json.dumps(declaration, ensure_ascii=False)
        file.write(f'{json_str}\n')

### batch format

In [7]:
import json
import math

import os

# Number of declarations written to each batch file
batch_size = 100

# Folder receiving the batch files. open(..., 'w') does not create missing
# parent folders, so make sure it exists before writing the first batch.
batch_dir = '/content/drive/MyDrive/AI/HATVP/datasets/batches'
os.makedirs(batch_dir, exist_ok=True)

# Determine the number of batches required (the last one may be partial)
num_batches = math.ceil(len(declaration_list) / batch_size)

# Save declaration objects in batches; files are numbered from 1
for batch_num in range(num_batches):
    # Determine the start and end indices for the current batch
    start_index = batch_num * batch_size
    end_index = min(start_index + batch_size, len(declaration_list))

    # Generate the file path for the current batch
    file_path = f'{batch_dir}/declarations_log_batch{batch_num + 1}.txt'

    # Export batch to a JSON per line and an EOL char at the end
    with open(file_path, 'w', encoding='utf-8') as file:
        for declaration in declaration_list[start_index:end_index]:
            json_str = json.dumps(declaration, ensure_ascii=False)
            file.write(json_str + '\n')


# unsuccessful attempts at a DF

In [None]:
# convert list of declaration flat dicts to a single DF:
# one row per declaration, one column per distinct flattened key
# (a key absent from a declaration shows up as NaN in that row).
# Relies on `pd` imported by an earlier cell.
declaration_df = pd.DataFrame(declaration_list)
declaration_df

In [None]:
# Summarise the flattened frame: number of rows and columns, dtypes and memory usage
declaration_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10069 entries, 0 to 10068
Columns: 6932 entries, dateDepot to activConsultantDto_items_items_6_remuneration_montant_montant_6_montant
dtypes: float64(550), object(6382)
memory usage: 532.5+ MB


In [None]:
# Count NaN values per column: isna() flags the missing cells and sum() adds
# them up column by column, giving a Series indexed by column name
nan_counts = declaration_df.isna().sum()

print(nan_counts)

dateDepot                                                                      0
uuid                                                                           0
origine                                                                        0
complete                                                                       0
attachedFiles_attachedFiles_fileName                                         372
                                                                           ...  
activConsultantDto_items_items_6_remuneration_montant_montant_4_montant    10068
activConsultantDto_items_items_6_remuneration_montant_montant_5_annee      10068
activConsultantDto_items_items_6_remuneration_montant_montant_5_montant    10068
activConsultantDto_items_items_6_remuneration_montant_montant_6_annee      10068
activConsultantDto_items_items_6_remuneration_montant_montant_6_montant    10068
Length: 6932, dtype: int64


# Worthless result

We can see that the columns with the fewest NaN values (under 1000, with the std being around 970) are worthless: only the "general" category is spared. This doesn't provide any insight, as most interesting values are censored for each individual.

In [None]:
# One-column frame of NaN counts, indexed by the original column names
column_stats = nan_counts.to_frame(name='nan_counts')
# column_stats.nan_counts.describe()

# Keep only the columns with fewer than 1000 NaN values
column_stats.loc[column_stats['nan_counts'] < 1000]

Unnamed: 0,nan_counts
dateDepot,0
uuid,0
origine,0
complete,0
attachedFiles_attachedFiles_fileName,372
declarationVersion,162
activConsultantDto_neant,95
activProfCinqDerniereDto_neant,95
activProfConjointDto_neant,95
fonctionBenevoleDto_neant,95
