In [None]:
import os
import json
import pandas as pd
from collections import defaultdict

In [None]:
# CONNECT TO GOOGLE DRIVE
# Mount Google Drive at /content/drive so that later cells can access the
# project folders located under that path.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def _hashable_key(obj):
    """Return ``obj`` unchanged if it can be used as a dict key, else ``str(obj)``."""
    try:
        hash(obj)
    except TypeError:
        # JSON arrays/objects decode to lists/dicts, which are unhashable.
        return str(obj)
    return obj


def calculate_distributions(json_data):
    """Count specification names, per-name values and measures in one document.

    Args:
        json_data: Iterable of entries. An entry that is a dict containing the
            key "errore" is reported and skipped; any other dict entry is
            treated as a mapping whose values are item dicts. Each item may
            carry a "specifications" dict (whose values hold "name" and
            "value") and a "Measure" field. Non-dict entries are reported and
            skipped.

    Returns:
        A tuple ``(name_count, value_count, measure_count)`` where
        ``name_count`` maps name -> occurrences, ``value_count`` maps
        name -> value -> occurrences and ``measure_count`` maps
        measure -> occurrences (empty/falsy measures are not counted).
        Unhashable names, values or measures (lists, dicts) are counted under
        their ``str()`` representation.

    Raises:
        KeyError: If a specification lacks the "name" or "value" key.
    """
    name_count = defaultdict(int)
    value_count = defaultdict(lambda: defaultdict(int))
    measure_count = defaultdict(int)

    for entry in json_data:
        if not isinstance(entry, dict):
            print("Trovata una voce non valida (non un dizionario):", entry)
            continue
        if "errore" in entry:
            # The entry records a failure instead of data: report and skip it.
            print(f"Errore nel file: {entry['errore']}")
            continue

        for item in entry.values():
            # Distribution of the "name"/"value" pairs in the specifications.
            for spec in item.get("specifications", {}).values():
                # Names AND values may be lists/dicts in the JSON; both are
                # normalised so that they can be used as dictionary keys.
                name = _hashable_key(spec["name"])
                value = _hashable_key(spec["value"])
                name_count[name] += 1
                value_count[name][value] += 1

            # Distribution of the "Measure" field (skipped when empty).
            measure = item.get("Measure", "")
            if measure:
                measure_count[_hashable_key(measure)] += 1

    return name_count, value_count, measure_count

# Funzioni per aggregare i dati
def aggregate_dicts(global_dict, local_dict):
    """Add every count of ``local_dict`` onto the same key of ``global_dict``.

    ``global_dict`` is updated in place; nothing is returned.
    """
    for label in local_dict:
        global_dict[label] += local_dict[label]

def aggregate_nested_dicts(global_dict, local_dict):
    """Add the counts of a two-level mapping onto ``global_dict`` in place.

    For each ``outer -> inner -> count`` triple in ``local_dict`` the count is
    added to ``global_dict[outer][inner]``. Nothing is returned.
    """
    # Lazily flatten the two levels into (outer, inner, count) triples.
    flattened = (
        (outer, inner, amount)
        for outer, inner_counts in local_dict.items()
        for inner, amount in inner_counts.items()
    )
    for outer, inner, amount in flattened:
        global_dict[outer][inner] += amount




In [None]:
# PROFILING BEFORE ALIGNMENT
# Folder containing the input JSON files
input_folder = "/content/drive/My Drive/Ingegneria-dei-dati/extracted_claims"
# Folder where the distribution files are saved
distributions_folder = "/content/drive/My Drive/Ingegneria-dei-dati/distributions"
# Create the output folder if it does not exist
os.makedirs(distributions_folder, exist_ok=True)

# Output file paths
name_profiling_csv = os.path.join(distributions_folder, "name_profiling.csv")
value_profiling_csv = os.path.join(distributions_folder, "value_profiling.csv")
metrics_profiling_csv = os.path.join(distributions_folder, "metrics_profiling.csv")

# Accumulators for the results of all JSON files
all_name_counts = defaultdict(int)
all_value_counts = defaultdict(lambda: defaultdict(int))
all_measure_counts = defaultdict(int)

# Iterate over the JSON files in the folder. The listing is sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make the
# row order of the generated CSV files vary from run to run.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".json"):
        continue
    file_path = os.path.join(input_folder, file_name)
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    name_count, value_count, measure_count = calculate_distributions(data)

    # Merge the per-file results into the global distributions
    aggregate_dicts(all_name_counts, name_count)
    aggregate_nested_dicts(all_value_counts, value_count)
    aggregate_dicts(all_measure_counts, measure_count)

# Build name_profiling.csv. Columns are given explicitly so that the header is
# still written when no rows were collected.
name_rows = [{"Name": name, "Count": count} for name, count in all_name_counts.items()]
name_df = pd.DataFrame(name_rows, columns=["Name", "Count"])
name_df.to_csv(name_profiling_csv, index=False)

# Build value_profiling.csv (one row per name/value pair)
value_rows = [
    {"Name": name, "Value": value, "Count": count}
    for name, values in all_value_counts.items()
    for value, count in values.items()
]
value_df = pd.DataFrame(value_rows, columns=["Name", "Value", "Count"])
value_df.to_csv(value_profiling_csv, index=False)

# Build metrics_profiling.csv
metric_rows = [{"Measure": measure, "Count": count} for measure, count in all_measure_counts.items()]
metric_df = pd.DataFrame(metric_rows, columns=["Measure", "Count"])
metric_df.to_csv(metrics_profiling_csv, index=False)

print(f"File salvati: {name_profiling_csv}, {value_profiling_csv}, {metrics_profiling_csv}")

File salvati: /content/drive/My Drive/Ingegneria-dei-dati/distributions/name_profiling.csv, /content/drive/My Drive/Ingegneria-dei-dati/distributions/value_profiling.csv, /content/drive/My Drive/Ingegneria-dei-dati/distributions/metrics_profiling.csv


In [None]:
# PROFILING AFTER ALIGNMENT

# Folder containing the input JSON files
input_folder = "/content/drive/My Drive/Ingegneria-dei-dati/alligned_claims"
# Folder where the distribution files are saved
distributions_folder = "/content/drive/My Drive/Ingegneria-dei-dati/alligned_distributions"
# Create the output folder if it does not exist; without this the to_csv calls
# below fail when the folder has not been created beforehand.
os.makedirs(distributions_folder, exist_ok=True)

# Output file paths
name_profiling_csv = os.path.join(distributions_folder, "alligned_name_profiling.csv")
value_profiling_csv = os.path.join(distributions_folder, "alligned_value_profiling.csv")
metrics_profiling_csv = os.path.join(distributions_folder, "alligned_metrics_profiling.csv")

# Accumulators for the results of all JSON files
all_name_counts = defaultdict(int)
all_value_counts = defaultdict(lambda: defaultdict(int))
all_measure_counts = defaultdict(int)

# Iterate over the JSON files in the folder. The listing is sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make the
# row order of the generated CSV files vary from run to run.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".json"):
        continue
    file_path = os.path.join(input_folder, file_name)
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    name_count, value_count, measure_count = calculate_distributions(data)

    # Merge the per-file results into the global distributions
    aggregate_dicts(all_name_counts, name_count)
    aggregate_nested_dicts(all_value_counts, value_count)
    aggregate_dicts(all_measure_counts, measure_count)

# Build alligned_name_profiling.csv. Columns are given explicitly so that the
# header is still written when no rows were collected.
name_rows = [{"Name": name, "Count": count} for name, count in all_name_counts.items()]
name_df = pd.DataFrame(name_rows, columns=["Name", "Count"])
name_df.to_csv(name_profiling_csv, index=False)

# Build alligned_value_profiling.csv (one row per name/value pair)
value_rows = [
    {"Name": name, "Value": value, "Count": count}
    for name, values in all_value_counts.items()
    for value, count in values.items()
]
value_df = pd.DataFrame(value_rows, columns=["Name", "Value", "Count"])
value_df.to_csv(value_profiling_csv, index=False)

# Build alligned_metrics_profiling.csv
metric_rows = [{"Measure": measure, "Count": count} for measure, count in all_measure_counts.items()]
metric_df = pd.DataFrame(metric_rows, columns=["Measure", "Count"])
metric_df.to_csv(metrics_profiling_csv, index=False)

print(f"File salvati: {name_profiling_csv}, {value_profiling_csv}, {metrics_profiling_csv}")


