In [1]:
import os
import json
import csv
import re
from collections import Counter, defaultdict

In [2]:
def sanitize_excel_formulas(value):
    """
    Neutralize spreadsheet formula injection for a single CSV cell value.

    A string whose first character can make a spreadsheet application
    interpret the cell as a formula is returned with an apostrophe
    prepended, which forces the cell to be treated as plain text.
    The trigger characters are ``=``, ``+``, ``-``, ``@`` and, in addition,
    a leading tab or carriage return.

    Args:
        value: The cell value about to be written. May be of any type.

    Returns:
        ``"'" + value`` when ``value`` is a string starting with a trigger
        character; otherwise ``value`` unchanged (including non-strings and
        the empty string).
    """
    # Tab and carriage return are included because a payload such as
    # "\t=cmd" would otherwise slip past a check on "=+-@" only.
    formula_triggers = ("=", "+", "-", "@", "\t", "\r")
    if isinstance(value, str) and value.startswith(formula_triggers):
        return "'" + value  # Leading apostrophe marks the cell as text
    return value

def _profiling_key(raw):
    """Return `raw` if it can be used as a Counter key, else a stable JSON string of it."""
    try:
        hash(raw)
    except TypeError:
        # Lists/dicts coming from JSON are unhashable; serialize them
        # deterministically so they can still be counted.
        return json.dumps(raw, ensure_ascii=False, sort_keys=True)
    return raw


def profile_claims(folder_path, output_csv="NAME_PROFILING.csv"):
    """
    Profile the JSON claim files in a folder and write the result as a CSV.

    Every ``*.json`` file directly inside ``folder_path`` is loaded. For each
    entry of the top-level JSON object the function counts:
      - each ``name`` found in the entry's ``specifications``
      - each ``value`` seen for every ``name``
      - the entry's ``Measure``
    The counts are written to ``output_csv`` as a ';'-delimited, UTF-8 CSV
    with two columns (KEY, COUNT), split into three sections.

    Args:
        folder_path: Directory containing the JSON files to profile.
        output_csv: Path of the CSV report to create. Missing parent
            directories are created.

    Returns:
        None. The report is written to disk and a confirmation is printed.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be
            read/written.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """

    # Counters
    name_counts = Counter()
    value_counts = defaultdict(Counter)  # "name" -> Counter of its "value"s
    measure_counts = Counter()

    # Walk the .json files in sorted order so the row order of the report
    # is reproducible (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Only a top-level JSON object can be profiled; skip anything else.
        if not isinstance(data, dict):
            continue

        for obj_value in data.values():
            if not isinstance(obj_value, dict):
                continue

            # "specifications" may be absent or explicitly null.
            specs = obj_value.get("specifications") or {}
            spec_items = specs.values() if isinstance(specs, dict) else ()

            for spec_item in spec_items:
                if not isinstance(spec_item, dict):
                    continue

                name = spec_item.get("name")
                val = spec_item.get("value")

                # Falsy names/values (None, "", 0, ...) are not counted.
                if name:
                    name = _profiling_key(name)
                    name_counts[name] += 1
                    if val:
                        value_counts[name][_profiling_key(val)] += 1

            # Count the entry's Measure
            measure = obj_value.get("Measure")
            if measure:
                measure_counts[_profiling_key(measure)] += 1

    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the CSV.
    # delimiter=';' makes Excel split the columns correctly under an Italian locale.
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=';')

        # Header
        writer.writerow(["KEY", "COUNT"])

        # 1) Distribution of "name"
        writer.writerow([sanitize_excel_formulas("-- Distribution of 'name' --"), ""])
        for nm, nm_count in name_counts.items():
            # Both the key and the count go through the sanitizer
            writer.writerow([sanitize_excel_formulas(nm), sanitize_excel_formulas(str(nm_count))])
        writer.writerow([])  # Blank separator row

        # 2) Distribution of "value" for each "name"
        writer.writerow([sanitize_excel_formulas("-- Distribution of 'value' for each 'name' --"), ""])
        for nm, val_counter in value_counts.items():
            # Sub-header row for each "name"
            writer.writerow([sanitize_excel_formulas(f"(name) {nm}"), ""])
            for val, val_count in val_counter.items():
                writer.writerow([sanitize_excel_formulas(val), sanitize_excel_formulas(str(val_count))])
            writer.writerow([])

        # 3) Distribution of "Measure"
        writer.writerow([sanitize_excel_formulas("-- Distribution of 'Measure' --"), ""])
        for measure_key, measure_count in measure_counts.items():
            writer.writerow([sanitize_excel_formulas(measure_key), sanitize_excel_formulas(str(measure_count))])

    print(f"Profiling completato! File salvato come: {output_csv}")

In [4]:
# Example usage: profile the JSON files in the input folder and write the
# KEY/COUNT report to the output CSV path.
cartella_json = "../data/Gemini_claims/json"  # folder scanned for .json files
output_file = "../data/profiling/DATA_HUNTERS_PROFILING.csv"  # destination CSV report
profile_claims(cartella_json, output_file)

Profiling completato! File salvato come: ../data/profiling/DATA_HUNTERS_PROFILING.csv
