In [2]:
import os
import json
import csv
import re
from collections import Counter, defaultdict

In [3]:
def sanitize_excel_formulas(value):
    """
    Neutralize spreadsheet formula injection for a single CSV cell.

    If ``value`` is a string whose first character can make Excel (or a
    similar spreadsheet application) interpret the cell as a formula, an
    apostrophe is prepended so that the cell is treated as plain text.

    The trigger characters are ``=``, ``+``, ``-`` and ``@`` plus, following
    the OWASP CSV-injection guidance, tab and carriage return.

    Args:
        value: The cell content. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        ``"'" + value`` when ``value`` starts with a trigger character,
        otherwise ``value`` itself.
    """
    formula_triggers = ("=", "+", "-", "@", "\t", "\r")
    if isinstance(value, str) and value.startswith(formula_triggers):
        return "'" + value  # leading apostrophe forces a text cell
    return value

def profile_claims(folder_path, output_csv="NAME_PROFILING.csv"):
    """
    Profile all JSON files in ``folder_path`` and write one aggregated CSV.

    Every ``*.json`` file is expected to hold a JSON object whose values are
    objects with an optional ``"specifications"`` mapping (each entry carrying
    ``"name"`` and ``"value"``) and an optional ``"Measure"`` field. Across
    all files the function counts:

      - how often each specification ``name`` occurs,
      - how often each ``value`` occurs for every ``name``,
      - how often each ``Measure`` occurs.

    The result is a ``;``-separated CSV with two columns (KEY, COUNT) made of
    three sections, each introduced by a ``-- Distribution of ... --`` row.

    Args:
        folder_path: Directory containing the JSON files to profile.
        output_csv: Path of the CSV to write. Its parent directory is created
            if it does not exist yet.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        json.JSONDecodeError: If one of the files is not valid JSON.
    """
    name_counts = Counter()
    value_counts = defaultdict(Counter)  # name -> Counter of its values
    measure_counts = Counter()

    # Counters keep insertion order, so the CSV row order follows the order in
    # which files are read. os.listdir() returns entries in arbitrary order;
    # sorting makes the report reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
            data = json.load(f)

        for obj_value in data.values():
            # "specifications" can be absent or explicitly null in the JSON;
            # both cases are treated as "no specifications".
            specs = obj_value.get("specifications") or {}
            for spec_item in specs.values():
                name = spec_item.get("name")
                val = spec_item.get("value")
                if name:
                    name_counts[name] += 1
                    # NOTE(review): falsy values (empty string, 0, False) are
                    # not counted, as in the original code -- confirm that
                    # this is intended.
                    if val:
                        value_counts[name][val] += 1

            measure = obj_value.get("Measure")
            if measure:
                measure_counts[measure] += 1

    # Create the destination folder up front so that the write does not fail
    # with FileNotFoundError after all the parsing work has been done.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # delimiter=';' lets Excel with an Italian locale split the columns.
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=';')

        def write_row(key, count=""):
            # Only the key can carry formula-like text; counts are positive
            # integers and are written as they are.
            writer.writerow([sanitize_excel_formulas(key), count])

        writer.writerow(["KEY", "COUNT"])

        # 1) Distribution of "name"
        write_row("-- Distribution of 'name' --")
        for nm, nm_count in name_counts.items():
            write_row(nm, nm_count)
        writer.writerow([])  # blank separator row

        # 2) Distribution of "value" for each "name"
        write_row("-- Distribution of 'value' for each 'name' --")
        for nm, val_counter in value_counts.items():
            write_row(f"(name) {nm}")  # sub-header for this name
            for val, val_count in val_counter.items():
                write_row(val, val_count)
            writer.writerow([])

        # 3) Distribution of "Measure"
        write_row("-- Distribution of 'Measure' --")
        for measure_key, measure_count in measure_counts.items():
            write_row(measure_key, measure_count)

    print(f"Profiling completato! File salvato come: {output_csv}")

In [8]:
# Example usage: profile every JSON file in the claims folder and write the
# aggregated distributions to a single CSV.
# `cartella_json` is also read by the later per-file and per-paper cells.
cartella_json = "../data/Gemini_claims/json"
output_file = "../data/profiling/DATA_HUNTERS_PROFILING.csv"
profile_claims(cartella_json, output_file)

Profiling completato! File salvato come: ../data/profiling/DATA_HUNTERS_PROFILING.csv


 PER OGNI FILE

In [4]:
def profile_claims_by_file(folder_path, output_csv="NAME_PROFILING_BY_FILE.csv"):
    """
    Profile each JSON file in ``folder_path`` separately, into a single CSV.

    For every ``*.json`` file (processed in sorted name order) the function
    counts the occurrences of each specification ``name``, of each ``value``
    per ``name`` and of each ``Measure``, then appends three sections for that
    file to the CSV. The ``;``-separated output has three columns:

        KEY      - section header, name, value or measure
        COUNT    - number of occurrences (empty on header rows)
        FILENAME - name of the JSON file the row refers to

    Args:
        folder_path: Directory containing the JSON files to profile.
        output_csv: Path of the CSV to write. Its parent directory is created
            if it does not exist yet.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        json.JSONDecodeError: If one of the files is not valid JSON.
    """
    # List the inputs before opening the output: a wrong folder_path then
    # fails here instead of after an existing report has been truncated.
    json_files = sorted(
        entry for entry in os.listdir(folder_path) if entry.endswith(".json")
    )

    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=';')

        def write_row(key, count, source):
            # Key and file name may contain formula-like text; counts are
            # positive integers and are written as they are.
            writer.writerow([
                sanitize_excel_formulas(key),
                count,
                sanitize_excel_formulas(source),
            ])

        writer.writerow(["KEY", "COUNT", "FILENAME"])

        for filename in json_files:
            # Fresh counters for every file
            name_counts = Counter()
            value_counts = defaultdict(Counter)
            measure_counts = Counter()

            with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
                data = json.load(f)

            for obj_value in data.values():
                # "specifications" can be absent or explicitly null.
                specs = obj_value.get("specifications") or {}
                for spec_item in specs.values():
                    nm = spec_item.get("name")
                    val = spec_item.get("value")
                    if nm:
                        name_counts[nm] += 1
                        # NOTE(review): falsy values (empty string, 0, False)
                        # are not counted, as in the original code -- confirm
                        # that this is intended.
                        if val:
                            value_counts[nm][val] += 1

                measure = obj_value.get("Measure")
                if measure:
                    measure_counts[measure] += 1

            # 1) Distribution of "name"
            write_row("-- Distribution of 'name' --", "", filename)
            for nm, nm_count in name_counts.items():
                write_row(nm, nm_count, filename)
            writer.writerow([])  # blank separator row

            # 2) Distribution of "value" for each "name"
            write_row("-- Distribution of 'value' for each 'name' --", "", filename)
            for nm, val_counter in value_counts.items():
                write_row(f"(name) {nm}", "", filename)  # sub-header
                for val, val_count in val_counter.items():
                    write_row(val, val_count, filename)
                writer.writerow([])

            # 3) Distribution of "Measure"
            write_row("-- Distribution of 'Measure' --", "", filename)
            for measure_key, measure_count in measure_counts.items():
                write_row(measure_key, measure_count, filename)
            writer.writerow([])  # blank row before the next file

    print(f"Profiling by file completato! File salvato come: {output_csv}")

In [9]:
# Example usage: per-file profiling of the same input folder.
# Requires `cartella_json` to be already defined by an earlier cell.
output_file = "../data/profiling/DATA_HUNTERS_PROFILING_PER_FILE.csv"
profile_claims_by_file(cartella_json, output_file)

Profiling by file completato! File salvato come: ../data/profiling/DATA_HUNTERS_PROFILING_PER_FILE.csv


PER SINGOLO PAPER

In [5]:
def get_paper_code(filename):
    """
    Derive the 'paper code' from a claims file name.

    The code is the part of ``filename`` that precedes the first underscore
    (the whole name when there is no underscore). If that prefix looks like a
    number -- optional minus sign, digits, optional decimal part -- an
    apostrophe is prepended to it (presumably so that spreadsheets keep it as
    text instead of parsing it as a number; confirm with the CSV consumers).

    Example: '2403.19709_1_claims.json' -> "'2403.19709"
             'survey_1_claims.json'     -> 'survey'
    """
    # partition() splits on the first underscore only.
    prefix, _sep, _rest = filename.partition('_')
    looks_numeric = re.match(r'^-?\d+(\.\d+)?$', prefix) is not None
    return ("'" + prefix) if looks_numeric else prefix


def profile_claims_by_paper(folder_path, output_csv="NAME_PROFILING_BY_PAPER.csv"):
    """
    Profile the JSON files in ``folder_path`` grouped by paper, into one CSV.

    - Reads every ``*.json`` file in ``folder_path`` (in sorted name order).
    - Derives the paper code from the file name via ``get_paper_code``.
    - Sums the distributions of specification ``name``, of ``value`` per
      ``name`` and of ``Measure`` over all files sharing a paper code.
    - Writes a ``;``-separated CSV with columns KEY, COUNT, PAPER containing
      three sections per paper.

    Args:
        folder_path: Directory containing the JSON files to profile.
        output_csv: Path of the CSV to write. Its parent directory is created
            if it does not exist yet.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        json.JSONDecodeError: If one of the files is not valid JSON.
    """
    # paper code -> the three counters accumulated over its files
    paper_dict = defaultdict(lambda: {
        "name_counts": Counter(),
        "value_counts": defaultdict(Counter),  # name -> Counter of values
        "measure_counts": Counter(),
    })

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        counters = paper_dict[get_paper_code(filename)]

        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
            data = json.load(f)

        for obj_value in data.values():
            # "specifications" can be absent or explicitly null.
            specs = obj_value.get("specifications") or {}
            for spec_item in specs.values():
                nm = spec_item.get("name")
                val = spec_item.get("value")
                if nm:
                    counters["name_counts"][nm] += 1
                    # NOTE(review): falsy values (empty string, 0, False) are
                    # not counted, as in the original code -- confirm that
                    # this is intended.
                    if val:
                        counters["value_counts"][nm][val] += 1

            measure = obj_value.get("Measure")
            if measure:
                counters["measure_counts"][measure] += 1

    # Create the destination folder up front so that the write does not fail
    # with FileNotFoundError after all the parsing work has been done.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=';')

        def write_row(key, count, paper):
            # Key and paper code may contain formula-like text; counts are
            # positive integers and are written as they are.
            writer.writerow([
                sanitize_excel_formulas(key),
                count,
                sanitize_excel_formulas(paper),
            ])

        writer.writerow(["KEY", "COUNT", "PAPER"])

        for paper_code, counters in paper_dict.items():
            # 1) Distribution of "name"
            write_row("-- Distribution of 'name' --", "", paper_code)
            for nm, nm_count in counters["name_counts"].items():
                write_row(nm, nm_count, paper_code)
            writer.writerow([])  # blank separator row

            # 2) Distribution of "value" for each "name"
            write_row("-- Distribution of 'value' for each 'name' --", "", paper_code)
            for nm, val_counter in counters["value_counts"].items():
                write_row(f"(name) {nm}", "", paper_code)  # sub-header
                for val, val_count in val_counter.items():
                    write_row(val, val_count, paper_code)
                writer.writerow([])

            # 3) Distribution of "Measure"
            write_row("-- Distribution of 'Measure' --", "", paper_code)
            for measure_key, measure_count in counters["measure_counts"].items():
                write_row(measure_key, measure_count, paper_code)
            writer.writerow([])  # blank row before the next paper

    print(f"Profiling by paper completato! File salvato come: {output_csv}")

In [10]:
# Example usage: per-paper profiling of the same input folder.
# Requires `cartella_json` to be already defined by an earlier cell.
output_file = "../data/profiling/DATA_HUNTERS_PROFILING_PER_PAPER.csv"
profile_claims_by_paper(cartella_json, output_file)

Profiling by paper completato! File salvato come: ../data/profiling/DATA_HUNTERS_PROFILING_PER_PAPER.csv


In [12]:
# Input folder with the aligned claims and output folder for their reports
cartella_json_aligned = "../data/alignment/claims"
output_folder_aligned = "../data/profiling/aligned"

# Run the three profiling functions (aggregated, per file, per paper) on the
# aligned claims, writing one CSV each into the output folder
profile_claims(cartella_json_aligned, os.path.join(output_folder_aligned, "luca_borrelli_rainer_cabral_ilao_aligned_profiling.csv"))
profile_claims_by_file(cartella_json_aligned, os.path.join(output_folder_aligned, "luca_borrelli_rainer_cabral_ilao_aligned_profiling_by_file.csv"))
profile_claims_by_paper(cartella_json_aligned, os.path.join(output_folder_aligned, "luca_borrelli_rainer_cabral_ilao_aligned_profiling_by_paper.csv"))

Profiling completato! File salvato come: ../data/profiling/aligned\luca_borrelli_rainer_cabral_ilao_aligned_profiling.csv
Profiling by file completato! File salvato come: ../data/profiling/aligned\luca_borrelli_rainer_cabral_ilao_aligned_profiling_by_file.csv
Profiling by paper completato! File salvato come: ../data/profiling/aligned\luca_borrelli_rainer_cabral_ilao_aligned_profiling_by_paper.csv
