In [15]:
import json
import os
import re
from io import StringIO

import lxml
import matplotlib
import pandas as pd
from bs4 import BeautifulSoup

Analizziamo le statistiche dei JSON sorgente di 10 paper scelti casualmente: il numero di tabelle per paper e, per ogni tabella, il numero di references e di footnotes. Ogni tabella è identificata dal nome del paper (filename del JSON) più l'id della tabella, così da avere una panoramica generale iniziale.

In [43]:
# Path to the folder containing the source JSON files
folder_path = 'raw'

# One record per table: file name, table id, reference and footnote counts
data = []

# Number of tables found in each file
tables_per_file = {}

# os.listdir returns entries in arbitrary order; sorting keeps the printed
# summary and the CSV rows reproducible across runs and platforms.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, filename)

    # Each file is read as a mapping: table id -> table description
    with open(file_path, 'r', encoding='utf-8') as file:
        content = json.load(file)

    tables_per_file[filename] = len(content)

    for table_id, table_content in content.items():
        data.append({
            'File': filename,
            'Table ID': table_id,
            # `or []` also covers keys that are present but set to null
            'Number of References': len(table_content.get('references') or []),
            'Number of Footnotes': len(table_content.get('footnotes') or []),
        })

# Explicit columns keep the CSV header even when no table was found
df = pd.DataFrame(
    data,
    columns=['File', 'Table ID', 'Number of References', 'Number of Footnotes'],
)

# Show the table count per file
print("\nNumber of tables per file:")
for file, count in tables_per_file.items():
    print(f"{file}: {count}")

# Print the overall number of tables
total_tables = sum(tables_per_file.values())
print("\nTotal number of tables:", total_tables)

# Optional: persist the per-table summary as CSV
df.to_csv('summary_of_tables.csv', index=False)


Number of tables per file:
2403.19709.json: 5
2404.16905.json: 3
2404.18739.json: 5
2405.13001.json: 4
2406.07914.json: 3
2407.02052.json: 4
2408.13739.json: 6
2409.02041.json: 3
2409.09785.json: 4
2409.11252.json: 3

Total number of tables: 40


Possiamo notare che i 10 paper selezionati contengono, in totale, ben 40 tabelle.


In [44]:
# External map used to categorise table column headers.
# The extract_* functions test `header in column_map[...]`, so a header only
# counts when it matches one of these strings exactly; classify_table instead
# looks for each term as a case-insensitive substring of caption/references.
# NOTE(review): some labels are listed more than once (e.g. "WER", "Model") and
# some appear in both lists (e.g. "Mean", "SR%↑↑\\uparrow"); a header found in
# both lists is used both as a specification and as a measure by
# extract_relational_claims. Confirm whether that overlap is intended.
COLUMN_MAP = {
    # Headers whose cell values are reported as the "Outcome" of a claim.
    "measure_terms": [
        "Baseline accuracy (GPT3.5-turbo)",
        "VS",
        "VS w. PN",
        "Baseline accuracy (traditional)",
        "Accuracy",
        "Mean",
        "Median",
        "SD",
        "F1",
        "Precision",
        "Recall",
        "w-avg F1",
        "Test Pos.F1*",
        "Test Pos.F1",
        "Eval Pos.F1**",
        "Δd∘\u200b(MAE)↓↓superscriptsubscriptΔ𝑑(MAE)absent\\Delta_{d}^{\\circ}\\text{(MAE)}\\downarrow",
        "Δd∘\u200b(MAE)↓↓\\Delta_{d}^{\\circ}\\text{(MAE)}\\downarrow",
        "Δa∘↓",
        "Δe∘↓",
        "Δd∘(MAE)",
        "WER%↓↓\\downarrow",
        "sWER%↓↓\\downarrow",
        "SR%↑↑\\uparrow",
        "CER%",
        "DER%",
        "cpCER%",
        "MOS-N (CI)",
        "MOS-C (CI)",
        "BLEU",
        "ROUGE-L",
        "METEOR",
        "BERTScore",
        "WER",
        "Pitch mean",
        "Pitch STD",
        "Energy mean",
        "Energy STD",
        "HTN ratio",
        "Speaker similarity",
        "RTF",
        "Delay",
        "TcpWER (%)",
         "WER (%)",  
        "cpWER (%)", 
        "Unweighted Accuracy (%)",
        "WER",  
        "Read Speech (ARL) WER", 
        "Read Speech (CSaLT) WER",
        "Conversational Speech WER"
    ],
    # Headers whose cell values are collected into a claim's "specifications".
    # NOTE(review): "10.43" and "9.61" look like cell values rather than header
    # labels — confirm they belong here.
    "specification_terms": [
        "TED talk",
        "BBC audio",
        "Number of Speakers",
        "WSJ",
        "Schnauzer",
        "whisper-medium",
        "Light GRU",
        "North West Regions",
        "test",
        "Wav2Vec2 (from scratch)",
        "Source",
        "CH-N",
        "North Sindh",
        "mms-1b",
        "L-S2",
        "Telephone",
        "Category",
        "Family",
        "# Params.",
        "whisper-large",
        "Chihuahua",
        "FFN Head HRA",
        "w/ IV (``Before'') & ST.",
        "Source Transcript",
        "French Poodle",
        "seamless-large",
        "Female",
        "Duration (sec)",
        "USM Basemodel",
        "Task-2 Baseline",
        "Male",
        "Accented English",
        "whisper-small",
        "System",
        "South Punjab",
        "Whisper-1.5B (first-pass) w/o LM",
        "Wav2Vec2 (pre-trained)",
        "Paired T-Test",
        "Number of samples (four emotions)",
        "Business news",
        "Airline info.",
        "Training set",
        "Full Fine-tuning",
        "Mean",
        "Test set",
        "L-S1",
        "Spatial LibriSpeech [22]",
        "Domain",
        "Capital Area",
        "Data",
        "Total",
        "Model",
        "Audiobooks",
        "SR%↑↑\\uparrow",
        "North Punjab",
        "mms-300m",
        "BitFit",
        "GR-N",
        "Residual Adapters",
        "w/ IV",
        "Noise",
        "South Sindh",
        "Controller variant",
        "- Weight unshared",
        "Linear Head HRA",
        "- Recurrent state",
        "Tedlium-3",
        "LoRA",
        "Baluchistan",
        "w/ IV (``Before'')",
        "Linear Head HRA (ours)",
        "Model variant",
        "w/ IV (``After'')",
        "Context",
        "seamless-medium",
        "train",
        "IndRNN",
        "Number of samples (all)",
        "LRS2",
        "Accent",
        "dev",
        "10.43",
        "whisper-base",
        "ATIS",
        "FFN Head HRA (ours)",
        "CORAAL",
        "Method",
        "eval",
        "9.61",
        "LibriSpeech",
        "CommonVoice",
        "whisper-tiny",
        "RNN",
        "CHiME4",
        "Majority",
        "Acc.",
        "F-1 measure",
        "N𝑁N-best Oracle",
        "Interview",
        "w/o IV",
        "SwitchBoard",
        "# segments",
         "Iteration",
        "Data(hrs)",
        "wavLM+conformer ED",
        "conv2d+ebranchformer ED",
        "Fusion",
        "ID",
        "Model based on Accent-ASR",
        "Dev",
        "Eval",
        "Model",
        "Metric",
        "Corpus", 
        "Sample Scale", 
        "Sys-1 RTTM", 
        "Sys-2 RTTM", 
        "Sys-3 RTTM", 
        "Sys-4 RTTM", 
        "NSD", 
        "Re-clustering", 
        "JDS",
        "train", 
        "test", 
        "Domain", 
        "Source", 
        "Training Set", 
        "Test Set", "# Pairs", "Length", 
        "System", "dev", 
        "eval",  
        "Number of samples (all)", 
        "Number of samples (four emotions)",  
        "Baseline accuracy (GPT3.5-turbo)", 
        "Baseline accuracy (traditional)",
        "Family", 
        "Model",  
        "Accent", 
        "Number of Speakers",  
        "Read Speech (ARL)", 
        "Read Speech (CSaLT)", "Conversational Speech",
        "Base Model", 
        "Fine-tuned"  
    ]
}

La funzione classify_table classifica una tabella HTML in base alla sua struttura (numero di righe, colonne, e intestazioni multiple) e al contesto fornito da caption e references.

La logica è:

Analisi della struttura della tabella:

- Determina il numero di righe e colonne.
- Verifica la presenza di intestazioni nidificate (rowspan, colspan) per identificare tabelle complesse.

Analisi del contesto testuale:

- Controlla se i termini di misura (es. "Accuracy") o specifica (es. "Model") sono menzionati nel caption o nelle references.

Classificazione:

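- Se la tabella ha più righe e più colonne e almeno un'intestazione (`th`) con `rowspan` o `colspan` → "Nested Relational". Il ramo "Cross-Nested" è previsto nel codice ma, con i controlli attuali, non viene mai raggiunto.
- Se la tabella ha più righe e più colonne, nessuna intestazione multipla, e nel contesto (caption/references) compaiono termini di misura ma nessun termine di specifica → "Cross-table".
- Tutte le altre tabelle semplici con più righe e più colonne → "Relational".
- Tabelle con più righe e una sola colonna → "Nested Relational"; tabelle con una sola riga e più colonne → "Cross-table".
- Altri casi, tabella assente o errore → "Unknown".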

In [81]:
def classify_table(table_html, caption, references):
    """
    Classify a table from its HTML structure and its textual context.

    Args:
        table_html: HTML markup expected to contain a ``<table>`` element.
        caption: Caption text; ``None`` or empty counts as no text.
        references: Iterable of strings mentioning the table; ``None`` counts
            as empty.

    Returns:
        One of "Relational", "Nested Relational", "Cross-table",
        "Cross-Nested" or "Unknown". "Unknown" is also returned when no
        ``<table>`` is found or when any exception is raised (the error is
        printed, not propagated).
    """
    try:
        table = BeautifulSoup(table_html, 'html.parser').find('table')
        if not table:
            return "Unknown"

        rows = table.find_all('tr')
        num_rows = len(rows)
        # The column count is taken from the first row only.
        num_cols = len(rows[0].find_all(['th', 'td'])) if rows else 0

        # A header cell with a non-empty rowspan/colspan marks a multi-level header.
        header_cells = table.find_all('th')
        has_multi_header = any(
            cell.get('rowspan') or cell.get('colspan') for cell in header_cells
        )

        # Known terms are searched as case-insensitive substrings of the context.
        context = ((caption or "") + " " + " ".join(references or [])).lower()
        mentions_measure = any(
            term.lower() in context for term in COLUMN_MAP["measure_terms"]
        )
        mentions_specification = any(
            term.lower() in context for term in COLUMN_MAP["specification_terms"]
        )

        multi_row = num_rows > 1
        multi_col = num_cols > 1

        if multi_row and multi_col:
            if not has_multi_header:
                # Plain grid: the context decides between the two flat types.
                if mentions_measure and not mentions_specification:
                    return "Cross-table"
                return "Relational"
            # NOTE(review): has_multi_header already implies that some header
            # carries a span attribute, so this check always succeeds and the
            # "Cross-Nested" return below is never reached.
            declares_span = any(
                'rowspan' in cell.attrs or 'colspan' in cell.attrs
                for cell in header_cells
            )
            if declares_span:
                return "Nested Relational"
            return "Cross-Nested"
        if multi_row and num_cols == 1:
            # Several rows, a single column
            return "Nested Relational"
        if num_rows == 1 and multi_col:
            # A single row, several columns
            return "Cross-table"
        return "Unknown"
    except Exception as e:
        print(f"Errore durante la classificazione della tabella: {e}")
        return "Unknown"


In [82]:
from bs4 import BeautifulSoup

def deduce_measure(caption, references):
    """
    Deduce the measure name from the caption and the references.

    The known measure names are tried in a fixed order and the first one found
    in the combined text is returned.

    Matching rules:
      * Acronyms (all-uppercase names such as "WER" or "BLEU") match either
        case-sensitively anywhere (so "cpWER" or "WERs" still count) or
        case-insensitively as a whole word. This stops "WER" from firing on
        ordinary words such as "however", "lower" or "answer".
      * Every other name matches as a case-insensitive substring.

    Args:
        caption: Caption text; ``None`` or empty counts as no text.
        references: Iterable of strings; ``None`` counts as empty.

    Returns:
        The matched measure name, or "Unknown Measure" when none is found.
    """
    measure_terms = ["Accuracy", "Precision", "Recall", "F1-Score", "WER", "BLEU"]
    combined_text = (caption or "") + " " + " ".join(references or [])
    lowered_text = combined_text.lower()

    for term in measure_terms:
        if term.isupper():
            whole_word = re.search(
                r"(?<![a-z0-9])" + re.escape(term.lower()) + r"(?![a-z0-9])",
                lowered_text,
            )
            if term in combined_text or whole_word:
                return term
        elif term.lower() in lowered_text:
            return term
    return "Unknown Measure"

def debug_table_content(headers, cells, row_idx, table_id, file_name, caption=None):
    """
    Print debug information about a table row that does not fit its headers.

    Args:
        headers: Header labels of the table.
        cells: Cell texts of the offending row.
        row_idx: Index of the row among the data rows.
        table_id: Identifier of the table, as supplied by the caller.
        file_name: Name of the file the table comes from.
        caption: Optional caption of the table. The "Caption:" line is printed
            only when a caption is supplied; it used to print ``table_id``
            under that label, which was misleading.
    """
    print(f"DEBUG: Inconsistent row in table {table_id} of file {file_name}")
    print(f"Row index: {row_idx}")
    print(f"Headers ({len(headers)}): {headers}")
    print(f"Cells ({len(cells)}): {cells}")
    if caption is not None:
        print(f"Caption: {caption}")

def handle_nested_headers(rows):
    """
    Merge the first two table rows into one flat list of column headers.

    Cells are placed on a column grid that honours ``colspan`` and ``rowspan``,
    so a label spanning several columns is repeated for each of them and a
    label spanning both rows shifts the second-row cells to the right
    columns. Pairing the two rows by plain cell index, as was done before,
    misaligned the merged headers whenever a span was present. For rows
    without spans the result is unchanged.

    Args:
        rows: Sequence of row elements; each must provide
            ``find_all(['th', 'td'])`` returning cells that offer
            ``get_text(strip=True)`` and ``get(attribute)``.

    Returns:
        One header string per column: the non-empty labels of the two rows
        joined by a single space. Empty when ``rows`` is empty.
    """
    def _span(cell, attribute):
        # Missing, empty or malformed span attributes count as a span of 1.
        try:
            return max(int(cell.get(attribute) or 1), 1)
        except (TypeError, ValueError):
            return 1

    header_rows = rows[:2]
    # (row index, column index) -> label. A cell spanning both rows stores its
    # label in its first row only; the slot below holds "" so the label is not
    # repeated while the column is still marked as taken.
    grid = {}
    for row_idx, row in enumerate(header_rows):
        col_idx = 0
        for cell in row.find_all(['th', 'td']):
            # Skip columns already taken by a rowspan from the row above.
            while (row_idx, col_idx) in grid:
                col_idx += 1
            label = cell.get_text(strip=True)
            width = _span(cell, 'colspan')
            last_row = min(row_idx + _span(cell, 'rowspan'), len(header_rows))
            for r in range(row_idx, last_row):
                for c in range(col_idx, col_idx + width):
                    grid.setdefault((r, c), label if r == row_idx else "")
            col_idx += width

    num_cols = 1 + max((c for _, c in grid), default=-1)
    merged = []
    for col in range(num_cols):
        parts = [grid.get((r, col), "") for r in range(len(header_rows))]
        merged.append(" ".join(part for part in parts if part))
    return merged


def extract_relational_claims(table_html, column_map):
    """
    Extract claims from a simple relational table.

    The first row supplies the column headers. For every following row, each
    column whose header is listed in ``column_map["measure_terms"]`` yields
    one claim; the columns whose header is listed in
    ``column_map["specification_terms"]`` supply that claim's specifications.

    Args:
        table_html: HTML markup expected to contain a ``<table>`` element.
        column_map: Mapping with the keys "specification_terms" and
            "measure_terms", each a collection of header labels.

    Returns:
        A list of claim dicts with the keys "specifications", "Measure" and
        "Outcome". When no claim can be built (including a missing table or a
        table without rows) a single placeholder claim is returned.
    """
    no_valid_data = [{
        "specifications": {},
        "Measure": "N/A",
        "Outcome": "No valid data"
    }]

    table = BeautifulSoup(table_html, 'html.parser').find('table')
    rows = table.find_all('tr') if table is not None else []
    if not rows:
        return no_valid_data

    headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]
    specification_terms = column_map["specification_terms"]
    measure_terms = column_map["measure_terms"]

    claims = []
    for row_idx, row in enumerate(rows[1:]):  # skip the header row
        # Read <th> as well as <td>: the headers are collected from both, so a
        # data row that starts with a <th> row label would otherwise be
        # shifted one column to the left.
        cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]

        if len(cells) > len(headers):
            # More cells than headers: report the row and skip it.
            debug_table_content(headers, cells, row_idx, "Relational", "N/A")
            continue
        # Short rows are padded with empty strings rather than skipped.
        cells += [""] * (len(headers) - len(cells))

        specifications = {}
        for header, value in zip(headers, cells):
            if header in specification_terms:
                specifications[str(len(specifications))] = {"name": header, "value": value}

        for header, value in zip(headers, cells):
            if header in measure_terms:
                claims.append({
                    # Copy so that claims of one row do not share one dict.
                    "specifications": dict(specifications),
                    "Measure": header,
                    "Outcome": value
                })

    return claims or no_valid_data

def extract_nested_relational_claims(table_html, caption, references, column_map):
    """
    Extract claims from a relational table with nested (two-row) headers.

    The first two rows are merged into the column headers by
    ``handle_nested_headers``; every following row is a data row. Each column
    whose merged header is listed in ``column_map["measure_terms"]`` yields a
    claim. The claim's "Measure" is the one deduced from caption/references
    by ``deduce_measure``, not the column header.

    Args:
        table_html: HTML markup expected to contain a ``<table>`` element.
        caption: Caption text of the table.
        references: Strings mentioning the table.
        column_map: Mapping with the keys "specification_terms" and
            "measure_terms", each a collection of header labels.

    Returns:
        A list of claim dicts with the keys "specifications", "Measure" and
        "Outcome". When no claim can be built (including a missing table) a
        single placeholder claim carrying caption and references is returned.
    """
    no_valid_data = [{
        "specifications": {"caption": caption, "references": references},
        "Measure": "N/A",
        "Outcome": "No valid data"
    }]

    table = BeautifulSoup(table_html, 'html.parser').find('table')
    if table is None:
        return no_valid_data
    rows = table.find_all('tr')

    # Deduce the measure dynamically
    measure = deduce_measure(caption, references)

    # Merge the nested header rows into one header per column
    headers = handle_nested_headers(rows)
    specification_terms = column_map["specification_terms"]
    measure_terms = column_map["measure_terms"]

    claims = []
    for row_idx, row in enumerate(rows[2:]):  # skip the two header rows
        # Read <th> as well as <td>: the headers are collected from both, so a
        # data row that starts with a <th> row label would otherwise be
        # shifted one column to the left.
        cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]

        if len(cells) > len(headers):
            # More cells than headers: report the row and skip it.
            debug_table_content(headers, cells, row_idx, "Nested Relational", "N/A")
            continue
        # Short rows are padded with empty strings rather than skipped.
        cells += [""] * (len(headers) - len(cells))

        specifications = {}
        for header, value in zip(headers, cells):
            if header in specification_terms:
                specifications[str(len(specifications))] = {"name": header, "value": value}

        for header, value in zip(headers, cells):
            if header in measure_terms:
                claims.append({
                    # Copy so that claims of one row do not share one dict.
                    "specifications": dict(specifications),
                    "Measure": measure,
                    "Outcome": value
                })

    return claims or no_valid_data


def extract_cross_table_claims(table_html, caption, references, column_map):
    """
    Extract claims from a cross-table.

    The first row (minus its first cell) supplies the column headers and the
    first cell of every other row supplies the row header. Each remaining
    non-empty cell yields one claim whose specifications are the row header
    ("Method Name") and the column header ("Dataset Name"). The measure is
    deduced from caption/references by ``deduce_measure``.

    Args:
        table_html: HTML markup expected to contain a ``<table>`` element.
        caption: Caption text of the table.
        references: Strings mentioning the table.
        column_map: Not used by this function; kept so that all extractors
            share one signature.

    Returns:
        A list of claim dicts with the keys "specifications", "Measure" and
        "Outcome". When no claim can be built (including a missing table or a
        table without rows) a single placeholder claim carrying caption and
        references is returned.
    """
    no_valid_data = [{
        "specifications": {"caption": caption, "references": references},
        "Measure": "N/A",
        "Outcome": "No valid data"
    }]

    table = BeautifulSoup(table_html, 'html.parser').find('table')
    rows = table.find_all('tr') if table is not None else []
    if not rows:
        return no_valid_data

    # Deduce the measure dynamically
    measure = deduce_measure(caption, references)

    # Column headers: everything after the corner cell of the first row
    col_headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])[1:]]

    claims = []
    for row in rows[1:]:
        # Take <th> and <td> together so that dropping the first cell always
        # removes the row header, whichever tag it uses. Selecting only <td>
        # and then dropping one lost the first value of rows whose header is
        # a <th>.
        cells = row.find_all(['th', 'td'])
        if not cells:
            continue  # a row without cells has neither header nor values
        row_header = cells[0].get_text(strip=True)

        # zip stops at the shorter side, so cells without a column header are
        # ignored instead of raising IndexError.
        for col_header, cell in zip(col_headers, cells[1:]):
            value = cell.get_text(strip=True)
            if value:
                claims.append({
                    "specifications": {
                        "0": {"name": "Method Name", "value": row_header},
                        "1": {"name": "Dataset Name", "value": col_header}
                    },
                    "Measure": measure,
                    "Outcome": value
                })

    return claims or no_valid_data


def extract_cross_nested_table_claims(table_html, caption, references, column_map):
    """
    Extract claims from a cross-table with nested (two-row) column headers.

    The first two rows (minus their first cell) are paired cell by cell and
    joined with a space to form the column headers. In every following row the
    first cell is the row header and each remaining non-empty cell yields one
    claim whose specifications are the row header ("Method Name") and the
    column header ("Dataset Name"). The measure is deduced from
    caption/references by ``deduce_measure``.

    NOTE(review): the two header rows are paired by cell index, so header
    cells using colspan/rowspan are presumably not aligned with the data
    columns — confirm against real tables.

    Args:
        table_html: HTML markup expected to contain a ``<table>`` element.
        caption: Caption text of the table.
        references: Strings mentioning the table.
        column_map: Not used by this function; kept so that all extractors
            share one signature.

    Returns:
        A list of claim dicts with the keys "specifications", "Measure" and
        "Outcome". When no claim can be built (missing table, fewer than three
        rows, or no non-empty value) a single placeholder claim carrying
        caption and references is returned.
    """
    no_valid_data = [{
        "specifications": {"caption": caption, "references": references},
        "Measure": "N/A",
        "Outcome": "No valid data"
    }]

    table = BeautifulSoup(table_html, 'html.parser').find('table')
    rows = table.find_all('tr') if table is not None else []

    if len(rows) < 3:  # two header rows plus at least one data row are needed
        print("ERROR: Not enough rows in the table for cross-nested claims.")
        return no_valid_data

    # Deduce the measure dynamically
    measure = deduce_measure(caption, references)

    # Nested column headers: pair the cells of the two header rows by position
    header_cells = [row.find_all(['th', 'td'])[1:] for row in rows[:2]]
    col_headers = [
        " ".join(cell.get_text(strip=True) for cell in column)
        for column in zip(*header_cells)
    ]

    claims = []
    for row_idx, row in enumerate(rows[2:]):
        # Take <th> and <td> together so that dropping the first cell always
        # removes the row header, whichever tag it uses.
        cells = row.find_all(['th', 'td'])
        if not cells:
            continue  # a row without cells has neither header nor values
        # The header is read from the row itself, so a skipped row cannot
        # shift the headers of the rows that follow it.
        row_header = cells[0].get_text(strip=True)

        for col_idx, cell in enumerate(cells[1:]):
            value = cell.get_text(strip=True)
            if not value:
                continue
            if col_idx >= len(col_headers):
                # Value without a matching column header: report it and go on.
                print(f"ERROR: Issue accessing cell at row {row_idx}, col {col_idx}: list index out of range")
                continue
            claims.append({
                "specifications": {
                    "0": {"name": "Method Name", "value": row_header},
                    "1": {"name": "Dataset Name", "value": col_headers[col_idx]}
                },
                "Measure": measure,
                "Outcome": value
            })

    return claims or no_valid_data

In [84]:
def extract_claims_based_on_table_type(table_html, caption, references, column_map):
    """
    Classify the table and run the extractor that matches its type.

    Args:
        table_html: HTML markup of the table.
        caption: Caption text of the table.
        references: Strings mentioning the table.
        column_map: Mapping of known header labels, passed on to the extractor.

    Returns:
        The list produced by the selected extractor, or an empty list (after
        printing a notice) when the table type has no extractor.
    """
    # Each entry is wrapped in a lambda so that only the selected extractor
    # is looked up and called.
    extractors = {
        "Relational": lambda: extract_relational_claims(table_html, column_map),
        "Nested Relational": lambda: extract_nested_relational_claims(
            table_html, caption, references, column_map
        ),
        "Cross-table": lambda: extract_cross_table_claims(
            table_html, caption, references, column_map
        ),
        "Cross-Nested": lambda: extract_cross_nested_table_claims(
            table_html, caption, references, column_map
        ),
    }

    run_extractor = extractors.get(classify_table(table_html, caption, references))
    if run_extractor is None:
        print("Unknown table type.")
        return []
    return run_extractor()

def process_json_files(input_folder, output_folder, column_map):
    """
    Process the tables of every JSON file in a folder and save their claims.

    Each ``*.json`` file is read as a mapping whose values describe one table
    through the keys "table" (HTML), "caption" and "references". Every table
    is classified, its claims are extracted and, when there is at least one
    claim, they are written to
    ``<output_folder>/<file name without extension>_<n>_claims.json``, where
    ``n`` is the 1-based position of the table inside its file. The output
    JSON has the keys "typeTable" and "claims" (claims keyed by their index
    as a string).

    Files are handled in sorted name order: ``os.listdir`` gives no ordering
    guarantee, and sorting keeps the progress log reproducible.

    Args:
        input_folder: Folder containing the source JSON files.
        output_folder: Folder for the claim files; created when missing.
        column_map: Mapping of known header labels, passed on to the extractor.
    """
    # exist_ok avoids the separate existence check
    os.makedirs(output_folder, exist_ok=True)

    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith('.json'):
            continue

        with open(os.path.join(input_folder, file_name), 'r', encoding='utf-8') as f:
            content = json.load(f)

        base_name = os.path.splitext(file_name)[0]
        for table_id, table_data in enumerate(content.values(), start=1):
            table_html = table_data.get("table", "")
            caption = table_data.get("caption", "")
            references = table_data.get("references", [])

            # Classify the table (the type is logged and stored with the claims)
            table_type = classify_table(table_html, caption, references)
            print(f"Processing table {table_id} in file {file_name}")
            print(f"Table type: {table_type}")

            # Extract the claims
            claims = extract_claims_based_on_table_type(
                table_html, caption, references, column_map
            )

            # Debug: show how many claims were generated
            print(f"Number of claims generated: {len(claims)}")
            if not claims:
                print(f"WARNING: No claims generated for table {table_id} in file {file_name}")
                print(f"Caption: {caption}")
                print(f"References: {references}")
                continue  # nothing to save for this table

            output_file = os.path.join(output_folder, f"{base_name}_{table_id}_claims.json")
            with open(output_file, 'w', encoding='utf-8') as out_f:
                # The table type is stored as the first node of the output JSON
                output_data = {
                    "typeTable": table_type,
                    "claims": {str(i): claim for i, claim in enumerate(claims)}
                }
                json.dump(output_data, out_f, indent=4, ensure_ascii=False)

# Folder with the source JSON files and folder that receives the claim files
input_folder, output_folder = 'raw', 'claims'

# Run the extraction over every JSON file in the input folder
process_json_files(input_folder, output_folder, COLUMN_MAP)

Processing table 1 in file 2403.19709.json
Table type: Nested Relational
Number of claims generated: 1
Processing table 2 in file 2403.19709.json
Table type: Nested Relational
Number of claims generated: 1
Processing table 3 in file 2403.19709.json
Table type: Relational
Number of claims generated: 12
Processing table 4 in file 2403.19709.json
Table type: Relational
Number of claims generated: 6
Processing table 5 in file 2403.19709.json
Table type: Relational
Number of claims generated: 6
Processing table 1 in file 2404.16905.json
Table type: Relational
Number of claims generated: 20
Processing table 2 in file 2404.16905.json
Table type: Relational
Number of claims generated: 16
Processing table 3 in file 2404.16905.json
Table type: Relational
Number of claims generated: 8
Processing table 1 in file 2404.18739.json
Table type: Relational
Number of claims generated: 1
Processing table 2 in file 2404.18739.json
Table type: Relational
Number of claims generated: 3
Processing table 3 in f