## Extracción de tablas:

In [1]:
import json
import deepdoctection as dd
import time

config_overwrite = ["USE_OCR=False", "USE_PDF_MINER=True"]
analyzer = dd.get_dd_analyzer(config_overwrite=config_overwrite)
path_pdf = r"pdfs_prueba/HyperKG- Hyperbolic Knowledge Graph Embeddings for Knowledge Base Completion.pdf"

# Pruebas\pdfs_prueba\

start_time = time.time()

df = analyzer.analyze(path=path_pdf)
df.reset_state()

results_data = []

for dp in df:
    print(f"\n--- Processing page {dp.page_number} ---")
    
    if len(dp.tables) > 0:
        print(f"{len(dp.tables)} tables found")
    else:
        print("No tables found")

    table_content = []
    for table in dp.tables:
        table_content.append({
            "csv": table.csv,
            "html": table.html
        })
    
    if table_content:
        page_data = {
            "page": dp.page_number + 1,
            "tables": table_content
        }
        results_data.append(page_data)

end_time = time.time()
total_time = end_time - start_time

print(f"\nTotal time: {total_time:.2f} seconds")

final_output = {
    "file_name": path_pdf,
    "runtime_seconds": round(total_time, 2),
    "total_num_tables": sum(len(p["tables"]) for p in results_data),
    "results": results_data
}

with open("deepdoctection_output.json", "w", encoding="utf-8") as f:
    json.dump(final_output, f, indent=4, ensure_ascii=False)

print("Data saved in 'deepdoctection_output.json'")

  from .autonotebook import tqdm as notebook_tqdm
[32m[0218 12:56.21 @dd.py:119][0m  [32mINF[0m  [97mConfig: 
 {'CELL': {'FILTER': None,
          'PAD': {'BOTTOM': 60, 'LEFT': 60, 'RIGHT': 60, 'TOP': 60},
          'PADDING': False,
          'WEIGHTS': 'cell/d2_model_1849999_cell_inf_only.pt',
          'WEIGHTS_TS': 'cell/d2_model_1849999_cell_inf_only.ts'},
 'DEVICE': device(type='cpu'),
 'ENFORCE_WEIGHTS': {'CELL': True, 'ITEM': True, 'LAYOUT': True},
 'ITEM': {'FILTER': ['table'],
          'PAD': {'BOTTOM': 60, 'LEFT': 60, 'RIGHT': 60, 'TOP': 60},
          'PADDING': False,
          'WEIGHTS': 'deepdoctection/tatr_tab_struct_v2/model.safetensors',
          'WEIGHTS_TS': 'item/d2_model_1639999_item_inf_only.ts'},
 'LANGUAGE': None,
 'LAYOUT': {'FILTER': None,
            'PAD': {'BOTTOM': 0, 'LEFT': 0, 'RIGHT': 0, 'TOP': 0},
            'PADDING': False,
            'WEIGHTS': 'Aryn/deformable-detr-DocLayNet/model.safetensors',
            'WEIGHTS_TS': 'layout/d2_model_0

ImportError: 
DeformableDetrConvEncoder requires the timm library but it was not found in your environment. You can install it with pip:
`pip install timm`. Please note that you may need to restart your runtime after installation.


#### Posibles modelos:

## Extracción de modelos

In [None]:
# Probar con Ollama

## Extracción de datasets:

In [23]:
from pathlib import Path
import json
import re

FOLDER_ROOT = Path.cwd()
TABLES_JSON = FOLDER_ROOT / "deepdoctection_output.json"

DATASET_LIST = ["WN18","Dataset-B1","FB15k", "Dataset_A2", "WD"]
found_datasets = []

with open(TABLES_JSON, "r", encoding="utf-8") as j:
    tables_json = json.load(j)

tables_html = []

for page_data in tables_json['results']:
    for table in page_data['tables']:
        tables_html.append(table['html'])

def search_datasets_in_tables_html(html_list = tables_html):
    possible_datasets =  DATASET_LIST.copy()

    for html in html_list:
        for dataset_name in possible_datasets:
            clean_dataset_name = "".join(filter(str.isalnum, dataset_name))
            regex_pattern = r"[\W_]*".join(re.escape(c) for c in clean_dataset_name)
            final_pattern = r"\b" + regex_pattern
            
            if re.search(final_pattern, html, re.IGNORECASE) and dataset_name not in found_datasets:
                found_datasets.append(dataset_name)

found_datasets = []
search_datasets_in_tables_html()
print(found_datasets)

['WN18', 'FB15k', 'WD']


We evaluate our HyperKG model on the task of KBC using
two sets of experiments. We conduct experiments on the
WN18RR (Dettmers et al. 2018) and FB15k-237 (Toutanova
and Chen 2015) datasets. We also construct two datasets
whose statistical regularities can be expressed as QC rules to
test our model’s performance in their presence. WN18RR and
FB15k-237 constitute refined subsets of WN18 and FB15K
that were introduced by Bordes et al . (2013). Toutanova and
Chen (2015) identified that WN18 and FB15K contained a lot
of reversible relations, enabling, thus, various KB embedding
models to generalise easily. Exploiting this fact, Dettmers et
al . (2018) obtained state-of-the-art results only by using a
simple reversal rule. WN18RR and FB15k-237 were carefully
created to alleviate this leakage of information.

Al leer JSON usar .get si el dato no es obligatorio/puede no estar y [] si el dato es obligatorio, pues el script fallará si no lo encuentra

## Extracción de métricas:

In [None]:
import pandas as pd
import io
import re

METRICS_LIST = ["Accuracy", "MRR", "Hits@1", "Hits@3", "Hits@10", "F1-Score"]

PATRONES = {
    "Accuracy": re.compile(r"\b(?:acc(?:uracy)?|c\.?a\.?)\b", re.IGNORECASE),
    "MRR": re.compile(r"\b(?:mrr|mean\s+reciprocal\s+rank)\b", re.IGNORECASE),
    "F1-Score": re.compile(r"\bf-?1(?:-?(?:score|measure))?\b", re.IGNORECASE),
    
}
REGEX_HITS = re.compile(r"\b(?:hits?|h)(?:\s*@\s*|\s+at\s+|\s*)(\d{1,3})\b", re.IGNORECASE)

def normalizar_texto(texto):
    """Convierte texto sucio ('H@ 10') en métrica canónica ('Hits@10')."""
    texto = str(texto).strip()
    
    
    match = REGEX_HITS.search(texto)
    if match:
        return f"Hits@{match.group(1)}"
    
    
    for nombre_canonico, patron in PATRONES.items():
        if patron.search(texto):
            return nombre_canonico
            
    return None

def detectar_metricas_en_tabla(html_str, lista_objetivo):
    metricas_encontradas = set()
    
    
    try:
        dfs = pd.read_html(io.StringIO(html_str), header=[0, 1, 2])
    except:
        return []

    if not dfs: return []
    df = dfs[0]

    for col_tuple in df.columns:
        texto_cabecera = " ".join([str(x) for x in col_tuple if "Unnamed" not in str(x) and str(x) != "nan"])
        
        metrica = normalizar_texto(texto_cabecera)
        if metrica:
            metricas_encontradas.add(metrica)

    try:
        primera_columna = df.iloc[:, 0].astype(str).tolist()
        for celda in primera_columna:
            metrica = normalizar_texto(celda)
            if metrica:
                metricas_encontradas.add(metrica)
    except:
        pass

    metricas_presentes = list(metricas_encontradas.intersection(set(lista_objetivo)))
    
    return metricas_presentes

html_complejo = """
<table><tr><td rowspan=3>Models</td><td colspan=4>WN18RR</td></tr><tr><td rowspan=2>MRR</td><td colspan=3>Hits@</td></tr><tr><td>1</td><td>3</td><td>10</td></tr><tr><td>DistMult</td><td>43.0</td><td>39.0</td><td>44.0</td><td>49.0</td></tr></table>
"""

resultado = detectar_metricas_en_tabla(html_complejo, METRICS_LIST)

print(f"Lista objetivo: {METRICS_LIST}")
print(f"Encontradas en tabla: {resultado}")

Lista objetivo: ['Accuracy', 'MRR', 'Hits@1', 'Hits@3', 'Hits@10', 'F1-Score']
Encontradas en tabla: ['Hits@3', 'MRR', 'Hits@10', 'Hits@1']


## Extracción de valores de las tablas:

In [1]:
# <table><tr><td rowspan=3>Models</td><td colspan=4>WN18</td><td colspan=4>FB15k</td></tr><tr><td rowspan=2>MRR</td><td colspan=3>Hits@</td><td rowspan=2>MRR</td><td colspan=3>Hits@</td></tr><tr><td>1</td><td>3</td><td>10</td><td>1</td><td>3</td><td>10</td></tr><tr><td>TransE*</td><td>45.4</td><td>8.9</td><td>82.3</td><td>93.4</td><td>38.0</td><td>23.1</td><td>47.2</td><td>64.1</td></tr><tr><td>DistMult*</td><td>82.2</td><td>72.8</td><td>91.4</td><td>93.6</td><td>65.4</td><td>54.6</td><td>73.3</td><td>82.4</td></tr><tr><td>HolE*</td><td>93.8</td><td>93.0</td><td>94.5</td><td>94.9</td><td>52.4</td><td>40.2</td><td>61.3</td><td>73.9</td></tr><tr><td>ComplEx*</td><td>94.1</td><td>93.6</td><td>94.5</td><td>94.7</td><td>69.2</td><td>59.9</td><td>75.9</td><td>84.0</td></tr><tr><td>ANALOGY**</td><td>94.2</td><td>93.9</td><td>94.4</td><td>94.7</td><td>72.5</td><td>64.6</td><td>78.5</td><td>85.4</td></tr><tr><td>CP***</td><td>94.2</td><td>93.9</td><td>94.4</td><td>94.7</td><td>72.7</td><td>66.0</td><td>77.3</td><td>83.9</td></tr><tr><td>ConvE**</td><td>94.3</td><td>93.5</td><td>94.6</td><td>95.6</td><td>65.7</td><td>55.8</td><td>72.3</td><td>83.1</td></tr><tr><td>CP(D=200)</td><td>94.2</td><td>93.9</td><td>94.5</td><td>94.7</td><td>71.9</td><td>66.2</td><td>75.2</td><td>82.0</td></tr><tr><td>B-CP(D=200)</td><td>90.1</td><td>88.1</td><td>91.8</td><td>93.3</td><td>69.5</td><td>61.1</td><td>76.0</td><td>83.5</td></tr><tr><td>B-CP(D=400)</td><td>94.5</td><td>94.1</td><td>94.8</td><td>95.0</td><td>72.2</td><td>66.3</td><td>77.5</td><td>84.2</td></tr><tr><td>B-CP(D=300 3)</td><td>94.6</td><td>94.2</td><td>95.0</td><td>95.3</td><td>72.9</td><td>66.5</td><td>77.7</td><td>84.9</td></tr></table>
# <table><tr><td></td><td>WN18</td><td>FB15k</td><td>WN18RR</td><td>FB15k-237</td></tr><tr><td>Ne</td><td>40,943</td><td>14,951</td><td>40,559</td><td>14,505</td></tr><tr><td>Nr</td><td>18</td><td>1,345</td><td>11</td><td>237</td></tr><tr><td>#trainingtriples</td><td>141,442</td><td>483,142</td><td>86,835</td><td>272,115</td></tr><tr><td>#validationtriples</td><td>5,000</td><td>50,000</td><td>3,034</td><td>17,535</td></tr><tr><td>#testtriples</td><td>5,000</td><td>59,071</td><td>3,134</td><td>20,466</td></tr></table>

In [None]:
html_complejo = """
<table><tr><td rowspan=3>Models</td><td colspan=4>WN18</td><td colspan=4>FB15k</td></tr><tr><td rowspan=2>MRR</td><td colspan=3>Hits@</td><td rowspan=2>MRR</td><td colspan=3>Hits@</td></tr><tr><td>1</td><td>3</td><td>10</td><td>1</td><td>3</td><td>10</td></tr><tr><td>TransE*</td><td>45.4</td><td>8.9</td><td>82.3</td><td>93.4</td><td>38.0</td><td>23.1</td><td>47.2</td><td>64.1</td></tr><tr><td>DistMult*</td><td>82.2</td><td>72.8</td><td>91.4</td><td>93.6</td><td>65.4</td><td>54.6</td><td>73.3</td><td>82.4</td></tr><tr><td>HolE*</td><td>93.8</td><td>93.0</td><td>94.5</td><td>94.9</td><td>52.4</td><td>40.2</td><td>61.3</td><td>73.9</td></tr><tr><td>ComplEx*</td><td>94.1</td><td>93.6</td><td>94.5</td><td>94.7</td><td>69.2</td><td>59.9</td><td>75.9</td><td>84.0</td></tr><tr><td>ANALOGY**</td><td>94.2</td><td>93.9</td><td>94.4</td><td>94.7</td><td>72.5</td><td>64.6</td><td>78.5</td><td>85.4</td></tr><tr><td>CP***</td><td>94.2</td><td>93.9</td><td>94.4</td><td>94.7</td><td>72.7</td><td>66.0</td><td>77.3</td><td>83.9</td></tr><tr><td>ConvE**</td><td>94.3</td><td>93.5</td><td>94.6</td><td>95.6</td><td>65.7</td><td>55.8</td><td>72.3</td><td>83.1</td></tr><tr><td>CP(D=200)</td><td>94.2</td><td>93.9</td><td>94.5</td><td>94.7</td><td>71.9</td><td>66.2</td><td>75.2</td><td>82.0</td></tr><tr><td>B-CP(D=200)</td><td>90.1</td><td>88.1</td><td>91.8</td><td>93.3</td><td>69.5</td><td>61.1</td><td>76.0</td><td>83.5</td></tr><tr><td>B-CP(D=400)</td><td>94.5</td><td>94.1</td><td>94.8</td><td>95.0</td><td>72.2</td><td>66.3</td><td>77.5</td><td>84.2</td></tr><tr><td>B-CP(D=300 3)</td><td>94.6</td><td>94.2</td><td>95.0</td><td>95.3</td><td>72.9</td><td>66.5</td><td>77.7</td><td>84.9</td></tr></table>
"""
html_simple = "<table><tr><td></td><td>WN18</td><td>FB15k</td><td>WN18RR</td><td>FB15k-237</td></tr><tr><td>Ne</td><td>40,943</td><td>14,951</td><td>40,559</td><td>14,505</td></tr><tr><td>Nr</td><td>18</td><td>1,345</td><td>11</td><td>237</td></tr><tr><td>#trainingtriples</td><td>141,442</td><td>483,142</td><td>86,835</td><td>272,115</td></tr><tr><td>#validationtriples</td><td>5,000</td><td>50,000</td><td>3,034</td><td>17,535</td></tr><tr><td>#testtriples</td><td>5,000</td><td>59,071</td><td>3,134</td><td>20,466</td></tr></table>"

html_complejo_2 = "<table><tr><td rowspan=2>Model</td><td rowspan=2>Bitsperentity</td><td rowspan=2>Bitsperrelation</td><td colspan=2>MRR</td></tr><tr><td>WN18RR</td><td>FB15k-237</td></tr><tr><td rowspan=2 colspan=2>DistMult*(D=200) 6,400 ComplEx*(D=200) 12,800</td><td>6,400</td><td>43.0</td><td>24.1</td></tr><tr><td>12,800</td><td>44.0</td><td>24.7</td></tr><tr><td>ConvE*(D=200)</td><td>6,400</td><td>6,400</td><td>43.0</td><td>32.5</td></tr><tr><td rowspan=2>CP(D=15) CP(D=50)</td><td>960</td><td>480</td><td>40.0</td><td>22.0</td></tr><tr><td>3,200</td><td>1,600</td><td>43.0</td><td>24.8</td></tr><tr><td>CP(D=200)</td><td>12,800</td><td>6,400</td><td>44.0</td><td>29.0</td></tr><tr><td>CP(D=500)</td><td>32,000</td><td>16,000</td><td>43.0</td><td>29.2</td></tr><tr><td>VQ-CP(D=200)</td><td>400</td><td>200</td><td>36.0</td><td>8.7</td></tr><tr><td>VQ-CP(D=500)</td><td>1,000</td><td>500</td><td>36.0</td><td>8.3</td></tr><tr><td>B-CP(D=100)</td><td>200</td><td>100</td><td>38.0</td><td>23.2</td></tr><tr><td>B-CP(D=200)</td><td>400</td><td>200</td><td>45.0</td><td>27.8</td></tr><tr><td>B-CP(D=300)</td><td>600</td><td>300</td><td>46.0</td><td>29.0</td></tr><tr><td>B-CP(D=400)</td><td>800</td><td>400</td><td>45.0</td><td>29.2</td></tr><tr><td>B-CP(D=500)</td><td>1,000</td><td>500</td><td>45.0</td><td>29.1</td></tr><tr><td>B-CP(D=300 3)</td><td>1,800</td><td>900</td><td>48.0</td><td>30.3</td></tr></table>"


In [None]:
from bs4 import BeautifulSoup
import re
from pathlib import Path
import json

FOLDER_ROOT = Path.cwd()
TABLES_JSON = FOLDER_ROOT / "deepdoctection_output.json"

def html_to_matrix(html_str):
    soup = BeautifulSoup(html_str, 'html.parser')
    rows = soup.find_all('tr')
    
    grid = {} 
    max_cols = 0
    max_rows = len(rows)
    
    # Processing
    for r, row in enumerate(rows):
        cells = row.find_all(['td', 'th'])
        c_idx = 0
        
        for cell in cells:
            while (r, c_idx) in grid:
                c_idx += 1
            
            text = cell.get_text(strip=True)
            rowspan = int(cell.get('rowspan', 1))
            colspan = int(cell.get('colspan', 1))
            
            for i in range(rowspan):
                for j in range(colspan):
                    real_row = r + i
                    real_column = c_idx + j
                    
                    grid[(real_row, real_column)] = text
                    if real_column >= max_cols:
                        max_cols = real_column + 1

            c_idx += colspan

    # Turn dictionary into lists (matrix)
    matrix = []
    for r in range(max_rows):
        current_row = []
        for c in range(max_cols):
            current_row.append(grid.get((r, c), "")) 
        matrix.append(current_row)
    return matrix

def is_value(text):
    text = text.strip()
    value_regex = r'^[\d.,]+[%*]?$|^[-–]$'
    return bool(re.match(value_regex, text))

def clean_and_convert_to_float(value_str):
    if not value_str: return None
    clean_str = str(value_str).strip().replace("*", "").replace(",", "")
    if clean_str in ["-", "–", "—", "nan", "N/A"]:
        return None
    try:
        return float(clean_str)
    except ValueError:
        return None

def split_header(matrix):
    if not matrix: return [], []
    stub_header = matrix[0][0] 
    split_idx = 0
    
    for i, row in enumerate(matrix):
        cells_to_evaluate = row[1:]
        if not cells_to_evaluate: continue
            
        cells_total = len(cells_to_evaluate)
        num_count = sum(1 for c in cells_to_evaluate if is_value(c))
        different_stub = (row[0] != stub_header)
        
        if (num_count / cells_total) > 0.5 and different_stub:
            split_idx = i
            break

    headers = matrix[:split_idx]
    values = matrix[split_idx:]
    return headers, values

def extract_tuples(headers, values):
    context_by_column = []
    num_of_columns = len(headers[0])
    
    for col_idx in range(1, num_of_columns):
        linked_parts = []
        last_seen_value = ""
        
        for header_row in headers:
            current_value = header_row[col_idx]
            if current_value and current_value != last_seen_value:
                linked_parts.append(current_value)
                last_seen_value = current_value
        
        final_context = " | ".join(linked_parts)
        context_by_column.append(final_context)
    
    tuples = []
    
    for value_row in values:
        row_title = value_row[0]
        all_values = value_row[1:]
        for context, value in zip(context_by_column, all_values):
            if value in ["-", "–", ""]:
                continue
            tuple = (row_title, context, value)
            print(f"{tuple}")
            tuples.append(tuple)
            
    return tuples

def extract_values_from_html_table(html):
    matrix = html_to_matrix(html)
    if not matrix:
        raise ValueError("La matriz generada está vacía (HTML sin estructura válida).")
    headers, values = split_header(matrix)
    if not values:
        raise ValueError("No se pudieron separar datos numéricos de las cabeceras.")
    tuples = extract_tuples(headers, values)
    structured_data = []

    for tuple in tuples:
        structured_data.append({
                    "row": tuple[0],
                    "column": tuple[1],
                    "value": clean_and_convert_to_float(tuple[2])
                })
        
    return structured_data
    
def extract_values_from_paper():
    with open(TABLES_JSON, "r", encoding="utf-8") as j:
        tables_json = json.load(j)

    all_tables_data = []
    id = 0
    for page_data in tables_json['results']:
        for table in page_data['tables']:
            try:
                extracted_data = extract_values_from_html_table(table['html'])
                id += 1
                table_object = {
                    "id": id,
                    "page": page_data['page'],
                    "num_values": len(extracted_data),
                    "data": extracted_data
                }
                all_tables_data.append(table_object)
            except Exception as e:
                print(f"   ⚠️ SALTANDO TABLA: Estructura errónea o compleja.")
                print(f"      └── Causa: {e}")
    values_output_json = {
        "file_name": tables_json["file_name"],
        "total_num_tables": all_tables_data.__len__(),
        "table_values": all_tables_data
    }

    with open("values.json", "w", encoding="utf-8") as f:
        json.dump(values_output_json, f, indent=4, ensure_ascii=False)


extract_values_from_paper()


('WN18RR', '|E|', '40,943')
('WN18RR', '|R|', '11')
('WN18RR', '#Train', '86,835')
('WN18RR', '#Valid', '3,034')
('WN18RR', '#Test', '3,134')
('FB15k-237', '|E|', '14,541')
('FB15k-237', '|R|', '237')
('FB15k-237', '#Train', '272,115')
('FB15k-237', '#Valid', '17,535')
('FB15k-237', '#Test', '20,466')
('WD', '|E|', '418')
('WD', '|R|', '2')
('WD', '#Train', '550')
('WD', '#Valid', '25')
('WD', '#Test', '25')
('WD ++', '|E|', '763')
('WD ++', '|R|', '2')
('WD ++', '#Train', '1,120')
('WD ++', '#Valid', '40')
('WD ++', '#Test', '40')
   ⚠️ SALTANDO TABLA: Estructura errónea o compleja.
      └── Causa: list index out of range
('DISTMULT(Yangetal.2015)[(cid:63)]', 'Type', 'Bilinear')
('DISTMULT(Yangetal.2015)[(cid:63)]', 'WN18RR | MRR', '0.43')
('DISTMULT(Yangetal.2015)[(cid:63)]', 'WN18RR | H@10', '49')
('DISTMULT(Yangetal.2015)[(cid:63)]', 'FB15k-237 | MRR', '0.24')
('DISTMULT(Yangetal.2015)[(cid:63)]', 'FB15k-237 | H@10', '41')
('ComplEx(Trouillonetal.2016)[(cid:63)]', 'Type', 'Bilinea

## Extracción de tareas:

In [None]:
#Extracción de tareas