In [2]:
from utils.tei_extraction import extract_sections_fulltext, extract_abstract, tei_to_full_raw_text, extract_flat_sections_with_subtext, rank_sections_by_semantic_similarity
from utils.grobid_service import GrobidService


from rapidfuzz import fuzz, process
import ast
from itertools import chain
from pathlib import Path
from grobid_client.grobid_client import GrobidClient
from bs4 import BeautifulSoup
import Levenshtein
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import util
import torch


import json
import os

  from .autonotebook import tqdm as notebook_tqdm


# Authors

In [16]:
# Get the current working directory
current_dir = Path(os.getcwd())
parent_dir = current_dir.parent


with open("../data/papers_data_copy.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)

# remove if Local PDF Path is None
papers = [paper for paper in papers_list if paper.get("Local PDF Path") is not None]
authors = []
datasets = []
tasks = []

# get the authors from papers
for i, paper in enumerate(papers):
    authors.append(paper['Authors'])
    datasets.append(paper['Datasets'])
    tasks.append(paper['Tasks'])
references = [authors, datasets, tasks]

In [None]:
current_dir = Path(".")
grobid = GrobidService(config_path="./Grobid/config.json")
authors_grobid = []

for paper in papers:
    pdf_path = str(paper["Local PDF Path"])
    authors = grobid.extract_authors_from_pdf(pdf_path)
    authors_grobid.append(authors)
    print(authors)


INFO - Loading configuration file from ./Grobid/config.json
2026-01-05 13:45:29,971 - INFO - Loading configuration file from ./Grobid/config.json
INFO - Configuration file loaded successfully
2026-01-05 13:45:29,973 - INFO - Configuration file loaded successfully
2026-01-05 13:45:29,975 - INFO - Logging configured - Level: INFO, Console: True, File: disabled
2026-01-05 13:45:30,000 - INFO - GROBID server http://localhost:8070 is up and running


data/pdfs/KG^2- Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings.pdf
['Yuyu Zhang', 'Hanjun Dai', 'Toraman Kamil', 'Le Song']
data/pdfs/Incorporating Literals into Knowledge Graph Embeddings.pdf
['Agustinus Kristiadi', 'Mohammad Asif Khan', 'Denis Lukovnikov', 'Jens Lehmann', 'Asja Fischer']
data/pdfs/Adversarial Contrastive Estimation.pdf
['Avishek Joey Bose', 'Huan Ling', 'Yanshuai Cao', 'Borealis Ai']
data/pdfs/KBGAN- Adversarial Learning for Knowledge Graph Embeddings.pdf
['Liwei Cai', 'William Yang Wang']
data/pdfs/Convolutional 2D Knowledge Graph Embeddings.pdf
['Tim Dettmers', 'Pasquale Minervini', 'Pontus Stenetorp', 'Sebastian Riedel']
data/pdfs/Answering Visual-Relational Queries in Web-Extracted Knowledge Graphs.pdf
['Daniel Oñoro-Rubio', 'Mathias Niepert', 'Alberto García-Durán', 'Roberto González-Sánchez', 'Roberto J López-Sastre']
data/pdfs/Expeditious Generation of Knowledge Graph Embeddings.pdf
['Tommaso Soru', 'Stefano Ruberto', 'Die

# Dataset and taxonomy

In [20]:
def chunk_text(text, tokenizer, max_tokens=8000, overlap=200):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = tokenizer.decode(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap  # Overlapping context
    return chunks


def deduplicate_fuzzy(list, threshold=80):
    unique = []
    for name in list:
        if all(fuzz.ratio(name, existing) < threshold for existing in unique):
            unique.append(name)
    return unique

In [21]:
question = "What are the name of datasets used in the paper?"
question2 = "What are the tasks that the model is trained for?"
questions = [question, question2]

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
max_context_tokens = 32768 - 2048

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 41.34it/s]
Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [23]:
import time
import argparse
def load_model_and_tokenizer(
    model,
    tokenizer,
    device: str = "cuda"
) -> tuple:
    """
    Load a model and tokenizer from pretrained checkpoints.

    Args:
        model_name:   HuggingFace model identifier.
        tokenizer_name: If not provided, defaults to model_name.
        device:       Device string, e.g. 'cuda' or 'cpu'.

    Returns:
        model, tokenizer
    """
    # tokenizer_name = tokenizer_name or model_name
    # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model = model
    tokenizer = tokenizer
    model.eval()
    return model, tokenizer


def run_qa_on_chunk(
    model,
    tokenizer,
    chunk: str,
    question: str,
    unique_labels: list = None,
    device: str = "cuda",
    gen_kwargs: dict = None
) -> str:
    """
    Run a single question–answer generation on one text chunk.
    Returns the raw decoded content (Python list format expected).
    """
    # Build chat history
    chat = [
        {"role": "system", "content":
            "You are an assistant for QA tasks. Use only provided context."
        },
        {"role": "user", "content": f"Context chunk: {chunk}"}
    ]

    # Add question prompt
    if unique_labels and question and isinstance(unique_labels, list):
        prompt = (
            f"Now, given this question: {question}, and those possible tasks: {unique_labels}. "
            + "Return answer only in a Python list format, e.g. ['A','B']."
        )
    else:
        prompt = (
            f"Now, given this question: {question}. "
            + "Return answer only in a Python list format, e.g. ['A','B']."
        )
    chat.append({"role": "user", "content": prompt})

    # Format for model
    formatted = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )
    inputs = tokenizer([formatted], return_tensors="pt").to(device)

    # Generation kwargs
    gen_kwargs = gen_kwargs or {
        "max_new_tokens": 8192,
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20
    }

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # Strip input prefix and parse
    output_ids = outputs[0][inputs.input_ids.shape[-1]:].tolist()
    # detect </think> token id if present
    try:
        think_idx = len(output_ids) - output_ids[::-1].index(tokenizer.convert_tokens_to_ids('</think>'))
    except ValueError:
        think_idx = 0
    raw = tokenizer.decode(output_ids[think_idx:], skip_special_tokens=True).strip()
        # Parse JSON list
    try:
        answer_list = json.loads(raw)
    except json.JSONDecodeError:
        # fallback to Python literal_eval
        from ast import literal_eval
        try:
            answer_list = literal_eval(raw)
        except Exception:
            answer_list = []
    return answer_list

def process_paper(
    pdf_path: str,
    grobid: GrobidService,
    model,
    tokenizer,
    questions: list,
    max_context_tokens: int,
    unique_labels: list = None,
    device: str = "cuda",
    extraction_method: str = "full_text",
    section_name: list = None
) -> dict:
    """
    Process one paper: extract text, chunk, run QA, and collect responses.
    Returns a dict with keys 'dataset', 'task', 'authors'.
    """
    print(f"\nProcessing paper: {pdf_path}")
    start_time = time.time()

    # Extract full text via Grobid
    tei = grobid.process_full_text(pdf_path)
    print(f"Grobid processing took: {time.time() - start_time:.2f}s")
    if extraction_method == "full_text":
        # Extract full text from TEI XML
        raw_text = extract_sections_fulltext(tei) # Quitado remove_ref=True como parámetro porque la función no lo acepta
    elif extraction_method == "abstract":
        # Extract abstract from TEI XML
        raw_text = extract_abstract(tei) # Quitado remove_ref=True como parámetro porque la función no lo acepta
    elif extraction_method == "flat_sections":
        from sentence_transformers import SentenceTransformer
        sim_model = SentenceTransformer('all-mpnet-base-v2')
        sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
        ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], section_name,model = sim_model) # get the most similar sections to the queries
        best_match_section, best_score = ranked_sections[0]
        raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']



    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)

    responses = {"dataset": [], "task": [], "authors": []}
    for i, question in enumerate(questions):
        print(f"  Question {i+1}/{len(questions)}: {question}")
        selected_chunks = chunks[:1] if i == 2 else chunks
        answers = [
            run_qa_on_chunk(
                model, tokenizer, chunk, question,
                unique_labels if i == 1 else None,
                device=device
            ) for chunk in selected_chunks
        ]
        key = ["dataset", "task", "authors"][i]
        responses[key].append(answers)
        print(f"    Completed in {time.time() - start_time:.2f}s")

    return responses



In [24]:
# open the JSON file with paper metadata from paperswithcode
with open("../data/papers_data_copy.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)
# remove if Local PDF Path is None
papers_list = [paper for paper in papers_list if paper.get("Local PDF Path") is not None]

In [28]:

model_name = "Qwen/Qwen3-1.7B"

model, tokenizer = load_model_and_tokenizer(
    model, tokenizer, device="cuda"
)
grobid = GrobidService(config_path="./Grobid/config.json")

# Define or load your papers_list, questions, unique_labels
# papers_list = [...]  # List[dict]
# questions = [...]    # List[str]
# unique_labels = [...]

all_results = []
for paper in papers_list[0:2]:
    pdf_path = paper['Local PDF Path']
    res = process_paper(
        pdf_path,
        grobid,
        model,
        tokenizer,
        questions,
        2048,
        None,
        device="cuda"
    )
    all_results.append(res)

# Now you have a list of results per paper
for i in range(len(all_results)):
    all_results[i]['dataset'] = deduplicate_fuzzy(
        list(chain.from_iterable(all_results[i]['dataset'][0]))
    )
    all_results[i]['task'] = deduplicate_fuzzy(
        list(chain.from_iterable(all_results[i]['task'][0]))
    )
    all_results[i]['authors'] = deduplicate_fuzzy(
        list(chain.from_iterable(all_results[i]['authors'][0]))
    )


INFO - Loading configuration file from ./Grobid/config.json
2026-01-05 13:52:19,847 - INFO - Loading configuration file from ./Grobid/config.json
INFO - Configuration file loaded successfully
2026-01-05 13:52:19,848 - INFO - Configuration file loaded successfully
2026-01-05 13:52:19,851 - INFO - Logging configured - Level: INFO, Console: True, File: disabled
2026-01-05 13:52:19,860 - INFO - GROBID server http://localhost:8070 is up and running



Processing paper: data/pdfs/KG^2- Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings.pdf
Grobid processing took: 1.62s


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [13]:
all_results

[]

# Taxonomy

In [15]:
# Save the responses to a JSON file
import json

# open the CSV file
import pandas as pd
df = pd.read_csv("model_taxonomy_survey_2022\index.csv")
df.head(5)
labels = []
# Create a mapping from category names to numerical labels
category_mapping = {
    "Semantic matching models": 0,
    "Translation models": 1,
    "Internal side information inside KGs": 2,
    "External extra information outside KGs": 3,
    "Other models": 4
}
# Initialize labels based on the category mapping
labels = [category_mapping.get(category, 4) for category in df['category']]


mask = ~df['url'].duplicated(keep=False)
df_unique_only = df[mask]
df_unique_only
# Initialize labels based on the category mapping
labels_unique = [category_mapping.get(category, 4) for category in df_unique_only['category']]

In [16]:
import time
# Get the current working directory
current_dir = Path(os.getcwd())
responses_model = []

documents = []
grobid = GrobidService(config_path="./Grobid/config.json")
for i in range(len(df_unique_only)): #len(df)
    paper_filename = df_unique_only.iloc[i]['filename']
    # label = df.iloc[i]['category']

    print("Processing paper:", paper_filename)
    start = time.time()
    pdf_path = str(current_dir/paper_filename)

    tei = grobid.process_full_text(pdf_path)
    print("Grobid processing took:", time.time() - start, "seconds")

    try:
        raw_text = extract_abstract(tei)
        # if extract_abstract returns None or an empty string, treat as failure
        if not raw_text or not raw_text.strip():
            print("No abstract found, skipping.")
            labels_unique.pop(i)  # Remove the label for this paper
            continue
    except Exception as e:
        print(f"Error extracting abstract ({e!r}), skipping.")
        labels_unique.pop(i)  # Remove the label for this paper
        continue

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    # raw_text = tei_to_full_raw_text(tei, remove_ref=True)
    documents.append(raw_text)

INFO - Loading configuration file from ./Grobid/config.json
2026-01-05 13:18:37,619 - INFO - Loading configuration file from ./Grobid/config.json
INFO - Configuration file loaded successfully
2026-01-05 13:18:37,620 - INFO - Configuration file loaded successfully
2026-01-05 13:18:37,622 - INFO - Logging configured - Level: INFO, Console: True, File: disabled
2026-01-05 13:18:37,633 - INFO - GROBID server http://localhost:8070 is up and running
2026-01-05 13:18:37,635 - ERROR - Failed to open PDF file c:\Users\M\Desktop\Proyecto GAP-KGE\GAP-KGE\model_type_pdfs\1901.09590.pdf: [Errno 2] No such file or directory: 'c:\\Users\\M\\Desktop\\Proyecto GAP-KGE\\GAP-KGE\\model_type_pdfs\\1901.09590.pdf'
2026-01-05 13:18:37,645 - ERROR - Failed to open PDF file c:\Users\M\Desktop\Proyecto GAP-KGE\GAP-KGE\model_type_pdfs\0a1bf96b7165e962e90cb14648c9462d-Paper.pdf: [Errno 2] No such file or directory: 'c:\\Users\\M\\Desktop\\Proyecto GAP-KGE\\GAP-KGE\\model_type_pdfs\\0a1bf96b7165e962e90cb14648c946

Processing paper: model_type_pdfs/1901.09590.pdf
Grobid processing took: 0.0 seconds
Error extracting abstract (XMLSyntaxError("Start tag expected, '<' not found, line 1, column 1")), skipping.
Processing paper: model_type_pdfs/0a1bf96b7165e962e90cb14648c9462d-Paper.pdf
Grobid processing took: 0.0 seconds
Error extracting abstract (XMLSyntaxError("Start tag expected, '<' not found, line 1, column 1")), skipping.
Processing paper: model_type_pdfs/1506.00999.pdf
Grobid processing took: 0.0014858245849609375 seconds
Error extracting abstract (XMLSyntaxError("Start tag expected, '<' not found, line 1, column 1")), skipping.
Processing paper: model_type_pdfs/liu17d.pdf
Grobid processing took: 0.0012526512145996094 seconds
Error extracting abstract (XMLSyntaxError("Start tag expected, '<' not found, line 1, column 1")), skipping.
Processing paper: model_type_pdfs/1412.6575.pdf
Grobid processing took: 0.001016855239868164 seconds
Error extracting abstract (XMLSyntaxError("Start tag expected, 

IndexError: pop index out of range

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(documents, labels_unique, stratify=labels_unique, test_size=0.20, random_state=42 )
print("Number of training samples:", len(X_train))
print("Number of test samples:", len(X_test))

model = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2), max_features=50000),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)
print(cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean())
model.fit(X_train, y_train)
print("Test F1:", f1_score(y_test, model.predict(X_test), average='macro'))

from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = model.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")
