In [None]:
from utils.tei_extraction import extract_sections_fulltext, extract_abstract, tei_to_full_raw_text, extract_flat_sections_with_subtext, rank_sections_by_semantic_similarity
from utils.grobid_service import GrobidService


from rapidfuzz import fuzz, process
import ast
from itertools import chain
from pathlib import Path
from grobid_client.grobid_client import GrobidClient
from bs4 import BeautifulSoup
import Levenshtein
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from sentence_transformers import util

import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def chunk_text(text, tokenizer, max_tokens=8000, overlap=200):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = tokenizer.decode(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap  # Overlapping context
    return chunks


def deduplicate_fuzzy(list, threshold=80):
    unique = []
    for name in list:
        if all(fuzz.ratio(name, existing) < threshold for existing in unique):
            unique.append(name)
    return unique

In [3]:
question = "What are the name of datasets used in the paper?"
question2 = "What are the tasks that the model is trained for?"
question3 = "Who are the authors of the paper?"
question4 = "Given the model definitions mentioned before, choose one of the following taxonomy as the taxonomy of the model mentioned in this paper: Semantic matching model, Translation models, Internal side information inside KGs model, External extra information outside KGs or Other KGC Technologies ? "
questions = [question, question2, question3, question4]

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import accelerate

model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
max_context_tokens = 32768 - 2048

ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [5]:
import time
import argparse
def load_model_and_tokenizer(
    model,
    tokenizer,
    device: str = "cuda"
) -> tuple:
    """
    Load a model and tokenizer from pretrained checkpoints.

    Args:
        model_name:   HuggingFace model identifier.
        tokenizer_name: If not provided, defaults to model_name.
        device:       Device string, e.g. 'cuda' or 'cpu'.

    Returns:
        model, tokenizer
    """
    # tokenizer_name = tokenizer_name or model_name
    # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model = model
    tokenizer = tokenizer
    model.eval()
    return model, tokenizer


def run_qa_on_chunk(
    model,
    tokenizer,
    chunk: str,
    question: str,
    model_taxonomy: bool = False,
    unique_labels: list = None,
    device: str = "cuda",
    gen_kwargs: dict = None,
) -> str:
    """
    Run a single questionâ€“answer generation on one text chunk.
    Returns the raw decoded content (Python list format expected).
    """
    # Build chat history
    chat = [
        {"role": "system", "content":
            "You are an assistant for QA tasks. Use only provided context."
        },
        {"role": "user", "content": f"Context chunk: {chunk}"}
    ]

    model_context = '''The semantic matching models and The translation models only use the structure information of internal facts in KGs. The semantic matching models generally use semantic matching-based scoring functions and further consists of tensor/matrix factorization models and neural network models. The translation models apply distance-based scoring functions.
    While Internal side information inside KGs and External extra information outside KGs outside KGs cooperate with additional information (the inside or outside information of KGs except for the structure information) to achieve KGC. Internal side information inside KGs involved in KGs, including node attributes information, entity-related information, relation-related information, neighborhood information, relational path information; External extra information outside KGs outside KGs, mainly including two aspects: rule-based KGC and third-party data sources-based KGC. 
    And if it is not any of the previous models, then it is Other KGC technologies.'''

    # Add question prompt
    if model_taxonomy:
        prompt = (
           f"Now, given this context for model taxonomy: {model_context} Answer this question: {question} "
           +"Give back the answer only and only in a correct Python list format, for example: ['A']. If you don't know the answer, just return an empty list."

        )
    if unique_labels and question and isinstance(unique_labels, list):
        prompt = (
            f"Now, given this question: {question}, and those possible tasks: {unique_labels}. "
            + "Return answer only in a Python list format, e.g. ['A','B']."
        )
    else:
        prompt = (
            f"Now, given this question: {question}. "
            + "Return answer only in a Python list format, e.g. ['A','B']."
        )
    chat.append({"role": "user", "content": prompt})

    # Format for model
    formatted = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )
    inputs = tokenizer([formatted], return_tensors="pt").to(device)

    # Generation kwargs
    gen_kwargs = gen_kwargs or {
        "max_new_tokens": 8192,
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20
    }

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # Strip input prefix and parse
    output_ids = outputs[0][inputs.input_ids.shape[-1]:].tolist()
    # detect </think> token id if present
    try:
        think_idx = len(output_ids) - output_ids[::-1].index(tokenizer.convert_tokens_to_ids('</think>'))
    except ValueError:
        think_idx = 0
    raw = tokenizer.decode(output_ids[think_idx:], skip_special_tokens=True).strip()
        # Parse JSON list
    try:
        answer_list = json.loads(raw)
    except json.JSONDecodeError:
        # fallback to Python literal_eval
        from ast import literal_eval
        try:
            answer_list = literal_eval(raw)
        except Exception:
            answer_list = []
    return answer_list

def process_paper(
    pdf_path: str,
    grobid: GrobidService,
    model,
    tokenizer,
    questions: list,
    answers: list,
    max_context_tokens: int,
    unique_labels: list = None,
    device: str = "cuda",
    extraction_method: str = "full_text",
    section_name: list = None
) -> dict:
    """
    Process one paper: extract text, chunk, run QA, and collect responses.
    Returns a dict with keys 'dataset', 'task', 'authors'.
    """
    print(f"\nProcessing paper: {pdf_path}")
    start_time = time.time()

    # Extract full text via Grobid
    tei = grobid.process_full_text(pdf_path)
    print(f"Grobid processing took: {time.time() - start_time:.2f}s")
    if extraction_method == "full_text":
        # Extract full text from TEI XML
        raw_text = tei
    elif extraction_method == "abstract":
        # Extract abstract from TEI XML
        raw_text = extract_abstract(tei, remove_ref=True)
    elif extraction_method == "flat_sections":
        from sentence_transformers import SentenceTransformer
        sim_model = SentenceTransformer('all-mpnet-base-v2')
        sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
        ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], section_name,model = sim_model) # get the most similar sections to the queries
        best_match_section, best_score = ranked_sections[0]
        raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']



    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    responses = {}
    for i in answers:
        responses[i] = []
    

    # responses = {"dataset": [], "task": [], "authors": []}
    for i, question in enumerate(questions):
        print(f"  Question {i+1}/{len(questions)}: {question}")
        selected_chunks = chunks[:1] if i == 2 else chunks
        responses_llm = [
            run_qa_on_chunk(
                model, tokenizer, chunk, question,
                model_taxonomy=True if i == 3 else False,
                unique_labels=unique_labels if i == 1 else None,
                device=device
            ) for chunk in selected_chunks
        ]
        key = answers[i]
        responses[key].append(responses_llm)
        print(f"Completed in {time.time() - start_time:.2f}s")

    return responses



In [None]:
# open the JSON file with paper metadata from paperswithcode
with open("../data/papers_data copy.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)
# remove if Local PDF Path is None
papers_list = [paper for paper in papers_list if paper.get("Local PDF Path") is not None]

[]

In [8]:

model_name = "Qwen/Qwen3-1.7B"

model, tokenizer = load_model_and_tokenizer(
    model, tokenizer, device="cuda"
)
grobid = GrobidService(config_path="./Grobid/config.json")

# Define or load your papers_list, questions, unique_labels
# papers_list = [...]  # List[dict]
# questions = [...]    # List[str]
# unique_labels = [...]

all_results = []
for paper in papers_list[0:1]:
    pdf_path = str(Path(os.getcwd()) / paper['Local PDF Path'])
    res = process_paper(
        pdf_path,
        grobid,
        model,
        tokenizer,
        questions,
        ["dataset", "task", "authors", "taxonomy"],
        2048,
        None,
        device="cuda"
    )
    all_results.append(res)

# Now you have a list of results per paper
# You can save or further process all_results


INFO - Loading configuration file from ./Grobid/config.json
INFO - Configuration file loaded successfully
2025-10-29 18:22:59,112 - INFO - Logging configured - Level: INFO, Console: True, File: disabled
2025-10-29 18:22:59,141 - INFO - GROBID server http://localhost:8070 is up and running


In [9]:
all_results[1]

IndexError: list index out of range

In [31]:
for i in range(len(all_results)):
    all_results[i]['dataset'] = deduplicate_fuzzy(
        list(chain.from_iterable(all_results[i]['dataset'][0]))
    )
    all_results[i]['task'] = deduplicate_fuzzy(
        list(chain.from_iterable(all_results[i]['task'][0]))
    )
    all_results[i]['authors'] = deduplicate_fuzzy(
        list(chain.from_iterable(all_results[i]['authors'][0]))
    )
    all_results[i]['taxonomy'] = deduplicate_fuzzy(
        list(chain.from_iterable(all_results[i]['taxonomy'][0]))
    )

In [32]:
all_results

[{'dataset': ['ARC Challenge Set',
   'Advances in neural information processing systems',
   'A large annotated corpus for learning natural language inference',
   'An analysis of open information extraction based on semantic role labeling',
   'Combining retrieval, statistics, and inference to answer elementary science questions',
   'Think you have solved question answering? try arc, the ai2 reasoning challenge',
   'Discriminative embeddings of latent variable models for structured data',
   'Gake: Graph aware knowledge embedding',
   'arXiv:1704.01212',
   'arXiv:150',
   'Answering complex questions using open information extraction',
   'Scitail: A textual entailment dataset from science question answering',
   'Squad: 100,000+ questions for machine comprehension of text',
   'Markov logic networks',
   'arXiv:1611.01603',
   'arXiv:1709.04071'],
  'task': ['Learning to reason with contextual knowledge graphs',
   'Learning to reason with neural embeddings of both knowledge grap