# PDFs download

In [2]:
from utils.pwc_service import PapersWithCodeClient

client = PapersWithCodeClient(task_slug="knowledge-graph-embeddings", data_dir ="../data")

# Step 1: Fetch and save metadata
papers = client.fetch_papers_metadata(limit=None) # fetch metadata for the first N papers in PapersWithCode
client.save_json(papers, "papers_data.json")

# Step 2: Load metadata and download PDFs
papers = client.load_json("papers_data.json")
updated_papers = client.download_all_pdfs(papers) # download PDFs and update metadata with local paths
client.save_json(updated_papers, "papers_data.json")


Saved data to data/papers_data.json
Downloaded: data/papers_pdfs/KG^2:_Learning_to_Reason_Science_Exam_Questions_with_Contextual_Knowledge_Graph_Embeddings.pdf
Downloaded: data/papers_pdfs/Incorporating_Literals_into_Knowledge_Graph_Embeddings.pdf
Downloaded: data/papers_pdfs/Adversarial_Contrastive_Estimation.pdf
Downloaded: data/papers_pdfs/KBGAN:_Adversarial_Learning_for_Knowledge_Graph_Embeddings.pdf
Downloaded: data/papers_pdfs/Convolutional_2D_Knowledge_Graph_Embeddings.pdf
Downloaded: data/papers_pdfs/Answering_Visual-Relational_Queries_in_Web-Extracted_Knowledge_Graphs.pdf
Downloaded: data/papers_pdfs/Expeditious_Generation_of_Knowledge_Graph_Embeddings.pdf
Downloaded: data/papers_pdfs/Learning_Knowledge_Graph_Embeddings_with_Type_Regularizer.pdf
Downloaded: data/papers_pdfs/Analysis_of_the_Impact_of_Negative_Sampling_on_Link_Prediction_in_Knowledge_Graphs.pdf
Downloaded: data/papers_pdfs/DeepPath:_A_Reinforcement_Learning_Method_for_Knowledge_Graph_Reasoning.pdf
Downloaded: da

# Grobid

In [1]:
from utils.tei_extraction import extract_sections_fulltext, extract_abstract, tei_to_full_raw_text, extract_flat_sections_with_subtext, rank_sections_by_semantic_similarity
from utils.grobid_service import GrobidService


from rapidfuzz import fuzz, process
import ast
from itertools import chain
from pathlib import Path
from grobid_client.grobid_client import GrobidClient
from bs4 import BeautifulSoup
import Levenshtein
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


import json
import os

2025-07-21 09:03:29.089796: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-21 09:03:29.438336: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-21 09:03:29.438372: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-21 09:03:29.440257: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-21 09:03:29.599054: I tensorflow/core/platform/cpu_feature_g

In [2]:
# open the JSON file with paper metadata from paperswithcode
with open("../data/papers_data.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)
# remove if Local PDF Path is None
papers_list = [paper for paper in papers_list if paper.get("Local PDF Path") is not None]

In [3]:

from sentence_transformers import SentenceTransformer, util
sim_model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
def chunk_text(text, tokenizer, max_tokens=8000, overlap=200):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk = tokenizer.decode(tokens[start:end])
        chunks.append(chunk)
        start += max_tokens - overlap  # Overlapping context
    return chunks

In [4]:

def deduplicate_fuzzy(list, threshold=80):
    unique = []
    for name in list:
        if all(fuzz.ratio(name, existing) < threshold for existing in unique):
            unique.append(name)
    return unique

# def remove_fuzzy(names, threshold=80):
#     # remove = ['Knolwedge Graph', 'Knowledge Graph Embeddings']
#     remove = []
#     unique = []
#     for name in names:
#         if all(fuzz.ratio(name, existing) < threshold for existing in remove):
#             unique.append(name)
#     return unique

def compute_max_similarity(reference, prediction, model):
    # Compute embeddings
    ref_emb = model.encode(reference, convert_to_tensor=True)
    pred_emb = model.encode(prediction, convert_to_tensor=True)

    # Compute cosine similarities: shape (len(ref), len(pred))
    cos_sim = util.pytorch_cos_sim(ref_emb, pred_emb)

    # For each reference entity, find the max similarity to predicted entities
    max_similarities = cos_sim.max(dim=1).values

    return max_similarities

def closest_string_index(target, candidates):
    distances = [Levenshtein.distance(target, cand) for cand in candidates]
    return distances.index(min(distances))

In [5]:
# return the unique task list that appears in the papers
tasks_dataset = []
for i,paper in enumerate(papers_list):
    tasks_dataset.append(paper['Tasks'])

for i in range(len(tasks_dataset)):
    tasks_dataset[i] = deduplicate_fuzzy(tasks_dataset[i], threshold=80)
    tasks_dataset[i] = [sublist for sublist in tasks_dataset[i] if sublist]

# add everything in a set 
unique_labels = sorted({item for group in tasks_dataset for item in group if item})
unique_labels = deduplicate_fuzzy(unique_labels, threshold=80)

In [6]:
len(unique_labels), unique_labels

(169,
 ['AI2 Reasoning Challenge',
  'ARC',
  'Active Learning',
  'Adversarial Attack',
  'Adversarial Robustness',
  'Answer Selection',
  'Articles',
  'Attribute',
  'AutoML',
  'Autonomous Driving',
  'BIG-bench Machine Learning',
  'Bayesian Inference',
  'Benchmarking',
  'Bias Detection',
  'Bilevel Optimization',
  'Blocking',
  'Bridging Anaphora Resolution',
  'Caption Generation',
  'Classification',
  'Click-Through Rate Prediction',
  'Collaborative Filtering',
  'Common Sense Reasoning',
  'Complex Query Answering',
  'Computational Efficiency',
  'Conformal Prediction',
  'Continual Learning',
  'Contrastive Learning',
  'Counterfactual Reasoning',
  'Data Augmentation',
  'Data Integration',
  'Data Poisoning',
  'Data-to-Text Generation',
  'Decision Making',
  'Decoder',
  'Deep Learning',
  'Descriptive',
  'Diagnostic',
  'Dialogue Generation',
  'Disease Prediction',
  'Diversity',
  'Document Classification',
  'Drug Discovery',
  'EDIT Task',
  'Ensemble Learnin

In [19]:
question = "What are the name of datasets used in the paper?"
question2 = "What are the tasks that the model is trained for?"
question3 = "Who are the authors of the paper?"
questions = [question, question2, question3]

In [7]:
model_question = "Is the model in this paper a Structural Information-based KGC Technology, Additional Information-based KGC Technology or Other KGC Technologies?"

### Llama-3

In [8]:
# 1: Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
max_context_tokens = 8192 - 2048

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [5]:
# Get the current working directory
current_dir = Path(os.getcwd())
responses_dataset = []
responses_task = []
responses_authors = []

grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[0:1]:
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)
    raw_text = tei_to_full_raw_text(tei,remove_ref=True)
    print(raw_text)


GROBID server is up and running
KG 2 : Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings
31 May 2018
Yuyu
Zhang
yuyu.zhang@cc.gatech.edu
College of Computing
Georgia Institute of Technology
Hanjun
Dai
hanjun.dai@cc.gatech.edu
College of Computing
Georgia Institute of Technology
Toraman
Kamil
Korea Advanced Institute of Science and Technology
Le
Song
lsong@cc.gatech.edu
College of Computing
Georgia Institute of Technology
KG 2 : Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings
31 May 2018
177559F076614A6214436678E9CC6581
arXiv:1805.12393v1[cs.LG]
GROBID - A machine learning software for extracting information from scholarly documents
The AI2 Reasoning Challenge (ARC), a new benchmark dataset for question answering (QA) has been recently released. ARC only contains natural science questions authored for human exams, which are hard to answer and require advanced logic reasoning. On the ARC Challenge Set, existing state

In [None]:

# Get the current working directory
current_dir = Path(os.getcwd())
responses_dataset = []
responses_task = []
responses_authors = []

grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[0:30]:
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)


    # abstract
    # raw_text = extract_abstract(tei)

    # sections

    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']


    # full text
    raw_text = tei_to_full_raw_text(tei, remove_ref=True)

    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    for i, question in enumerate(questions):
        response = []

        # Select which chunks to run
        if i == 2:
            # only chunk 0 when i == 2
            chunks_to_process = chunks[:1]
        else:
            # otherwise all chunks
            chunks_to_process = chunks

        # Loop over the chosen chunks
        for j, chunk in enumerate(chunks_to_process):
            # Build the chat history
            chat = [
                {
                    "role": "system",
                    "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response."
                },
                {"role": "user", "content": f"Context chunk: {chunk}"}
            ]

            # Add the question prompt (with or without task labels)
            if i != 1:
                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}. Give back the answer only and only in a Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
                    )
                })
            else:
                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}, and those possible tasks: {unique_labels}. Give back the answer only and only in a Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
                    )
                })

            # 2: Apply the chat template
            formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
            #print("Formatted chat:\n", formatted_chat)

            # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
            inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False)
            # Move the tokenized inputs to the same device the model is on (GPU/CPU)
            inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
            #print("Tokenized inputs:\n", inputs)

            # 4: Generate text from the model
            outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.6, top_p=0.9,)
            #print("Generated tokens:\n", outputs)

            # 5: Decode the output back to a string
            decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
            print("Decoded output:\n", decoded_output)
            # print decoded output datatype

            response.append(decoded_output)

        # Route responses into the right list
        if i == 0:
            responses_dataset.append(response)
        elif i == 1:
            responses_task.append(response)
        else:
            responses_authors.append(response)

In [13]:
predictions = [responses_authors, responses_dataset, responses_task]

In [None]:
data = {
    "E_pred": predictions
}

with open("../data/test_answers/qa_entities_with_options_grobid_llama3_experiment.json", "w") as f:
    json.dump(data, f, indent=2)

### Qwen/Qwen3-1.7B

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
max_context_tokens = 32768 - 2048

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import time
# Get the current working directory
current_dir = Path(os.getcwd())
responses_dataset = []
responses_task = []
responses_authors = []



grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[0:1]:
    print("Processing paper:", paper['Title'])
    start = time.time()
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)
    print("Grobid processing took:", time.time() - start, "seconds")

    # raw_text = extract_abstract(tei)

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    raw_text = tei_to_full_raw_text(tei, remove_ref=True)

    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    for i, question in enumerate(questions):
        print(f"Processing question {i+1}: {question}")
        response = []

        # Select which chunks to run
        if i == 2:
            # only chunk 0 when i == 2
            chunks_to_process = chunks[:1]
        else:
            # otherwise all chunks
            chunks_to_process = chunks

        # Loop over the chosen chunks
        for j, chunk in enumerate(chunks_to_process):
            # Build the chat history
            chat = [
                {
                    "role": "system",
                    "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response."
                },
                {"role": "user", "content": f"Context chunk: {chunk}"}
            ]

            # Add the question prompt (with or without task labels)
            if i != 1:
                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}. Give back the answer only and only in a correct Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
                    )
                })
            else:
                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}, and those possible tasks: {unique_labels}. Give back the answer only and only in a correct Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
                    )
                })

            # 2: Apply the chat template
            formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True,enable_thinking=False)
            #print("Formatted chat:\n", formatted_chat)

            # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
            model_inputs = tokenizer([formatted_chat], return_tensors="pt").to(model.device)

            model.eval()
            with torch.no_grad():
            # 4: Generate text from the model
                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=8192,
                    temperature=0.7,
                    top_p=0.8,
                    top_k=20,
                )
            output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

            # parsing thinking content
            try:
                # rindex finding 151668 (</think>)
                index = len(output_ids) - output_ids[::-1].index(151668)
            except ValueError:
                index = 0

            #thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
            content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
            print("content:", content)
            #print("thinking content:", thinking_content)

            response.append(content)
            print(f"Generation took {time.time() - start:.2f} seconds")
            del model_inputs, generated_ids
            torch.cuda.empty_cache()
            

        # Route responses into the right list
        if i == 0:
            responses_dataset.append(response)
        elif i == 1:
            responses_task.append(response)
        else:
            responses_authors.append(response)

In [None]:
# problema con que se queda generando todos los tokens sin parar, no se detiene

In [21]:
question = "What are the name of unique datasets used in the paper?"
question2 = "What are the unique tasks that the model is trained for?"
question3 = "Who are the authors of the paper?"
questions = [question, question2, question3]

In [None]:
# asking only dataset
import time

# only doing the task part full text
current_dir = Path(os.getcwd())
responses_dataset = []
acuumulated_time = 0

grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list:

    print("Processing paper:", paper['Title'])
    start = time.time()
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)

    # raw_text = extract_abstract(tei)

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    raw_text = tei_to_full_raw_text(tei, remove_ref=True)


    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    print("Number of chunks:", len(chunks))
    for i, question in enumerate(questions):
        response = []

        # Select which chunks to run
        if i != 0:
            continue  # skip all except i == 1

        else:
            # otherwise all chunks
            chunks_to_process = chunks

            # Loop over the chosen chunks
            for j, chunk in enumerate(chunks_to_process):
                # Build the chat history
                chat = [
                    {
                        "role": "system",
                        "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response."
                    },
                    {"role": "user", "content": f"Context chunk: {chunk}"}
                ]


                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}. Give back the answer only and only in a correct Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
                    )
                })


                # 2: Apply the chat template
                formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True,enable_thinking=False)
                #print("Formatted chat:\n", formatted_chat)

                # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
                model_inputs = tokenizer([formatted_chat], return_tensors="pt").to(model.device)
                print("Input tokens:", model_inputs.input_ids.shape[-1])
                print("GPU memory:", torch.cuda.memory_allocated() / 1e6, "MB")


                # 4: Generate text from the model
                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=512,
                    temperature=0.7,
                    top_p=0.8,
                    top_k=20,
                )
                output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

                # parsing thinking content
                try:
                    # rindex finding 151668 (</think>)
                    index = len(output_ids) - output_ids[::-1].index(151668)
                except ValueError:
                    index = 0

                # thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
                content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
                # print("thinking content:", thinking_content)
                print("content:", content)


                response.append(content)
                print(f"Generation took {time.time() - start:.2f} seconds")
                acuumulated_time += (time.time() - start)
            responses_dataset.append(response)
            print(f"Accumulated time: {acuumulated_time:.2f} seconds")


In [None]:
# without the 10
import time

# only doing the task part full text
current_dir = Path(os.getcwd())
responses_tasks = []
acuumulated_time = 0

grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[:30]:
    
    print("Processing paper:", paper['Title'])
    start = time.time()
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)

    # raw_text = extract_abstract(tei)

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    raw_text = tei_to_full_raw_text(tei, remove_ref=True)


    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    print("Number of chunks:", len(chunks))
    for i, question in enumerate(questions):
        response = []

        # Select which chunks to run
        if i != 1:
            continue  # skip all except i == 1

        else:
            # otherwise all chunks
            chunks_to_process = chunks

            # Loop over the chosen chunks
            for j, chunk in enumerate(chunks_to_process):
                # Build the chat history
                chat = [
                    {
                        "role": "system",
                        "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response. "
                    },
                    {"role": "user", "content": f"Context chunk: {chunk}"}
                ]


                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}, and those possible tasks: {unique_labels}. Give back the answer only and only in a correct Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
                    )
                })


                # 2: Apply the chat template
                formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True,enable_thinking=False)
                #print("Formatted chat:\n", formatted_chat)

                # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
                model_inputs = tokenizer([formatted_chat], return_tensors="pt").to(model.device)
                print("Input tokens:", model_inputs.input_ids.shape[-1])
                print("GPU memory:", torch.cuda.memory_allocated() / 1e6, "MB")


                # 4: Generate text from the model
                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=2048,
                    temperature=0.7,
                    top_p=0.8,
                    top_k=20,
                )
                output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

                # parsing thinking content
                try:
                    # rindex finding 151668 (</think>)
                    index = len(output_ids) - output_ids[::-1].index(151668)
                except ValueError:
                    index = 0

                # thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
                content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
                # print("thinking content:", thinking_content)
                print("content:", content)


                response.append(content)
                print(f"Generation took {time.time() - start:.2f} seconds")
                acuumulated_time += (time.time() - start)
            responses_tasks.append(response)
            print(f"Accumulated time: {acuumulated_time:.2f} seconds")


GROBID server is up and running
Processing paper: KG^2: Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings
Number of chunks: 1
Input tokens: 6906
GPU memory: 4718.162432 MB
content: ['AI2 Reasoning Challenge', 'ARC', 'Knowledge Graph Embeddings', 'Graph Question Answering', 'Graph Representation Learning', 'Graph Learning', 'Graph Neural Network', 'Knowledge Base Question Answering', 'Knowledge Graph Embeddings', 'Knowledge Base Completion', 'Knowledge Probing', 'Knowledge Distillation', 'Knowledge Graphs', 'Graph Attention', 'Graph Embedding', 'Graph Learning', 'Graph Neural Network', 'Graph Question Answering', 'Graph Representation Learning', 'Knowledge Graph Embeddings', 'Knowledge Base Question Answering', 'Knowledge Graph Embeddings', 'Graph Attention', 'Graph Embedding', 'Graph Learning', 'Graph Neural Network', 'Graph Question Answering', 'Graph Representation Learning', 'Knowledge Graph Embeddings', 'Knowledge Base Question Answering', 'Knowle

In [None]:
# task but only 10 answer max
import time

# only doing the task part full text
current_dir = Path(os.getcwd())
responses_tasks = []
acuumulated_time = 0

grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[:30]:
    print("Processing paper:", paper['Title'])
    start = time.time()
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)

    # raw_text = extract_abstract(tei)

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    raw_text = tei_to_full_raw_text(tei, remove_ref=True)


    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    print("Number of chunks:", len(chunks))
    for i, question in enumerate(questions):
        response = []

        # Select which chunks to run
        if i != 1:
            continue  # skip all except i == 1

        else:
            # otherwise all chunks
            chunks_to_process = chunks

            # Loop over the chosen chunks
            for j, chunk in enumerate(chunks_to_process):
                # Build the chat history
                # chat = [
                #     {
                #         "role": "system",
                #         "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response. "
                #     },
                #     {"role": "user", "content": f"Context chunk: {chunk}"}
                # ]
                chat = [
                    {
                        "role": "system",
                        "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response. "
                    },
                    {"role": "user", "content": f"Context chunk: {chunk}"}
                ]


                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}, and those possible tasks: {unique_labels}. Give back the answer only and only in a correct Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list. Return 10 taks max."
                    )
                })


                # 2: Apply the chat template
                formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True,enable_thinking=False)
                #print("Formatted chat:\n", formatted_chat)

                # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
                model_inputs = tokenizer([formatted_chat], return_tensors="pt").to(model.device)
                print("Input tokens:", model_inputs.input_ids.shape[-1])
                print("GPU memory:", torch.cuda.memory_allocated() / 1e6, "MB")


                # 4: Generate text from the model
                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=2048,
                    temperature=0.7,
                    top_p=0.8,
                    top_k=20,
                )
                output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

                # parsing thinking content
                try:
                    # rindex finding 151668 (</think>)
                    index = len(output_ids) - output_ids[::-1].index(151668)
                except ValueError:
                    index = 0

                # thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
                content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
                # print("thinking content:", thinking_content)
                print("content:", content)


                response.append(content)
                print(f"Generation took {time.time() - start:.2f} seconds")
                acuumulated_time += (time.time() - start)
            responses_tasks.append(response)
            print(f"Accumulated time: {acuumulated_time:.2f} seconds")


GROBID server is up and running
Processing paper: KG^2: Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings
Number of chunks: 1
Input tokens: 6914
GPU memory: 12466.5216 MB
content: ['AI2 Reasoning Challenge', 'ARC', 'Graph Knowledge Embeddings', 'Knowledge Graphs', 'Learning To Rank', 'Multi-hop Question Answering', 'Text Retrieval', 'Text Similarity', 'Variational Inference', 'Self-Supervised Learning']
Generation took 26.40 seconds
Accumulated time: 26.40 seconds
Processing paper: Incorporating Literals into Knowledge Graph Embeddings
Number of chunks: 1
Input tokens: 11032
GPU memory: 12466.537984 MB
content: ['Knowledge Graph Embeddings', 'Entity Embeddings', 'Entity Resolution', 'Entity Linking', 'Entity Disambiguation', 'Entity Typing', 'Knowledge Base Question Answering', 'Knowledge Base Completion', 'Knowledge Distillation', 'Knowledge Probing']
Generation took 35.10 seconds
Accumulated time: 61.50 seconds
Processing paper: Adversarial Contrast

In [28]:
questions

['What are the name of unique datasets used in the paper?',
 'What are the unique tasks that the model is trained for?',
 'Who are the authors of the paper?']

In [None]:
import time

# only doing the task part full text
current_dir = Path(os.getcwd())
responses_tasks = []
acuumulated_time = 0

grobid = GrobidService(config_path="./Grobid/config.json")
counter = 0
for paper in papers_list:
    print("Paper index:", counter)
    counter += 1
    
    print("Processing paper:", paper['Title'])
    start = time.time()
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)

    # raw_text = extract_abstract(tei)

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    raw_text = tei_to_full_raw_text(tei, remove_ref=True)


    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    print("Number of chunks:", len(chunks))
    for i, question in enumerate(questions):
        response = []

        # Select which chunks to run
        if i != 0:
            continue  # skip all except i == 1

        else:
            # otherwise all chunks
            chunks_to_process = chunks

            # Loop over the chosen chunks
            for j, chunk in enumerate(chunks_to_process):
                # Build the chat history
                chat = [
                    {
                        "role": "system",
                        "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response. "
                    },
                    {"role": "user", "content": f"Context chunk: {chunk}"}
                ]


                chat.append({
                    "role": "user",
                    "content": (
                        f"Now, given this question: {question}. Give back the answer only and only in a correct Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
                    )
                })


                # 2: Apply the chat template
                formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True,enable_thinking=False)
                #print("Formatted chat:\n", formatted_chat)

                # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
                model_inputs = tokenizer([formatted_chat], return_tensors="pt").to(model.device)
                print("Input tokens:", model_inputs.input_ids.shape[-1])
                print("GPU memory:", torch.cuda.memory_allocated() / 1e6, "MB")


                # 4: Generate text from the model
                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=4096,
                    temperature=0.7,
                    top_p=0.8,
                    top_k=20,
                )
                output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

                # parsing thinking content
                try:
                    # rindex finding 151668 (</think>)
                    index = len(output_ids) - output_ids[::-1].index(151668)
                except ValueError:
                    index = 0

                # thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
                content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
                # print("thinking content:", thinking_content)
                print("content:", content)


                response.append(content)
                print(f"Generation took {time.time() - start:.2f} seconds")
                acuumulated_time += (time.time() - start)
            responses_tasks.append(response)
            print(f"Accumulated time: {acuumulated_time:.2f} seconds")


GROBID server is up and running
Paper index: 0
Processing paper: KG^2: Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings
Number of chunks: 1
Input tokens: 6061
GPU memory: 8030.641152 MB
content: ['ARC Challenge Set']
Generation took 2.15 seconds
Accumulated time: 2.15 seconds
Paper index: 1
Processing paper: Incorporating Literals into Knowledge Graph Embeddings
Number of chunks: 1
Input tokens: 10179
GPU memory: 8030.65856 MB
content: ['FB15k', 'FB15k-237', 'YAGO3-10']
Generation took 3.23 seconds
Accumulated time: 5.37 seconds
Paper index: 2
Processing paper: Adversarial Contrastive Estimation
Number of chunks: 1
Input tokens: 12174
GPU memory: 8030.723584 MB
content: ['RW', 'WS353']
Generation took 3.00 seconds
Accumulated time: 8.37 seconds
Paper index: 3
Processing paper: KBGAN: Adversarial Learning for Knowledge Graph Embeddings
Number of chunks: 1
Input tokens: 10175
GPU memory: 8030.706688 MB
content: ['FB15k-237', 'WN18', 'WN18RR']
Generatio

KeyboardInterrupt: 

In [None]:
predictions = [responses_authors, responses_dataset, responses_task]

In [None]:
data = {
    "E_pred": predictions
}

with open("../data/test_answers/qa_entities_with_options_grobid_qwen1b_abstract.json", "w") as f:
    json.dump(data, f, indent=2)

### Deepseek-r1:1.5b

In [30]:
question = "What are the name of datasets used in the paper given?"
question2 = "What are the tasks that the model in the paper given is trained for?"
question3 = "Who are the authors in the paper given?"
questions = [question, question2, question3]

In [1]:
from ollama import Client


client_ollama = Client(host='http://127.0.0.1:11434')  # Default; can omit if using defaults
model = "deepseek-r1:1.5b"  # replace with an available/pulled model


In [None]:
model_name = "Qwen/Qwen3-1.7B"
max_context_tokens = 128000 - 2048
# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

from pathlib import Path
import os
# Get the current working directory
current_dir = Path(os.getcwd())
responses_dataset = []
responses_task = []
responses_authors = []
from bs4 import BeautifulSoup

grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[0:30]:
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)

    raw_text = tei_to_full_raw_text(tei, remove_ref=True)
    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)
    for i, question in enumerate(questions):
        
        client_ollama = Client(host='http://127.0.0.1:11434')  # Default; can omit if using defaults
        model = "deepseek-r1:1.5b"  # replace with an available/pulled model
        response = []

        # Select which chunks to run
        if i == 2:
            # only chunk 0 when i == 2
            chunks_to_process = chunks[:1]
        else:
            # otherwise all chunks
            chunks_to_process = chunks

        # Loop over the chosen chunks
        for j, chunk in enumerate(chunks_to_process):
            # Build the chat history
            prompt = f"You are an assistant for question-answering tasks. Use only the provided context information to form your response. Context chunk: {chunk}"
            # Add the question prompt (with or without task labels)
            if i != 1:
                prompt += f"Now, given this question: {question}. Give back the answer only and only in a Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."
            
            else:
                prompt += f"Now, given this question: {question}, and those possible tasks: {unique_labels}. Give back the answer only and only in a Python list format, for example: ['A','B']. If you don't know the answer, just return an empty list."

            # 2: Apply the chat template
            decoded_output = client_ollama.generate(model=model, prompt=prompt)

            response.append(decoded_output)

        # Route responses into the right list
        if i == 0:
            responses_dataset.append(response)
        elif i == 1:
            responses_task.append(response)
        else:
            responses_authors.append(response)

GROBID server is up and running


In [39]:
for response in responses_authors:
    print(response[0]['response'])
    break

<think>
Okay, so I'm trying to figure out who the authors of that particular paper are. The question is straightforward: "Who are the authors in the paper given? Give back the answer only and only in a Python list format..." Hmm, wait, but this isn't a real question because the user didn't specify which specific paper they're referring to. That's pretty standard; I remember that without context or specificity, people often ask vague questions.

I should probably consider what information they might be asking for. Maybe they want me to explain how to find authors of any given paper? But no, the question is phrased as if it's a specific paper. If it were a specific paper like "Deep Learning for Scientific Discovery," then I could provide that information. But since it's not provided here, I can't answer correctly.

I remember from other queries where the user mentioned providing the authors in a list format without any context, I just need to state that I don't have enough information. T

### Evaluation

In [6]:
with open("../data/test_answers/qa_entities_with_options_grobid_qwen1b_all_dataset.json", "r+") as f:
    data = json.load(f)
    f.seek(0)
    json.dump(data, f, indent=2)
    f.truncate()
    predictions = data["E_pred"]

In [55]:
responses_authors_copy = predictions[0]
responses_dataset_copy = predictions[1]
responses_task_copy = predictions[2]

In [None]:
# for i in range(len(responses_dataset_copy)):
#     for j in range(len(responses_dataset_copy[i])):
#         if isinstance(responses_dataset_copy[i][j], str):
#             responses_dataset_copy[i][j] = ast.literal_eval(responses_dataset_copy[i][j])
#     if len(responses_dataset_copy[i]) > 1:
#         merged= list(chain.from_iterable(responses_dataset_copy[i]))
#         responses_dataset_copy[i] = deduplicate_fuzzy(merged, threshold=80)
#     else:
#         responses_dataset_copy[i] = deduplicate_fuzzy(responses_dataset_copy[i][j], threshold=80)


# for i in range(len(responses_task_copy)):
#     for j in range(len(responses_task_copy[i])):
#         if isinstance(responses_task_copy[i][j], str):
#             responses_task_copy[i][j] = ast.literal_eval(responses_task_copy[i][j])
#     if len(responses_task_copy[i]) > 1:
#         merged= list(chain.from_iterable(responses_task_copy[i]))
#         deduplicate = deduplicate_fuzzy(merged, threshold=80)
#         responses_task_copy[i] = deduplicate_fuzzy(deduplicate, threshold=80)

#     else:
#         deduplicate = deduplicate_fuzzy(responses_task_copy[i][j], threshold=80)
#         responses_task_copy[i] = deduplicate_fuzzy(deduplicate, threshold=80)


# # return the first response, because the authors are in the first chunk always
# for i, response in enumerate(responses_authors_copy):
#         responses_authors_copy[i] = response[0]

In [None]:
for i in range(len(responses_dataset_copy)):
    for j in range(len(responses_dataset_copy[i])):
        if isinstance(responses_dataset_copy[i][j], str):
            responses_dataset_copy[i][j] = ast.literal_eval(responses_dataset_copy[i][j])
        if len(responses_dataset_copy[i]) > 1:
            merged= list(chain.from_iterable(responses_dataset_copy[i]))
            responses_dataset_copy[i] = deduplicate_fuzzy(merged, threshold=80)
        else:
            responses_dataset_copy[i] = deduplicate_fuzzy(responses_dataset_copy[i][j], threshold=80)


for i in range(len(responses_task_copy)):
    for j in range(len(responses_task_copy[i])):
        if isinstance(responses_task_copy[i][j], str):
            responses_task_copy[i][j] = ast.literal_eval(responses_task_copy[i][j])
        if len(responses_task_copy[i]) > 1:
            merged= list(chain.from_iterable(responses_task_copy[i]))
            deduplicate = deduplicate_fuzzy(merged, threshold=80)
            responses_task_copy[i] = deduplicate_fuzzy(deduplicate, threshold=80)

        else:
            deduplicate = deduplicate_fuzzy(responses_task_copy[i][j], threshold=80)
            responses_task_copy[i] = deduplicate_fuzzy(deduplicate, threshold=80)


# return the first response, because the authors are in the first chunk always
# for i, response in enumerate(responses_authors_copy):
#         responses_authors_copy[i] = response[0]

In [9]:
# open the JSON file with paper metadata from paperswithcode
with open("../data/papers_data.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)
# remove if Local PDF Path is None
papers_list = [paper for paper in papers_list if paper.get("Local PDF Path") is not None]
    
papers = papers_list
authors = []
datasets = []
tasks = []

# get the authors from papers
for i, paper in enumerate(papers):
    authors.append(paper['Authors'])
    datasets.append(paper['Datasets'])
    tasks.append(paper['Tasks'])

In [12]:
# remove from the task 
for i in range(len(tasks)):
    for j in range(len(tasks[i])):
        tasks[i][j] = deduplicate_fuzzy(tasks[i][j], threshold=80)

    tasks[i] = [sublist for sublist in tasks[i] if sublist]

In [15]:
responses_dataset_copy = predictions[0]
for i in range(len(responses_dataset_copy)):
    for j in range(len(responses_dataset_copy[i])):
        if isinstance(responses_dataset_copy[i][j], str):
            responses_dataset_copy[i][j] = ast.literal_eval(responses_dataset_copy[i][j])
        if len(responses_dataset_copy[i]) > 1:
            merged= list(chain.from_iterable(responses_dataset_copy[i]))
            responses_dataset_copy[i] = deduplicate_fuzzy(merged, threshold=80)
        else:
            responses_dataset_copy[i] = deduplicate_fuzzy(responses_dataset_copy[i][j], threshold=80)

SyntaxError: '[' was never closed (<unknown>, line 1)

In [None]:
references = [datasets, tasks]
predictions = [responses_dataset_copy, responses_task_copy]

In [None]:
autor_recalls = []
dataset_recalls = []
task_recalls = []
autor_precisions = []
dataset_precisions = []
task_precisions = []
autors_f1s = []
dataset_f1s = []
task_f1s = []
empty_references = {}
for i in range(len(predictions)):
    for j in range(len(predictions[i])):
        reference = references[i][j]
        prediction = predictions[i][j]

        if i != 0:
            # Flatten reference list into a single string (first elements or all)
            ref_text = [item[0] for item in reference]
            pred_text = prediction
        else:
            ref_text = [name.strip() for name in reference.split(',')]
            pred_text = ast.literal_eval(prediction)
        if len(ref_text) == 0:
            if i not in empty_references:
                empty_references[i] = []
            empty_references[i].append(j)

        if len(ref_text) > 0:
            if len(pred_text) == 0:
                recall = 0
                precision = 0
                f1 = 0
            else:
                # For each reference entity, find the max similarity to predicted entities
                max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)

                # Apply threshold
                threshold = 0.6
                tp = (max_similarities >= threshold).sum().item()
                fn = len(ref_text) - tp         # false negatives
                fp = len(pred_text) - tp    # false positives
                def safe_div(num, denom):
                    return num / denom if denom else 0.0        # or np.nan
                precision=safe_div(tp, tp + fp)
                recall=safe_div(tp, tp + fn)
                f1=safe_div(2 * precision * recall, precision + recall)
            if i == 0:
                autor_recalls.append(recall)
                autor_precisions.append(precision)
                autors_f1s.append(f1)
            elif i == 1:
                dataset_recalls.append(recall)
                dataset_precisions.append(precision)
                dataset_f1s.append(f1)
            elif i == 2:
                task_recalls.append(recall)
                task_precisions.append(precision)
                task_f1s.append(f1)
        # print(f"Reference: {ref_text}")
        # print(f"Prediction: {pred_text}")
        # print(f"Recall: {recall:.4f}")

In [None]:
# removing authors from the predictions
dataset_recalls = []
task_recalls = []
dataset_precisions = []
task_precisions = []
dataset_f1s = []
task_f1s = []
empty_references = {}
for i in range(len(predictions)):
    for j in range(len(predictions[i])):
        reference = references[i][j]
        prediction = predictions[i][j]

        ref_text = [item[0] for item in reference]
        pred_text = prediction
        if len(ref_text) == 0:
            if i not in empty_references:
                empty_references[i] = []
            empty_references[i].append(j)

        if len(ref_text) > 0:
            if len(pred_text) == 0:
                recall = 0
                precision = 0
                f1 = 0
            else:
                # For each reference entity, find the max similarity to predicted entities
                max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)

                # Apply threshold
                threshold = 0.6
                tp = (max_similarities >= threshold).sum().item()
                fn = len(ref_text) - tp         # false negatives
                fp = len(pred_text) - tp    # false positives
                def safe_div(num, denom):
                    return num / denom if denom else 0.0        # or np.nan
                precision=safe_div(tp, tp + fp)
                recall=safe_div(tp, tp + fn)
                f1=safe_div(2 * precision * recall, precision + recall)

            if i == 0:
                dataset_recalls.append(recall)
                dataset_precisions.append(precision)
                dataset_f1s.append(f1)
            elif i == 1:
                task_recalls.append(recall)
                task_precisions.append(precision)
                task_f1s.append(f1)
        # print(f"Reference: {ref_text}")
        # print(f"Prediction: {pred_text}")
        # print(f"Recall: {recall:.4f}")

In [78]:
empty_references

{0: [2, 6, 7, 10, 15, 16, 17, 21, 22, 23, 24, 25, 27, 29]}

In [68]:
for j in empty_references[1]:
    print(f'Title: {papers[j]["Title"]}')
    print(f'Paper{j}:',predictions[1][j])

Title: Adversarial Contrastive Estimation
Paper2: []
Title: Expeditious Generation of Knowledge Graph Embeddings
Paper6: ['AKSW-bib', 'DBpedia 2015-10']
Title: Learning Knowledge Graph Embeddings with Type Regularizer
Paper7: ['Freebase', 'FB15K']
Title: Inducing Interpretability in Knowledge Graph Embeddings
Paper10: ['FB15k-237']
Title: Knowledge-Based Distant Regularization in Learning Probabilistic Models
Paper15: ['World Bank Climate Change Knowledge Portal Dataset']
Title: Embedding Models for Episodic Knowledge Graphs
Paper16: ['GDELT', 'ICEWS']
Title: Seq2RDF: An end-to-end application for deriving Triples from Natural Language Text
Paper17: ['NYT', 'ADE', 'Wiki-DBpedia']
Title: DOLORES: Deep Contextualized Knowledge Graph Embeddings
Paper21: ['FB15K-237', 'WN11', 'FB13']
Title: Modelling Salient Features as Directions in Fine-Tuned Semantic Spaces
Paper22: ['movies', 'place-types', 'newsgroups', 'IMDB sentiment']
Title: Towards Understanding the Geometry of Knowledge Graph Emb

In [64]:
#mean_autor_recall = sum(autor_recalls) / len(autor_recalls)
mean_dataset_recall = sum(dataset_recalls) / len(dataset_recalls)
mean_task_recall = sum(task_recalls) / len(task_recalls)
#mean_autor_precision = sum(autor_precisions) / len(autor_precisions)
mean_dataset_precision = sum(dataset_precisions) / len(dataset_precisions)
mean_task_precision = sum(task_precisions) / len(task_precisions)
#mean_autor_f1 = sum(autors_f1s) / len(autors_f1s)
mean_dataset_f1 = sum(dataset_f1s) / len(dataset_f1s)
mean_task_f1 = sum(task_f1s) / len(task_f1s)

#print(f"Mean Author Recall: {mean_autor_recall:.4f}")
print(f"Mean Dataset Recall: {mean_dataset_recall:.4f}")
print(f"Mean Task Recall: {mean_task_recall:.4f}")
# print(f"Mean Author Precision: {mean_autor_precision:.4f}")
print(f"Mean Dataset Precision: {mean_dataset_precision:.4f}")
print(f"Mean Task Precision: {mean_task_precision:.4f}")
# print(f"Mean Author F1: {mean_autor_f1:.4f}")
print(f"Mean Dataset F1: {mean_dataset_f1:.4f}")
print(f"Mean Task F1: {mean_task_f1:.4f}")

Mean Dataset Recall: 0.1815
Mean Task Recall: 0.7202
Mean Dataset Precision: 0.3281
Mean Task Precision: 0.7124
Mean Dataset F1: 0.2116
Mean Task F1: 0.6292


In [34]:
mean_autor_recall = sum(autor_recalls) / len(autor_recalls)
mean_dataset_recall = sum(dataset_recalls) / len(dataset_recalls)
mean_task_recall = sum(task_recalls) / len(task_recalls)
mean_autor_precision = sum(autor_precisions) / len(autor_precisions)
mean_dataset_precision = sum(dataset_precisions) / len(dataset_precisions)
mean_task_precision = sum(task_precisions) / len(task_precisions)
mean_autor_f1 = sum(autors_f1s) / len(autors_f1s)
mean_dataset_f1 = sum(dataset_f1s) / len(dataset_f1s)
mean_task_f1 = sum(task_f1s) / len(task_f1s)

print(f"Mean Author Recall: {mean_autor_recall:.4f}")
print(f"Mean Dataset Recall: {mean_dataset_recall:.4f}")
print(f"Mean Task Recall: {mean_task_recall:.4f}")
print(f"Mean Author Precision: {mean_autor_precision:.4f}")
print(f"Mean Dataset Precision: {mean_dataset_precision:.4f}")
print(f"Mean Task Precision: {mean_task_precision:.4f}")
print(f"Mean Author F1: {mean_autor_f1:.4f}")
print(f"Mean Dataset F1: {mean_dataset_f1:.4f}")
print(f"Mean Task F1: {mean_task_f1:.4f}")

Mean Author Recall: 0.9422
Mean Dataset Recall: 0.8661
Mean Task Recall: 0.7821
Mean Author Precision: 0.8340
Mean Dataset Precision: 0.7542
Mean Task Precision: 0.4257
Mean Author F1: 0.8482
Mean Dataset F1: 0.7307
Mean Task F1: 0.4669


### Authors grobid


In [8]:
# Get the current working directory
current_dir = Path(os.getcwd())
parent_dir = current_dir.parent


with open("../data/papers_data.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)

# remove if Local PDF Path is None
papers = [paper for paper in papers_list if paper.get("Local PDF Path") is not None]
authors = []
datasets = []
tasks = []

# get the authors from papers
for i, paper in enumerate(papers):
    authors.append(paper['Authors'])
    datasets.append(paper['Datasets'])
    tasks.append(paper['Tasks'])
references = [authors, datasets, tasks]

In [9]:
current_dir = Path(".")
grobid = GrobidService(config_path="./Grobid/config.json")
authors_grobid = []

for paper in papers:
    pdf_path = str(current_dir / paper["Local PDF Path"])
    authors = grobid.extract_authors_from_pdf(pdf_path)
    authors_grobid.append(authors)
    print(authors)


GROBID server is up and running
['Yuyu Zhang', 'Hanjun Dai', 'Toraman Kamil', 'Le Song']
['Agustinus Kristiadi', 'Mohammad Asif Khan', 'Denis Lukovnikov', 'Jens Lehmann', 'Asja Fischer']
['Avishek Joey Bose', 'Huan Ling', 'Yanshuai Cao', 'Borealis Ai']
['Liwei Cai', 'William Yang Wang']
['Tim Dettmers', 'Pasquale Minervini', 'Pontus Stenetorp', 'Sebastian Riedel']
['Daniel Oñoro-Rubio', 'Mathias Niepert', 'Alberto García-Durán', 'Roberto González-Sánchez', 'Roberto J López-Sastre']
['Tommaso Soru', 'Stefano Ruberto', 'Diego Moussallem', 'André Valdestilhas', 'Alexander Bigerl', 'Edgard Marx', 'Diego Esteves']
['Bhushan Kotnis', 'Vivi Nastase']
['Bhushan Kotnis', 'Vivi Nastase']
['Wenhan Xiong', 'Thien Hoang', 'William Yang Wang']
['Tathagata Sengupta', 'Cibi Pragadeesh', 'Partha Pratim Talukdar']
['Armand Joulin', 'Piotr Bojanowski', 'Maximilian Nickel', 'Tomas Mikolov']
['Théo Trouillon', 'Maximilian Nickel']
['Muhao Chen', 'Yingtao Tian', 'Mohan Yang', 'Carlo Zaniolo']
['He He', 'Anu

In [13]:
autor_grobid_recalls = []
autor_grobid_precisions = []
autor_grobid_f1s = []


for i in range(len(authors_grobid)):
    reference = references[0][i]
    prediction = authors_grobid[i]


    ref_text = [name.strip() for name in reference.split(',')]
    pred_text = prediction

    if len(ref_text) == 0:
        recall = 1
        precision = 1
    elif len(pred_text) == 0:
        precision = 0
        recall = 0
    else:
        # For each reference entity, find the max similarity to predicted entities
        max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)

        # Apply threshold
        threshold = 0.6
        num_matched = (max_similarities >= threshold).sum().item()
        recall = num_matched / len(ref_text) 
        precision = num_matched / len(pred_text)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        autor_grobid_recalls.append(recall)
        autor_grobid_precisions.append(precision)
        autor_grobid_f1s.append(f1)

In [14]:
average_autor_grobid_recall = sum(autor_grobid_recalls) / len(autor_grobid_recalls)
average_autor_grobid_precision = sum(autor_grobid_precisions) / len(autor_grobid_precisions)
average_autor_grobid_f1 = sum(autor_grobid_f1s) / len(autor_grobid_f1s)
print(f"Mean Author Recall: {average_autor_grobid_recall:.4f}")
print(f"Mean Author Precision: {average_autor_grobid_precision:.4f}")
print(f"Mean Author F1: {average_autor_grobid_f1:.4f}")

Mean Author Recall: 0.9733
Mean Author Precision: 0.9778
Mean Author F1: 0.9669


In [None]:
pdf_path = str(current_dir /papers_list[11]['Local PDF Path'])

grobid = GrobidService(config_path="./Grobid/config.json")
tei = grobid.process_full_text(pdf_path)

raw_text = tei_to_full_raw_text(tei, remove_ref=True)

GROBID server is up and running


### dataset

In [None]:
import os
import json
dataset_mentions = []
with open('./datasets.json', "r") as file:
    dataset_mentions = json.load(file)

    

In [None]:
# Get the current working directory
current_dir = Path(os.getcwd())
recalls = []
precisions = []
f1s = []


grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[0:30]:
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)

    abstract = extract_abstract(tei)


    paper_title =  paper['Title'] 
    dataset_titles = []
    for data in dataset_mentions:
        dataset_titles.append(data)
    reference = paper['Datasets']
    ref_text = [item[0] for item in reference]
    if len(ref_text) == 0:
        print(f"No reference datasets found for paper: {paper_title}")
        continue
    if len(ref_text) > 1:
        print(f"Reference: {ref_text}")
        #print(f"Reference: {ref_text}")

        idx = closest_string_index(paper_title, dataset_titles)
        pred_text = dataset_mentions[dataset_titles[idx]]
        
        # if the pred_text is not in the best match section, then we don't have a match
        def filter_datasets_in_text(text, dataset_names):
            text_lower = text.lower()
            return [name for name in dataset_names if name.lower() in text_lower]
        pred_datasets = filter_datasets_in_text(abstract, pred_text)
        print(f"Predicted datasets: {pred_datasets}")
        pred_datasets = deduplicate_fuzzy(pred_datasets, threshold=80)
        print(f"Deduplicated predicted datasets: {pred_datasets}")
        #print(f"Predicted datasets: {pred_datasets}")
        if len(pred_datasets) == 0:
            recall = 0
            precision = 0
            f1 = 0
        else:

            # For each reference entity, find the max similarity to predicted entities
            max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)

            # Apply threshold
            threshold = 0.6
            num_matched = (max_similarities >= threshold).sum().item()
                
            tp = num_matched
            fn = len(ref_text) - tp         # false negatives
            fp = len(pred_datasets) - tp    # false positives

            # helper to avoid zero-division warnings à la scikit-learn
            def safe_div(num, denom):
                return num / denom if denom else 0.0        # or np.nan

            precision = safe_div(tp, tp + fp)
            recall    = safe_div(tp, tp + fn)
            f1        = safe_div(2 * precision * recall, precision + recall)
            
        recalls.append(recall)
        precisions.append(precision)
        f1s.append(f1)
        print(f"recall: {recall}")
        print(f"precision: {precision}")
        print(f"f1: {f1}")
mean_recall = sum(recalls) / len(recalls)
mean_precision = sum(precisions) / len(precisions)
mean_f1 = sum(f1s) / len(f1s)
print(f"Mean Recall: {mean_recall:.4f}")    
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean F1: {mean_f1:.4f}")

GROBID server is up and running
Reference: ['ARC (AI2 Reasoning Challenge)', 'SNLI', 'SQuAD']
Predicted datasets: ['ARC Challenge Set', 'ARC Challenge', 'AI2 Reasoning Challenge (ARC', 'AI2 Reasoning Challenge (ARC)']
Deduplicated predicted datasets: ['ARC Challenge Set', 'AI2 Reasoning Challenge (ARC']
recall: 0.3333333333333333
precision: 0.5
f1: 0.4
GROBID server is up and running
Reference: ['FB15k-237', 'FB15k']
Predicted datasets: []
Deduplicated predicted datasets: []
recall: 0
precision: 0
f1: 0
GROBID server is up and running
No reference datasets found for paper: Adversarial Contrastive Estimation
GROBID server is up and running
Reference: ['WN18RR', 'FB15k', 'WN18', 'FB15k-237']
Predicted datasets: ['FB15k-237', 'KBGAN', 'DISTMULT', 'WN18', 'WN18RR']
Deduplicated predicted datasets: ['FB15k-237', 'KBGAN', 'DISTMULT', 'WN18']
recall: 1.0
precision: 1.0
f1: 1.0
GROBID server is up and running
Reference: ['YAGO', 'YAGO3-10', 'UMLS', 'WN18RR', 'FB15k', 'FB15k-237', 'WN18']
Predi

In [None]:
# get only the dataset that appears in the section text we want
from grobid_client.grobid_client import GrobidClient

from pathlib import Path
import os
# Get the current working directory
current_dir = Path(os.getcwd())
recalls = []
precisions = []
f1s = []

for paper in papers_list[0:30]:
    pdf_path = str(current_dir /paper['Local PDF Path'])

    client = GrobidClient(config_path="./Grobid/config.json")       # uses defaults: localhost:8070

    tei_xml = client.process_pdf(
        service="processFulltextDocument",
        pdf_file=pdf_path,
        generateIDs        = False,   # was optional, now required
        consolidate_header = True,  # same default as server
        consolidate_citations = False,
        include_raw_citations   = False,
        include_raw_affiliations = False,
        segment_sentences   = False,
        tei_coordinates     = False
    )

    _, status, tei = tei_xml

    sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"]) # get the most similar sections to the queries
    best_match_section, best_score = ranked_sections[0]
    #print(f"Best matching section: {best_match_section} with score {best_score:.4f}")
    best_match_section_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    paper_title =  paper['Title'] 
    dataset_titles = []
    for data in dataset_mentions:
        dataset_titles.append(data)
    reference = paper['Datasets']
    ref_text = [item[0] for item in reference]
    if len(ref_text) == 0:
        print(f"No reference datasets found for paper: {paper_title}")
        continue
    if len(ref_text) > 1:
        print(f"Reference: {ref_text}")
        #print(f"Reference: {ref_text}")

        idx = closest_string_index(paper_title, dataset_titles)
        pred_text = dataset_mentions[dataset_titles[idx]]
        
        # if the pred_text is not in the best match section, then we don't have a match
        def filter_datasets_in_text(text, dataset_names):
            text_lower = text.lower()
            return [name for name in dataset_names if name.lower() in text_lower]
        pred_datasets = filter_datasets_in_text(best_match_section_text, pred_text)
        print(f"Predicted datasets: {pred_datasets}")
        pred_datasets = deduplicate_fuzzy(pred_datasets, threshold=80)
        print(f"Deduplicated predicted datasets: {pred_datasets}")
        #print(f"Predicted datasets: {pred_datasets}")
        if len(pred_datasets) == 0:
            recall = 0
            precision = 0
            f1 = 0
        else:

            # For each reference entity, find the max similarity to predicted entities
            max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)

            # Apply threshold
            threshold = 0.6
            num_matched = (max_similarities >= threshold).sum().item()
                
            tp = num_matched
            fn = len(ref_text) - tp         # false negatives
            fp = len(pred_datasets) - tp    # false positives

            # helper to avoid zero-division warnings à la scikit-learn
            def safe_div(num, denom):
                return num / denom if denom else 0.0        # or np.nan

            precision = safe_div(tp, tp + fp)
            recall    = safe_div(tp, tp + fn)
            f1        = safe_div(2 * precision * recall, precision + recall)
            
        recalls.append(recall)
        precisions.append(precision)
        f1s.append(f1)
        print(f"recall: {recall}")
        print(f"precision: {precision}")
        print(f"f1: {f1}")
mean_recall = sum(recalls) / len(recalls)
mean_precision = sum(precisions) / len(precisions)
mean_f1 = sum(f1s) / len(f1s)
print(f"Mean Recall: {mean_recall:.4f}")    
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean F1: {mean_f1:.4f}")

GROBID server is up and running
Reference: ['ARC (AI2 Reasoning Challenge)', 'SNLI', 'SQuAD']
Predicted datasets: ['ARC Challenge Set', 'ARC Challenge', 'ARC Corpus']
Deduplicated predicted datasets: ['ARC Challenge Set', 'ARC Corpus']
recall: 0.3333333333333333
precision: 0.5
f1: 0.4
GROBID server is up and running
Reference: ['FB15k-237', 'FB15k']
Predicted datasets: ['15k', 'YAGO3-10', 'Freebase', 'FB15k-237', 'MTKGNN', 'YAGO3 knowledge graph', 'FB15k']
Deduplicated predicted datasets: ['15k', 'YAGO3-10', 'Freebase', 'FB15k-237', 'MTKGNN', 'YAGO3 knowledge graph', 'FB15k']
recall: 1.0
precision: 0.2857142857142857
f1: 0.4444444444444445
GROBID server is up and running
No reference datasets found for paper: Adversarial Contrastive Estimation
GROBID server is up and running
Reference: ['WN18RR', 'FB15k', 'WN18', 'FB15k-237']
Predicted datasets: ['FB15k-237', 'KBGAN', 'WN18 dataset', 'DISTMULT', 'WN18', 'WN18RR']
Deduplicated predicted datasets: ['FB15k-237', 'KBGAN', 'WN18 dataset', '

In [None]:
# get best match section tei

from grobid_client.grobid_client import GrobidClient

from pathlib import Path
import os
import re
from lxml import etree

# Get the current working directory
current_dir = Path(os.getcwd())
recalls = []
precisions = []
f1s = []
from bs4 import BeautifulSoup




grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[0:2]:
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)

    if status != 200:
        print(f"Error processing {pdf_path}: GROBID returned status {status}")
        continue

    try:
        etree.fromstring(tei.encode("utf-8"))  # or etree.XML(tei)
        print(f"{paper['Title']}: TEI is well-formed.")
    except etree.XMLSyntaxError as e:
        print(f"{paper['Title']}: TEI is malformed! Error: {e}")


    sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"]) # get the most similar sections to the queries
    best_match_section, best_score = ranked_sections[0]
    #print(f"Best matching section: {best_match_section} with score {best_score:.4f}")
    best_match_section_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # save the tei in a file inside a folder 
    tei_folder = current_dir / "tei_sections"
    tei_folder.mkdir(exist_ok=True)

    safe_title = re.sub(r'[^a-zA-Z0-9_\- ]+', '', paper['Title']).strip().replace(' ', '_')
    tei_file_path = tei_folder / f"{safe_title}.xml"

    with open(tei_file_path, "w", encoding="utf-8") as tei_file:
        tei_file.write(tei)
    print(f"TEI file saved to: {tei_file_path}")
 

GROBID server is up and running
KG^2: Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings: TEI is well-formed.
TEI file saved to: /mnt/c/Users/Che/GAP-KGE/tei_sections/KG2_Learning_to_Reason_Science_Exam_Questions_with_Contextual_Knowledge_Graph_Embeddings.xml
GROBID server is up and running
Incorporating Literals into Knowledge Graph Embeddings: TEI is well-formed.
TEI file saved to: /mnt/c/Users/Che/GAP-KGE/tei_sections/Incorporating_Literals_into_Knowledge_Graph_Embeddings.xml


In [11]:
papers_list[0]

{'Title': 'KG^2: Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings',
 'Authors': 'Hanjun Dai, Le Song, Kamil Toraman, Yuyu Zhang',
 'Abstract': 'The AI2 Reasoning Challenge (ARC), a new benchmark dataset for question\nanswering (QA) has been recently released. ARC only contains natural science\nquestions authored for human exams, which are hard to answer and require\nadvanced logic reasoning. On the ARC Challenge Set, existing state-of-the-art\nQA systems fail to significantly outperform random baseline, reflecting the\ndifficult nature of this task. In this paper, we propose a novel framework for\nanswering science exam questions, which mimics human solving process in an\nopen-book exam. To address the reasoning challenge, we construct contextual\nknowledge graphs respectively for the question itself and supporting sentences.\nOur model learns to reason with neural embeddings of both knowledge graphs.\nExperiments on the ARC Challenge Set show that o

In [12]:
best_match_section

'Experiments'

In [None]:
from bs4 import BeautifulSoup
import re

def normalize_title(title):
    # Normalize by stripping case and numbers like "2 RELATED WORK" → "related work"
    return re.sub(r"^\d+\.?\s*", "", title.strip().lower())


def keep_only_target_section(tei_xml: str, target_title: str) -> str:
    soup = BeautifulSoup(tei_xml, "xml")

    norm_target = normalize_title(target_title)

    body = soup.find("body")
    if body is None:
        raise ValueError("TEI does not contain a <body> section.")

    all_divs = body.find_all("div", recursive=False)

    collected_divs = []
    capture = False
    target_prefix = None

    for div in all_divs:
        head = div.find("head")
        if not head:
            continue

        head_text = head.get_text()
        norm_head = normalize_title(head_text)
        n_attr = head.get("n")

        # Detect start of target section
        if not capture and norm_head == norm_target:
            collected_divs.append(div)
            capture = True
            # Get the numeric prefix for nested matching (e.g., "5" from "5.1")
            if n_attr:
                target_prefix = n_attr.split(".")[0]
            continue

        # If capturing, collect subsections like "5.1", "5.2"
        if capture:
            if n_attr and target_prefix and n_attr.startswith(target_prefix + "."):
                collected_divs.append(div)
            else:
                break  # Stop capturing once outside the target section

    if not collected_divs:
        raise ValueError(f"No section found with title matching '{target_title}'.")

    body.clear()
    for div in collected_divs:
        body.append(div)

    return soup.prettify()



In [None]:

filtered_xml = keep_only_target_section(tei, best_match_section)

# Define output directory and file
output_dir = Path("output_sections")  # use your desired path here
output_file = output_dir / "experiments_section.tei.xml"

# Create the directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Write the result to a file
with open(output_file, "w", encoding="utf-8") as f:
    f.write(filtered_xml)

print(f"Saved to {output_file}")

Saved to output_sections/experiments_section.xml


In [73]:
mini_tei = keep_only_target_section(tei, best_match_section)
mini_tei

'<?xml version="1.0" encoding="utf-8"?>\n<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd">\n <teiHeader xml:lang="en">\n  <fileDesc>\n   <titleStmt>\n    <title level="a" type="main">\n     KG 2 : Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings\n    </title>\n   </titleStmt>\n   <publicationStmt>\n    <publisher/>\n    <availability status="unknown">\n     <licence/>\n    </availability>\n    <date type="published" when="2018-05-31">\n     31 May 2018\n    </date>\n   </publicationStmt>\n   <sourceDesc>\n    <biblStruct>\n     <analytic>\n      <author>\n       <persName>\n        <forename type="first">\n         Yuyu\n        </forename>\n        <surname>\n         Zhang\n        </surname>\n       <

In [23]:
for i in dataset_mentions:
    dataset_mentions[i] = deduplicate_fuzzy(dataset_mentions[i], threshold=80)
    print(f"Dataset: {i}, Mentions: {dataset_mentions[i]}")


Dataset: Adaptive_Margin_Ranking_Loss_for_Knowledge_Graph_Embeddings_via_a_Correntropy_Objective_Function, Mentions: ['DBpedia', 'NELL', 'Freebase', 'UNSTRUC- TURES', 'WordNet', 'Yago']
Dataset: Adversarial_Contrastive_Estimation, Mentions: ['WordSim- 353 dataset', 'Rare word dataset', 'Rare Word and WordSim353 data', 'TransD', 'WN18 dataset', 'WordSim- 353']
Dataset: Analysis_of_the_Impact_of_Negative_Sampling_on_Link_Prediction_in_Knowledge_Graphs, Mentions: ['FB15k dataset', 'WordNet data', 'FB', 'WordNet lexi- cal database', 'Freebase', 'wordnet', 'WN', 'FB15K benchmark dataset', 'WN18 dataset', 'RESCAL', 'WordNet', 'train and development', 'Freebase dataset', 'FB15k']
Dataset: Answering_Visual-Relational_Queries_in_Web-Extracted_Knowledge_Graphs, Mentions: ['DBpedia', 'ILSVRC2012 data set', 'ImageNet', 'Freebase', 'VisualGenome datasets', 'VisualGenome project', 'ImageGraph', 'WordNet', 'FB15k']
Dataset: Augmenting_and_Tuning_Knowledge_Graph_Embeddings, Mentions: ['FB15K', 'WordNe

In [None]:
titles = []
recalls = []
precisions = []
f1s = []
for i, paper in enumerate(papers_list):
    title = paper['Title']
    titles.append(title)
for data in dataset_mentions:
    # find the closest match in the papers_list[i]['title']
    idx = closest_string_index(data, titles)
    pred_text = dataset_mentions[data]
    reference = papers_list[idx]['Datasets']
    ref_text = [item[0] for item in reference]
    if len(ref_text) == 0:
            continue
    if len(ref_text) > 0:
        if len(pred_text) == 0:
            recall = 0
            precision = 0
            f1 = 0
        else:

            # For each reference entity, find the max similarity to predicted entities
            max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)

            # Apply threshold
            threshold = 0.6
            tp = (max_similarities >= threshold).sum().item()
            fn = len(ref_text) - tp         # false negatives
            fp = len(pred_text) - tp    # false positives
            def safe_div(num, denom):
                return num / denom if denom else 0.0        # or np.nan
            precision=safe_div(tp, tp + fp)
            recall=safe_div(tp, tp + fn)
            f1=safe_div(2 * precision * recall, precision + recall)
            recalls.append(recall)
            precisions.append(precision)
            f1s.append(f1)
mean_recall = sum(recalls) / len(recalls)
mean_precision = sum(precisions) / len(precisions)
mean_f1 = sum(f1s) / len(f1s)
print(f"Mean Recall: {mean_recall:.4f}")    
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean F1: {mean_f1:.4f}")

Mean Recall: 0.7329
Mean Precision: 0.2733
Mean F1: 0.3738


In [None]:
import requests

with open("../data/papers_data.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)

# === Setup ===
client = GrobidClient(config_path="./Grobid/config.json")
current_dir = Path(os.getcwd())
recalls = []
precisions = [] 
f1s = []

dataset_files = {}
for paper in papers_list[0:30]:
    print(f"Processing paper: {paper['Title']}")
    pdf_path = str(current_dir /paper['Local PDF Path'])

    # client.process(
    #     "processFulltextDocument",                # or "processHeaderDocument"
    #     str(current_dir /paper['Local PDF Path']),
    #     generateIDs=True,                         # adds xml:id attributes
    #     consolidate_citations=True                # cross‑links refs ↔ bibliography
    # )
    tei_xml = client.process_pdf(
        service="processFulltextDocument",
        pdf_file=pdf_path,
        generateIDs        = False,   # was optional, now required
        consolidate_header = True,  # same default as server
        consolidate_citations = False,
        include_raw_citations   = False,
        include_raw_affiliations = False,
        segment_sentences   = False,
        tei_coordinates     = False
    )

    _, status, tei = tei_xml

    sections = extract_flat_sections_with_subtext(tei)
    ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model)
    best_match_section, best_score = ranked_sections[0]
    best_match_section_text = str(sections[[sec['title'] for sec in sections].index(best_match_section)]['text'])
    # print best text
    print(f"Best matching section: {best_match_section} with score {best_score:.4f}")
    

    response = requests.post(
        "http://localhost:8060/service/annotateDatasetSentence",
        data={"text": best_match_section_text}
    )


    if response.status_code == 200:
        # with open("result.json", "w", encoding="utf-8") as f:
        #     f.write(response.text)
        data = response.json()
        dataset_names = [
            mention['normalizedForm']
            for mention in data.get('mentions', [])
            if mention.get('type') == 'dataset-name'
        ]

        # optionally, deduplicate
        unique_dataset_names = list(set(dataset_names))


        unique_dataset_names = deduplicate_fuzzy(unique_dataset_names, threshold=80)
        print(f"Mentions: {unique_dataset_names}")

        pred_text = unique_dataset_names
        reference = paper['Datasets']
        ref_text = [item[0] for item in reference]
        print(f"Reference: {ref_text}")
        if len(ref_text) == 0:
                continue
        if len(ref_text) > 0:
            if len(pred_text) == 0:
                recall = 0
                precision = 0
                f1 = 0
            else:
                # For each reference entity, find the max similarity to predicted entities
                max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)
                # Apply threshold
                threshold = 0.6
                tp = (max_similarities >= threshold).sum().item()
                fn = len(ref_text) - tp         # false negatives
                fp = len(pred_text) - tp    # false positives
                def safe_div(num, denom):
                    return num / denom if denom else 0.0        # or np.nan
                precision=safe_div(tp, tp + fp)
                recall=safe_div(tp, tp + fn)
                f1=safe_div(2 * precision * recall, precision + recall)
                recalls.append(recall)
                precisions.append(precision)
                f1s.append(f1)

    else:
        print(f"Error: {response.status_code}")

mean_recall = sum(recalls) / len(recalls)
mean_precision = sum(precisions) / len(precisions)
mean_f1 = sum(f1s) / len(f1s)
print(f"Mean Recall: {mean_recall:.4f}")    
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean F1: {mean_f1:.4f}")


GROBID server is up and running
Processing paper: KG^2: Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings
Best matching section: Experiments with score 1.0000
Mentions: ['ARC Challenge Set', 'ARC Corpus']
Reference: ['ARC (AI2 Reasoning Challenge)', 'SNLI', 'SQuAD']
Processing paper: Incorporating Literals into Knowledge Graph Embeddings
Best matching section: Experiments with score 1.0000
Mentions: []
Reference: ['FB15k-237', 'FB15k']
Processing paper: Adversarial Contrastive Estimation
Best matching section: Experiments with score 1.0000
Mentions: ['WordSim-353', 'Rare word dataset']
Reference: []
Processing paper: KBGAN: Adversarial Learning for Knowledge Graph Embeddings
Best matching section: Experiments with score 1.0000
Mentions: ['WN18', 'FB15k-237', 'WN', 'FB']
Reference: ['WN18RR', 'FB15k', 'WN18', 'FB15k-237']
Processing paper: Convolutional 2D Knowledge Graph Embeddings
Best matching section: Experimental Setup with score 0.6321
Mentions: 

In [12]:
unique_dataset_names

['ARC Challenge Set', 'ARC Corpus']

In [38]:
unique_dataset_names = list(set(dataset_names))
for i in range(len(unique_dataset_names)):
    print(i)

0
1


In [27]:
import os
import json
dataset_mentions = []
with open('./datasets.json', "r") as file:
    dataset_mentions = json.load(file)

In [28]:
dataset_mentions

{'Adaptive_Margin_Ranking_Loss_for_Knowledge_Graph_Embeddings_via_a_Correntropy_Objective_Function': ['DBpedia',
  'NELL',
  'Freebase',
  'UNSTRUC- TURES',
  'WordNet',
  'Yago'],
 'Adversarial_Contrastive_Estimation': ['WordSim- 353 dataset',
  'Rare word dataset',
  'Rare Word and WordSim353 data',
  'TransD',
  'WN18 dataset',
  'WordSim- 353'],
 'Analysis_of_the_Impact_of_Negative_Sampling_on_Link_Prediction_in_Knowledge_Graphs': ['FB15k dataset',
  'WordNet data',
  'FB',
  'WordNet lexi- cal database',
  'Freebase',
  'wordnet',
  'WN',
  'WordNet dataset',
  'FB15K benchmark dataset',
  'WN18 dataset',
  'RESCAL',
  'WordNet',
  'train and development',
  'Freebase dataset',
  'FB15k'],
 'Answering_Visual-Relational_Queries_in_Web-Extracted_Knowledge_Graphs': ['DBpedia',
  'ILSVRC2012 data set',
  'ImageNet',
  'Freebase',
  'VisualGenome datasets',
  'FreeBase',
  'VisualGenome project',
  'ImageGraph',
  'VisualGenome data',
  'WordNet',
  'FB15k'],
 'Augmenting_and_Tuning_Kn

### task ner

In [None]:

# Get the current working directory
current_dir = Path(os.getcwd())
responses_dataset = []
responses_task = []
responses_authors = []
from bs4 import BeautifulSoup

grobid = GrobidService(config_path="./Grobid/config.json")
for paper in papers_list[0:30]:
    pdf_path = str(current_dir /paper['Local PDF Path'])

    tei = grobid.process_full_text(pdf_path)


    raw_text = tei_to_full_raw_text(tei, remove_ref=True)

In [65]:
import spacy
nlp = spacy.load("en_core_web_sm")

def parse_tei(tei_text):
    soup = BeautifulSoup(tei_text, "xml")

    words = []
    sentences = []
    sections = []

    token_idx = 0
    for div in soup.find_all("div"):
        sec_start = token_idx
        for p in div.find_all("p"):
            text = p.get_text()
            doc = nlp(text)
            for sent in doc.sents:
                s_start = token_idx
                for token in sent:
                    words.append(token.text)
                    token_idx += 1
                s_end = token_idx
                sentences.append([s_start, s_end])
        sec_end = token_idx
        if sec_end > sec_start:
            sections.append([sec_start, sec_end])
    
    return {
        "doc_id": soup.find("teiHeader").find("idno").text if soup.find("idno") else "unknown_doc",
        "words": words,
        "sentences": sentences,
        "sections": sections
    }

In [66]:
def parse_sections(section_texts, nlp):
    words = []
    sentences = []
    sections = []

    token_idx = 0
    for text in section_texts:
        sec_start = token_idx
        doc = nlp(text)
        for sent in doc.sents:
            s_start = token_idx
            for token in sent:
                words.append(token.text)
                token_idx += 1
            s_end = token_idx
            sentences.append([s_start, s_end])
        sec_end = token_idx
        if sec_end > sec_start:
            sections.append([sec_start, sec_end])

    return {
        "doc_id": "from_sections",
        "words": words,
        "sentences": sentences,
        "sections": sections
    }

In [42]:
section_texts = [sec['text'] for sec in sections]

In [None]:
with open("../data/papers_data.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)
from grobid_client.grobid_client import GrobidClient
# === Setup ===
client = GrobidClient(config_path="./Grobid/config.json")
current_dir = Path(os.getcwd())
output_jsonl_path = str(current_dir) + "/SciREX-master/scirex_format_abstract.jsonl"


# === Processing Loop ===
with open(output_jsonl_path, "w") as out_f:
    for idx, paper in enumerate(papers_list[:30]):
        pdf_path = str(current_dir / paper['Local PDF Path'])

        try:
            tei_xml = client.process_pdf(
                service="processFulltextDocument",
                pdf_file=pdf_path,
                generateIDs=True,  # ✅ this is now required
                consolidate_header=True,
                consolidate_citations=False,
                include_raw_citations=False,
                include_raw_affiliations=False,
                segment_sentences=True,
                tei_coordinates=False
            )
            _, status, tei = tei_xml
            abstract = [extract_abstract(tei)]
            parsed = parse_sections(abstract, nlp)

            # parsed = parse_tei(tei)
            parsed["doc_id"] = f"doc_{idx:04d}"  # doc_0000, doc_0001, ...
            json.dump(parsed, out_f)
            out_f.write("\n")
        except Exception as e:
            print(f"Failed to process {pdf_path}: {e}")


GROBID server is up and running


In [None]:
with open("../data/papers_data.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)

# === Setup ===
client = GrobidClient(config_path="./Grobid/config.json")
current_dir = Path(os.getcwd())
output_jsonl_path = str(current_dir) + "/SciREX-master/scirex_format_experiment.jsonl"


# === Processing Loop ===
with open(output_jsonl_path, "w") as out_f:
    for idx, paper in enumerate(papers_list[:30]):
        pdf_path = str(current_dir / paper['Local PDF Path'])

        try:
            tei_xml = client.process_pdf(
                service="processFulltextDocument",
                pdf_file=pdf_path,
                generateIDs=True,  # ✅ this is now required
                consolidate_header=True,
                consolidate_citations=False,
                include_raw_citations=False,
                include_raw_affiliations=False,
                segment_sentences=True,
                tei_coordinates=False
            )
            _, status, tei = tei_xml
            sections = extract_flat_sections_with_subtext(tei)
            ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"])
            best_match_section, best_score = ranked_sections[0]
            best_match_section_text = [sections[[sec['title'] for sec in sections].index(best_match_section)]['text']]
            parsed = parse_sections(best_match_section_text, nlp)

            # parsed = parse_tei(tei)
            parsed["doc_id"] = f"doc_{idx:04d}"  # doc_0000, doc_0001, ...
            json.dump(parsed, out_f)
            out_f.write("\n")
        except Exception as e:
            print(f"Failed to process {pdf_path}: {e}")


GROBID server is up and running


In [43]:
with open("papers_data.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)

In [61]:
# Get the current working directory
current_dir = Path(os.getcwd())

with open(str(current_dir) + '/SciREX-master/test_outputs/pdfs/ner_predictions_abstract.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

In [62]:
first_doc = data[0]

doc_id = first_doc["doc_id"]
words = first_doc["words"]
ner_spans = first_doc.get("ner", [])

print(f"First Document ID: {doc_id}")
for start, end, label in ner_spans:
    span_text = " ".join(words[start:end])
    print(f"  - {label}: {span_text}")

First Document ID: doc_0000
  - Method: neural models
  - Material: SQuAD
  - Material: SNLI


In [64]:
tasks_per_doc = {}

for entry in data:
    doc_id = entry["doc_id"]
    words = entry["words"]
    ner_spans = entry.get("ner", [])

    tasks = set()
    for start, end, label in ner_spans:
        if label == "Task":
            span_text = " ".join(words[start:end])
            tasks.add(span_text)
    tasks = list(tasks)

    tasks_per_doc[doc_id] = tasks

In [65]:
for i in tasks_per_doc:
    tasks_per_doc[i] = deduplicate_fuzzy(tasks_per_doc[i], threshold=80)

In [66]:
for doc_id, tasks in tasks_per_doc.items():
    print(f"{doc_id}:")
    for task in tasks:
        print(f"  - {task}")

doc_0000:
doc_0001:
doc_0002:
doc_0003:
doc_0004:
doc_0005:
doc_0006:
doc_0007:
doc_0008:
doc_0009:
doc_0010:
doc_0011:
doc_0012:
doc_0013:
doc_0014:
doc_0015:
doc_0016:
doc_0017:
doc_0018:
doc_0019:
doc_0020:
doc_0021:
doc_0022:
doc_0023:
doc_0024:
doc_0025:
doc_0026:
doc_0027:
doc_0028:
doc_0029:


In [None]:
titles = []
recalls = []
precisions = []
f1s = []
for i, paper in enumerate(papers_list):
    title = paper['Title']
    titles.append(title)
for idx, data in enumerate(tasks_per_doc):

    pred_text = tasks_per_doc[data]
    reference = papers_list[idx]['Tasks']
    ref_text = [item[0] for item in reference]
    if len(ref_text) == 0:
        recall = 1
    elif len(pred_text) == 0:
        recall = 0
        precision = 0
        f1 = 0
    else:
        # For each reference entity, find the max similarity to predicted entities
        max_similarities = compute_max_similarity(ref_text, pred_text, sim_model)

        # Apply threshold
        threshold = 0.6
        num_matched = (max_similarities >= threshold).sum().item()
        tp = num_matched
        fn = len(ref_text) - tp         # false negatives
        fp = len(pred_text) - tp    # false positives

        # helper to avoid zero-division warnings à la scikit-learn
        def safe_div(num, denom):
            return num / denom if denom else 0.0        # or np.nan

        precision=safe_div(tp, tp + fp)
        recall=safe_div(tp, tp + fn)
        f1=safe_div(2 * precision * recall, precision + recall)
    
    recalls.append(recall)
    precisions.append(precision)
    f1s.append(f1)

    print(f"Reference: {ref_text}")
    print(f"Prediction: {pred_text}")
    print(f"Recall: {recall:.4f}")
    print(f"Precision: {precision:.4f}")

Reference: ['ARC', 'Question Answering', 'Knowledge Graphs', 'AI2 Reasoning Challenge', 'Knowledge Graph Embeddings']
Prediction: ['model evaluation', 'multiple - choice QA', 'span prediction QA', 'reasoning', 'sentence - level entailment', 'IR - ARC', 'IR - Google']
Recall: 0.2000
Precision: 0.1429
Reference: ['Knowledge Graphs', 'Knowledge Graph Embeddings', 'Link Prediction', 'Entity Embeddings']
Prediction: ['link prediction task']
Recall: 0.2500
Precision: 1.0000
Reference: ['Contrastive Learning', 'Knowledge Graphs', 'Knowledge Graph Embeddings', 'Word Embeddings', 'Learning Word Embeddings']
Prediction: ['link prediction task', 'MRR', 'knowledge graph embeddings', 'ablation study', 'hypernym prediction task', 'ACE.We', 'order embeddings', 'word similarity tasks']
Recall: 0.8000
Precision: 0.5000
Reference: ['Graph Embedding', 'Knowledge Base Completion', 'Knowledge Graphs', 'Knowledge Graph Embeddings', 'Link Prediction', 'Knowledge Graph Embedding']
Prediction: ['link predictio

In [50]:
avg_recall = sum(recalls) / len(recalls)
avg_precision = sum(precisions) / len(precisions)
avg_f1 = sum(f1s) / len(f1s)

print(f"Average Recall: {avg_recall:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average F1: {avg_f1:.4f}")

Average Recall: 0.3132
Average Precision: 0.3028
Average F1: 0.2831


### Model question

In [5]:
# open the CSV file
import pandas as pd
df = pd.read_csv("index.csv")
df.head(5)

Unnamed: 0,category,url,filename
0,Semantic matching models,https://arxiv.org/pdf/1901.09590,model_type_pdfs/1901.09590.pdf
1,Semantic matching models,https://www.cip.ifi.lmu.de/~nickel/data/slides...,model_type_pdfs/slides-icml2011.pdf
2,Semantic matching models,https://proceedings.neurips.cc/paper/2012/file...,model_type_pdfs/0a1bf96b7165e962e90cb14648c946...
3,Semantic matching models,https://arxiv.org/pdf/1506.00999,model_type_pdfs/1506.00999.pdf
4,Semantic matching models,https://proceedings.mlr.press/v70/liu17d/liu17...,model_type_pdfs/liu17d.pdf


In [7]:
for i in range(len(df)):
    paper_filename = df.iloc[i]['filename']
    print(f"Paper {i}: {paper_filename}")

Paper 0: model_type_pdfs/1901.09590.pdf
Paper 1: model_type_pdfs/slides-icml2011.pdf
Paper 2: model_type_pdfs/0a1bf96b7165e962e90cb14648c9462d-Paper.pdf
Paper 3: model_type_pdfs/1506.00999.pdf
Paper 4: model_type_pdfs/liu17d.pdf
Paper 5: model_type_pdfs/1412.6575.pdf
Paper 6: model_type_pdfs/trouillon16.pdf
Paper 7: model_type_pdfs/1802.04868.pdf
Paper 8: model_type_pdfs/ds-paper-620.pdf
Paper 9: model_type_pdfs/1705.10744.pdf
Paper 10: model_type_pdfs/1805.02408.pdf
Paper 11: model_type_pdfs/lacroix18a.pdf
Paper 12: model_type_pdfs/1912.02686.pdf
Paper 13: model_type_pdfs/1904.10281.pdf
Paper 14: model_type_pdfs/1910.11583.pdf
Paper 15: model_type_pdfs/b337e84de8752b27eda3a12363109e80-Paper.pdf
Paper 16: model_type_pdfs/45634.pdf
Paper 17: model_type_pdfs/1603.07704.pdf
Paper 18: model_type_pdfs/1808.04122.pdf
Paper 19: model_type_pdfs/1703.06103.pdf
Paper 20: model_type_pdfs/1911.03082.pdf
Paper 21: model_type_pdfs/1711.04071.pdf
Paper 22: model_type_pdfs/85.pdf
Paper 23: model_type_

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
max_context_tokens = 32768 - 2048

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
model_question = "Given the model definitions mentioned before, choose one of the following taxonomy as the taxonomy of the model mentioned in this paper: Semantic matching model, Translation models, Internal side information inside KGs model, External extra information outside KGs or Other KGC Technologies ? "

In [None]:
model_context = '''The semantic matching models and The translation models only use the structure information of internal facts in KGs. The semantic matching models generally use semantic matching-based scoring functions and further consists of tensor/matrix factorization models and neural network models. The translation models apply distance-based scoring functions.
While Internal side information inside KGs and External extra information outside KGs outside KGs cooperate with additional information (the inside or outside information of KGs except for the structure information) to achieve KGC. Internal side information inside KGs involved in KGs, including node attributes information, entity-related information, relation-related information, neighborhood information, relational path information; External extra information outside KGs outside KGs, mainly including two aspects: rule-based KGC and third-party data sources-based KGC. 
And if it is not any of the previous models, then it is Other KGC technologies.'''
# model_context = '''Structure information-based KGC methods: which only use the structure information of internal facts in KGs. For this category, KGC is reviewed under semantic matching models and translation models according to the nature of their scoring functions. The semantic matching models generally use semantic matching-based scoring functions and further consists of tensor/matrix factorization models and neural network models. The translation models apply distance-based scoring function; 
# Additional information-based KGC methods: which cooperate with additional information (the inside or outside information of KGs except for the structure information) to achieve KGC. For this category, we further propose fine-grained taxonomies respective into two views about the usage of inside information or outside information: Internal side information inside KGs involved in KGs, including node attributes information, entity-related information, relation-related information, neighborhood information, relational path information; External extra information outside KGs outside KGs, mainly including two aspects: rule-based KGC and third-party data sources-based KGC.
# And if it is not any of the previous models, then it is Other KGC technologies.'''


In [None]:
import time
# Get the current working directory
current_dir = Path(os.getcwd())
responses_model = []

labels = []
answers = []
grobid = GrobidService(config_path="./Grobid/config.json")
for i in range(10): #len(df)
    paper_filename = df.iloc[i]['filename']
    label = df.iloc[i]['category']
    print("Processing paper:", paper_filename)
    start = time.time()
    pdf_path = str(current_dir/paper_filename)

    tei = grobid.process_full_text(pdf_path)
    print("Grobid processing took:", time.time() - start, "seconds")

    # raw_text = extract_abstract(tei)

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    raw_text = tei_to_full_raw_text(tei, remove_ref=True)

    # Example Usage
    chunks = chunk_text(raw_text, tokenizer, max_tokens=max_context_tokens, overlap=200)



    # Select which chunks to run

    chunks_to_process = chunks

    # Loop over the chosen chunks
    for j, chunk in enumerate(chunks_to_process):
        # Build the chat history
        chat = [
            {
                "role": "system",
                "content": "You are an assistant for question-answering tasks. Use only the provided context information to form your response."
            },
            {"role": "user", "content": f"Context chunk: {chunk}"
            },
            {
            "role": "user",
            "content": (
                f"Now, given this context for model taxonomy: {model_context} Answer this question: {model_question} Give back the answer only and only in a correct Python list format, for example: ['A']. If you don't know the answer, just return an empty list."
            )
            }           
        ]

        # 2: Apply the chat template
        formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True,enable_thinking=True)
        #print("Formatted chat:\n", formatted_chat)

        # 3: Tokenize the chat (This can be combined with the previous step using tokenize=True)
        model_inputs = tokenizer([formatted_chat], return_tensors="pt").to(model.device)

        model.eval()
        with torch.no_grad():
        # 4: Generate text from the model
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=2048,
                temperature=0.7,
                top_p=0.8,
                top_k=20,
            )
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

        # parsing thinking content
        try:
            # rindex finding 151668 (</think>)
            index = len(output_ids) - output_ids[::-1].index(151668)
        except ValueError:
            index = 0

        thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
        content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
        print("label:", label)
        print("content:", content)
        print("thinking_content:", thinking_content)

        
        labels.append(label)
        answers.append(content)

        


        print(f"Generation took {time.time() - start:.2f} seconds")
        del model_inputs, generated_ids
        torch.cuda.empty_cache()
        

        # Route responses into the right list
        responses_model.append(content)


In [None]:
# Save the responses to a JSON file
import json

output_file = "model_taxonomy_responses.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump({
        "labels": labels,
        "answers": answers,
    }, f, ensure_ascii=False, indent=4)
# Calculate and print the average metrics   

In [None]:
# Save the responses to a JSON file
import json

with open("model_taxonomy_responses.json", "r", encoding="utf-8") as f:
    responses = json.load(f)
    
labels = responses["labels"]
answers = responses["answers"]
for i in range(len(labels)):
    if labels[i] == "Other models":
        labels[i] = "Other KGC Technologies"



print("Number of responses:", len(answers))

recalls = []
precisions = []
f1s = []

for i in range(len(answers)):
    label = labels[i]
    answer = answers[i]
    if isinstance(answer, str):
        answer = ast.literal_eval(answer)
        print (f"Answer for paper {i}: {answer}")
        print(f"Label for paper {i}: {label}")
        print(len(answer))
    if len(answer) != 1:
        recall = 0
        precision = 0
        f1 = 0
    else:
        # For each reference entity, find the max similarity to predicted entities
        max_similarities = compute_max_similarity([label], answer, sim_model)

        # Apply threshold
        threshold = 0.8
        num_matched = (max_similarities >= threshold).sum().item()
        
        tp = num_matched
        fn = 1 - tp         # false negatives
        fp = 1 - tp
           
        # helper to avoid zero-division warnings à la scikit-learn
        def safe_div(num, denom):
            return num / denom if denom else 0.0
        precision = safe_div(tp, tp + fp)
        recall    = safe_div(tp, tp + fn)
        f1        = safe_div(2 * precision * recall, precision + recall)
        
    recalls.append(recall)
    precisions.append(precision)
    f1s.append(f1)
    
    print(f"Recall for paper {i}: {recall:.4f}")
    print(f"Precision for paper {i}: {precision:.4f}")
    print(f"F1 for paper {i}: {f1:.4f}")
# Calculate and print the average metrics   
average_autor_grobid_recall = sum(recalls) / len(recalls)
average_autor_grobid_precision = sum(precisions) / len(precisions)
average_autor_grobid_f1 = sum(f1s) / len(f1s)

print(f"Mean Author Recall: {average_autor_grobid_recall:.4f}")
print(f"Mean Author Precision: {average_autor_grobid_precision:.4f}")
print(f"Mean Author F1: {average_autor_grobid_f1:.4f}")

Number of responses: 99
Answer for paper 0: []
Label for paper 0: Semantic matching models
0
Recall for paper 0: 0.0000
Precision for paper 0: 0.0000
F1 for paper 0: 0.0000
Answer for paper 1: ['Semantic matching model']
Label for paper 1: Semantic matching models
1
Recall for paper 1: 1.0000
Precision for paper 1: 1.0000
F1 for paper 1: 1.0000
Answer for paper 2: ['Semantic matching model']
Label for paper 2: Semantic matching models
1
Recall for paper 2: 1.0000
Precision for paper 2: 1.0000
F1 for paper 2: 1.0000
Answer for paper 3: ['Semantic matching model']
Label for paper 3: Semantic matching models
1
Recall for paper 3: 1.0000
Precision for paper 3: 1.0000
F1 for paper 3: 1.0000
Answer for paper 4: ['Other KGC Technologies']
Label for paper 4: Semantic matching models
1
Recall for paper 4: 0.0000
Precision for paper 4: 0.0000
F1 for paper 4: 0.0000
Answer for paper 5: ['Semantic matching model']
Label for paper 5: Semantic matching models
1
Recall for paper 5: 1.0000
Precision f

#### Only abstract

In [6]:
# Save the responses to a JSON file
import json

# open the CSV file
import pandas as pd
df = pd.read_csv("index.csv")
df.head(5)
labels = []
# Create a mapping from category names to numerical labels
category_mapping = {
    "Semantic matching models": 0,
    "Translation models": 1,
    "Internal side information inside KGs": 2,
    "External extra information outside KGs": 3,
    "Other models": 4
}
# Initialize labels based on the category mapping
labels = [category_mapping.get(category, 4) for category in df['category']]

In [19]:
duplicates = df[df['url'].duplicated(keep=False)]
duplicates

Unnamed: 0,category,url,filename
17,Semantic matching models,https://arxiv.org/pdf/1603.07704,model_type_pdfs/1603.07704.pdf
38,Internal side information inside KGs,https://aclanthology.org/D15-1031.pdf,model_type_pdfs/D15-1031.pdf
89,External extra information outside KGs,https://aclanthology.org/D15-1031.pdf,model_type_pdfs/D15-1031.pdf
106,Other models,https://arxiv.org/pdf/1603.07704,model_type_pdfs/1603.07704.pdf


In [None]:
mask = ~df['url'].duplicated(keep=False)
df_unique_only = df[mask]
df_unique_only


Unnamed: 0,category,url,filename
0,Semantic matching models,https://arxiv.org/pdf/1901.09590,model_type_pdfs/1901.09590.pdf
1,Semantic matching models,https://www.cip.ifi.lmu.de/~nickel/data/slides...,model_type_pdfs/slides-icml2011.pdf
2,Semantic matching models,https://proceedings.neurips.cc/paper/2012/file...,model_type_pdfs/0a1bf96b7165e962e90cb14648c946...
3,Semantic matching models,https://arxiv.org/pdf/1506.00999,model_type_pdfs/1506.00999.pdf
4,Semantic matching models,https://proceedings.mlr.press/v70/liu17d/liu17...,model_type_pdfs/liu17d.pdf
...,...,...,...
108,Other models,https://aclanthology.org/K18-1014.pdf,model_type_pdfs/K18-1014.pdf
109,Other models,https://arxiv.org/pdf/1906.05317,model_type_pdfs/1906.05317.pdf
110,Other models,https://arxiv.org/pdf/2001.04170,model_type_pdfs/2001.04170.pdf
111,Other models,https://arxiv.org/pdf/1604.08642,model_type_pdfs/1604.08642.pdf


In [24]:
# Initialize labels based on the category mapping
labels_unique = [category_mapping.get(category, 4) for category in df_unique_only['category']]

In [None]:
mask = ~df['url'].duplicated(keep=False)
df_unique_only = df[mask]
df_unique_only
# Initialize labels based on the category mapping
labels_unique = [category_mapping.get(category, 4) for category in df_unique_only['category']]
import time
# Get the current working directory
current_dir = Path(os.getcwd())
responses_model = []

documents = []
grobid = GrobidService(config_path="./Grobid/config.json")
for i in range(len(df_unique_only)): #len(df)
    paper_filename = df_unique_only.iloc[i]['filename']
    # label = df.iloc[i]['category']

    print("Processing paper:", paper_filename)
    start = time.time()
    pdf_path = str(current_dir/paper_filename)

    tei = grobid.process_full_text(pdf_path)
    print("Grobid processing took:", time.time() - start, "seconds")

    try:
        raw_text = extract_abstract(tei)
        # if extract_abstract returns None or an empty string, treat as failure
        if not raw_text or not raw_text.strip():
            print("No abstract found, skipping.")
            labels_unique.pop(i)  # Remove the label for this paper
            continue
    except Exception as e:
        print(f"Error extracting abstract ({e!r}), skipping.")
        labels_unique.pop(i)  # Remove the label for this paper
        continue

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    # raw_text = tei_to_full_raw_text(tei, remove_ref=True)
    documents.append(raw_text)



GROBID server is up and running
Processing paper: model_type_pdfs/1901.09590.pdf
Grobid processing took: 1.5844838619232178 seconds
Processing paper: model_type_pdfs/slides-icml2011.pdf
Grobid processing took: 1.1370532512664795 seconds
No abstract found, skipping.
Processing paper: model_type_pdfs/0a1bf96b7165e962e90cb14648c9462d-Paper.pdf
Grobid processing took: 1.4528651237487793 seconds
Processing paper: model_type_pdfs/1506.00999.pdf
Grobid processing took: 1.9051806926727295 seconds
Processing paper: model_type_pdfs/liu17d.pdf
Grobid processing took: 1.5787098407745361 seconds
Processing paper: model_type_pdfs/1412.6575.pdf
Grobid processing took: 1.4570648670196533 seconds
Processing paper: model_type_pdfs/trouillon16.pdf
Grobid processing took: 1.5201945304870605 seconds
Processing paper: model_type_pdfs/1802.04868.pdf
Grobid processing took: 1.6138768196105957 seconds
Processing paper: model_type_pdfs/ds-paper-620.pdf
Grobid processing took: 1.6179580688476562 seconds
Processi

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(documents, labels_unique, stratify=labels_unique, test_size=0.2, random_state=42)
model = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2), max_features=50000),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)
print(cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean())
model.fit(X_train, y_train)
print("Test F1:", f1_score(y_test, model.predict(X_test), average='macro'))

from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = model.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")


0.6234341736694679
Test F1: 0.6409803921568626
Average Recall: 0.6129
Average Precision: 0.7200
Average F1: 0.6410


In [45]:
print(len(documents), len(X_train), len(X_test))

107 85 22


In [30]:
from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)

param_grid = {
    'tfidfvectorizer__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidfvectorizer__min_df': [1, 3, 5],
    'tfidfvectorizer__max_df': [0.7, 0.8, 0.9, 1.0],
    'tfidfvectorizer__max_features': [10000, 30000, 50000],
    'logisticregression__C': [0.01, 0.1, 1, 10]
}

search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("Best CV F1:", search.best_score_)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best params: {'logisticregression__C': 1, 'tfidfvectorizer__max_df': 0.7, 'tfidfvectorizer__max_features': 10000, 'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 2)}
Best CV F1: 0.6591758241758241


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(documents, labels_unique, stratify=labels_unique, test_size=0.2, random_state=42)
model = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2), max_features=10000, stop_words='english', min_df=1, max_df=0.7),
    LogisticRegression(class_weight='balanced', max_iter=1000,C = 1)
)
print(cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean())
model.fit(X_train, y_train)
print("Test F1:", f1_score(y_test, model.predict(X_test), average='macro'))

from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = model.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")


0.6194341736694678
Test F1: 0.6988888888888889
Average Recall: 0.6700
Average Precision: 0.8473
Average F1: 0.6989


In [47]:
import numpy as np

# 1. Fit if you haven’t already
model.fit(X_train, y_train)

# 2. Grab the components
tfidf = model.named_steps['tfidfvectorizer']
clf = model.named_steps['logisticregression']
feature_names = tfidf.get_feature_names_out()
coefs = clf.coef_                     # shape (n_classes, n_features)

# If binary classification, coefs.shape == (1, n_features)
# If multiclass, coefs.shape == (n_classes, n_features)

n = 15  # how many top terms to show

if coefs.shape[0] == 1:
    # Binary case: look at single row
    sorted_idx = np.argsort(coefs[0])
    top_neg   = sorted_idx[:n]      # most negative (class 0 indicators)
    top_pos   = sorted_idx[-n:]     # most positive (class 1 indicators]

    print("Top negative features (strongest for class 0):")
    for i in top_neg:
        print(f"  {feature_names[i]:20s} ({coefs[0][i]:+.3f})")

    print("\nTop positive features (strongest for class 1):")
    for i in reversed(top_pos):
        print(f"  {feature_names[i]:20s} ({coefs[0][i]:+.3f})")

else:
    # Multiclass: loop over each class
    for class_idx, label in enumerate(clf.classes_):
        print(f"\nTop features for class {label!r}:")
        sorted_idx = np.argsort(coefs[class_idx])
        top = sorted_idx[-n:]
        for i in reversed(top):
            print(f"  {feature_names[i]:20s} ({coefs[class_idx][i]:+.3f})")



Top features for class 0:
  multi relational     (+0.283)
  valued               (+0.260)
  standard             (+0.260)
  tucker               (+0.252)
  baseline             (+0.234)
  model                (+0.220)
  large                (+0.219)
  complex              (+0.219)
  relational           (+0.203)
  cp                   (+0.200)
  negative             (+0.186)
  size                 (+0.185)
  datasets             (+0.174)
  gcns                 (+0.174)
  fb15k                (+0.171)

Top features for class 1:
  translation          (+0.748)
  translation based    (+0.480)
  projection           (+0.420)
  head                 (+0.375)
  tail                 (+0.373)
  relations            (+0.349)
  attention            (+0.340)
  entity               (+0.335)
  transe               (+0.329)
  itransf              (+0.323)
  rotate               (+0.314)
  relation             (+0.309)
  tail entity          (+0.307)
  head entity          (+0.305)
  scholarly       

In [39]:
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# 1. Load a pre-trained sentence embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Helper to encode one document
def doc_to_vec(doc):
    sentences = sent_tokenize(doc)
    sent_embs = embedder.encode(sentences)   # shape: (n_sentences, dim)
    return sent_embs.mean(axis=0)           # mean pooling → (dim,)

# 3. Prepare embeddings for all docs
X = [doc_to_vec(d) for d in documents]
y = labels_unique

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# 5. Simple classifier
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)
print("Test macro-F1:", f1_score(y_test, clf.predict(X_test), average='macro'))
# calculate the average metrics
from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = clf.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")


Train size: 85, Test size: 22
Test macro-F1: 0.7766666666666666
Average Recall: 0.7976
Average Precision: 0.8000
Average F1: 0.7767


In [44]:
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# 1. Load a pre-trained sentence embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Helper to encode one document
def doc_to_vec(doc):
    sentences = sent_tokenize(doc)
    sent_embs = embedder.encode(sentences)   # shape: (n_sentences, dim)
    return sent_embs.mean(axis=0)           # mean pooling → (dim,)

# 3. Prepare embeddings for all docs
X = [doc_to_vec(d) for d in documents]
y = labels_unique

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# 5. Simple classifier
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
clf.fit(X_train, y_train)
print("Test macro-F1:", f1_score(y_test, clf.predict(X_test), average='macro'))
# calculate the average metrics
from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = clf.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")


Train size: 85, Test size: 22
Test macro-F1: 0.5914285714285714
Average Recall: 0.5395
Average Precision: 0.7436
Average F1: 0.5914


#### Full text

In [48]:
# Save the responses to a JSON file
import json

# open the CSV file
import pandas as pd
df = pd.read_csv("index.csv")
df.head(5)
labels = []
# Create a mapping from category names to numerical labels
category_mapping = {
    "Semantic matching models": 0,
    "Translation models": 1,
    "Internal side information inside KGs": 2,
    "External extra information outside KGs": 3,
    "Other models": 4
}
# Initialize labels based on the category mapping
labels = [category_mapping.get(category, 4) for category in df['category']]

In [None]:
mask = ~df['url'].duplicated(keep=False)
df_unique_only = df[mask]
df_unique_only
# Initialize labels based on the category mapping
labels_unique = [category_mapping.get(category, 4) for category in df_unique_only['category']]

In [50]:

import time
# Get the current working directory
current_dir = Path(os.getcwd())
responses_model = []

documents = []
grobid = GrobidService(config_path="./Grobid/config.json")
for i in range(len(df_unique_only)): #len(df)
    paper_filename = df_unique_only.iloc[i]['filename']
    # label = df.iloc[i]['category']

    print("Processing paper:", paper_filename)
    start = time.time()
    pdf_path = str(current_dir/paper_filename)

    tei = grobid.process_full_text(pdf_path)
    print("Grobid processing took:", time.time() - start, "seconds")

    try:
        raw_text = extract_abstract(tei)
        # if extract_abstract returns None or an empty string, treat as failure
        if not raw_text or not raw_text.strip():
            print("No abstract found, skipping.")
            labels_unique.pop(i)  # Remove the label for this paper
            continue
    except Exception as e:
        print(f"Error extracting abstract ({e!r}), skipping.")
        labels_unique.pop(i)  # Remove the label for this paper
        continue

    # sections
    # sections = extract_flat_sections_with_subtext(tei) # extract sections with their text in a dictionaty
    # ranked_sections = rank_sections_by_semantic_similarity([sec['title'] for sec in sections], ["Experiments","Evaluation"],model = sim_model) # get the most similar sections to the queries
    # best_match_section, best_score = ranked_sections[0]
    # raw_text = sections[[sec['title'] for sec in sections].index(best_match_section)]['text']

    # full text
    # raw_text = tei_to_full_raw_text(tei, remove_ref=True)
    documents.append(raw_text)



GROBID server is up and running
Processing paper: model_type_pdfs/1901.09590.pdf
Grobid processing took: 1.537729263305664 seconds
Processing paper: model_type_pdfs/slides-icml2011.pdf
Grobid processing took: 1.1344513893127441 seconds
No abstract found, skipping.
Processing paper: model_type_pdfs/0a1bf96b7165e962e90cb14648c9462d-Paper.pdf
Grobid processing took: 1.437873363494873 seconds
Processing paper: model_type_pdfs/1506.00999.pdf
Grobid processing took: 1.8809728622436523 seconds
Processing paper: model_type_pdfs/liu17d.pdf
Grobid processing took: 1.6332082748413086 seconds
Processing paper: model_type_pdfs/1412.6575.pdf
Grobid processing took: 1.4580271244049072 seconds
Processing paper: model_type_pdfs/trouillon16.pdf
Grobid processing took: 1.6761054992675781 seconds
Processing paper: model_type_pdfs/1802.04868.pdf
Grobid processing took: 1.5943093299865723 seconds
Processing paper: model_type_pdfs/ds-paper-620.pdf
Grobid processing took: 1.6394598484039307 seconds
Processing

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(documents, labels_unique, stratify=labels_unique, test_size=0.20, random_state=42 )
print("Number of training samples:", len(X_train))
print("Number of test samples:", len(X_test))

model = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2), max_features=50000),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)
print(cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean())
model.fit(X_train, y_train)
print("Test F1:", f1_score(y_test, model.predict(X_test), average='macro'))

from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = model.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")


Number of training samples: 85
Number of test samples: 22
0.6234341736694679
Test F1: 0.6409803921568626
Average Recall: 0.6129
Average Precision: 0.7200
Average F1: 0.6410


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(documents, labels_unique, stratify=labels_unique, test_size=0.2, random_state=42)
print("Number of training samples:", len(X_train))
print("Number of test samples:", len(X_test))

model = make_pipeline(
    TfidfVectorizer(ngram_range=(1,1), max_features=50000, stop_words='english',max_df=0.7, min_df=3),
    LogisticRegression(class_weight='balanced', max_iter=1000, C=1)
)
print(cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean())
model.fit(X_train, y_train)
print("Test F1:", f1_score(y_test, model.predict(X_test), average='macro'))

from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = model.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")


Number of training samples: 85
Number of test samples: 22
0.6177858220211161
Test F1: 0.65
Average Recall: 0.6314
Average Precision: 0.7467
Average F1: 0.6500


In [54]:
from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)

param_grid = {
    'tfidfvectorizer__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidfvectorizer__min_df': [1, 3, 5],
    'tfidfvectorizer__max_df': [0.7, 0.8, 0.9, 1.0],
    'tfidfvectorizer__max_features': [10000, 30000, 50000],
    'logisticregression__C': [0.01, 0.1, 1, 10]
}

search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("Best CV F1:", search.best_score_)


Best params: {'logisticregression__C': 0.01, 'tfidfvectorizer__max_df': 1.0, 'tfidfvectorizer__max_features': 10000, 'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 2)}
Best CV F1: 0.6588658008658008


In [19]:
import numpy as np
import pandas as pd

# Turn cv_results_ into a DataFrame
results = pd.DataFrame(search.cv_results_)

# Sort by mean_test_score descending
results = results.sort_values(by="mean_test_score", ascending=False)

results.head(5)




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_tfidfvectorizer__max_df,param_tfidfvectorizer__max_features,param_tfidfvectorizer__min_df,param_tfidfvectorizer__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
246,0.241116,0.021403,0.044233,0.004218,1,0.8,10000,3,"(1, 1)","{'logisticregression__C': 1, 'tfidfvectorizer_...",0.583333,0.955556,0.692222,0.819048,0.931429,0.796317,0.141652,1
255,0.349559,0.081402,0.053547,0.004603,1,0.8,30000,3,"(1, 1)","{'logisticregression__C': 1, 'tfidfvectorizer_...",0.583333,0.955556,0.692222,0.819048,0.931429,0.796317,0.141652,1
237,0.281741,0.06006,0.068667,0.021182,1,0.7,50000,3,"(1, 1)","{'logisticregression__C': 1, 'tfidfvectorizer_...",0.583333,0.955556,0.692222,0.819048,0.931429,0.796317,0.141652,1
264,0.26629,0.047587,0.071011,0.024098,1,0.8,50000,3,"(1, 1)","{'logisticregression__C': 1, 'tfidfvectorizer_...",0.583333,0.955556,0.692222,0.819048,0.931429,0.796317,0.141652,1
219,0.23592,0.02318,0.045396,0.003814,1,0.7,10000,3,"(1, 1)","{'logisticregression__C': 1, 'tfidfvectorizer_...",0.583333,0.955556,0.692222,0.819048,0.931429,0.796317,0.141652,1


In [55]:
import numpy as np

# 1. Fit if you haven’t already
model.fit(X_train, y_train)

# 2. Grab the components
tfidf: TfidfVectorizer        = model.named_steps['tfidfvectorizer']
clf:  LogisticRegression      = model.named_steps['logisticregression']
feature_names = tfidf.get_feature_names_out()
coefs = clf.coef_                     # shape (n_classes, n_features)

# If binary classification, coefs.shape == (1, n_features)
# If multiclass, coefs.shape == (n_classes, n_features)

n = 15  # how many top terms to show

if coefs.shape[0] == 1:
    # Binary case: look at single row
    sorted_idx = np.argsort(coefs[0])
    top_neg   = sorted_idx[:n]      # most negative (class 0 indicators)
    top_pos   = sorted_idx[-n:]     # most positive (class 1 indicators]

    print("Top negative features (strongest for class 0):")
    for i in top_neg:
        print(f"  {feature_names[i]:20s} ({coefs[0][i]:+.3f})")

    print("\nTop positive features (strongest for class 1):")
    for i in reversed(top_pos):
        print(f"  {feature_names[i]:20s} ({coefs[0][i]:+.3f})")

else:
    # Multiclass: loop over each class
    for class_idx, label in enumerate(clf.classes_):
        print(f"\nTop features for class {label!r}:")
        sorted_idx = np.argsort(coefs[class_idx])
        top = sorted_idx[-n:]
        for i in reversed(top):
            print(f"  {feature_names[i]:20s} ({coefs[class_idx][i]:+.3f})")



Top features for class 0:
  standard             (+0.521)
  valued               (+0.518)
  baseline             (+0.508)
  model                (+0.441)
  relational           (+0.437)
  large                (+0.412)
  complex              (+0.407)
  fb15k                (+0.368)
  datasets             (+0.352)
  negative             (+0.350)
  size                 (+0.336)
  tensor               (+0.335)
  bilinear             (+0.331)
  multiple             (+0.319)
  expressive           (+0.309)

Top features for class 1:
  translation          (+1.195)
  projection           (+0.702)
  transe               (+0.661)
  attention            (+0.658)
  head                 (+0.628)
  tail                 (+0.625)
  relations            (+0.554)
  entity               (+0.541)
  relation             (+0.509)
  flexible             (+0.395)
  patterns             (+0.385)
  various              (+0.379)
  mechanism            (+0.347)
  related              (+0.347)
  specific        

In [56]:
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# 1. Load a pre-trained sentence embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Helper to encode one document
def doc_to_vec(doc):
    sentences = sent_tokenize(doc)
    sent_embs = embedder.encode(sentences)   # shape: (n_sentences, dim)
    return sent_embs.mean(axis=0)           # mean pooling → (dim,)

# 3. Prepare embeddings for all docs
X = [doc_to_vec(d) for d in documents]
y = labels_unique

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state=42)

# 5. Simple classifier
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)
# calculate the average metrics
from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = clf.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")


Average Recall: 0.7976
Average Precision: 0.8000
Average F1: 0.7767
