In [1]:
from utils.tei_extraction import extract_sections_fulltext, extract_abstract, tei_to_full_raw_text, extract_flat_sections_with_subtext, rank_sections_by_semantic_similarity
from utils.grobid_service import GrobidService


from rapidfuzz import fuzz, process
import ast
from itertools import chain
from pathlib import Path
from grobid_client.grobid_client import GrobidClient
from bs4 import BeautifulSoup
import Levenshtein
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def deduplicate_fuzzy(list, threshold=80):
    """Drop near-duplicate strings, keeping the first occurrence of each.

    The input is walked in order; an item is kept only when its
    ``fuzz.ratio`` score against every item kept so far is below
    ``threshold``.

    Args:
        list: Iterable of strings to deduplicate. The name shadows the
            built-in ``list`` inside this function; it is kept unchanged so
            existing keyword callers keep working.
        threshold: Similarity score at or above which an item counts as a
            duplicate of an already-kept item.

    Returns:
        A new list with the surviving items in their original order.
    """
    kept = []
    for candidate in list:
        too_similar = any(
            fuzz.ratio(candidate, survivor) >= threshold for survivor in kept
        )
        if not too_similar:
            kept.append(candidate)
    return kept

In [3]:
# Working directory of the notebook and its parent folder.
current_dir = Path.cwd()
parent_dir = current_dir.parent


with open("../data/papers_data_copy.json", "r", encoding="utf-8") as f:
    papers_list = json.load(f)

# Keep only the papers that have a local PDF path.
papers = []
for entry in papers_list:
    if entry.get("Local PDF Path") is not None:
        papers.append(entry)

# Ground-truth annotations, one entry per kept paper (same order as `papers`).
authors = [paper['Authors'] for paper in papers]
datasets = [paper['Datasets'] for paper in papers]
tasks = [paper['Tasks'] for paper in papers]
references = [authors, datasets, tasks]

In [4]:
# Display the per-paper task annotations read from each paper's 'Tasks' field.
tasks

[[['ARC'],
  ['Question Answering'],
  ['Knowledge Graphs'],
  ['AI2 Reasoning Challenge'],
  ['Knowledge Graph Embeddings']],
 [['Knowledge Graphs'],
  ['Knowledge Graph Embeddings'],
  ['Link Prediction'],
  ['Entity Embeddings']],
 [['Contrastive Learning'],
  ['Knowledge Graphs'],
  ['Knowledge Graph Embeddings'],
  ['Word Embeddings'],
  ['Learning Word Embeddings']],
 [['Graph Embedding'],
  ['Knowledge Base Completion'],
  ['Knowledge Graphs'],
  ['Knowledge Graph Embeddings'],
  ['Link Prediction'],
  ['Knowledge Graph Embedding']],
 [['Knowledge Graphs'], ['Knowledge Graph Embeddings'], ['Link Prediction']],
 [['Knowledge Graphs'],
  ['Retrieval'],
  ['Graph Embedding'],
  ['Image Retrieval'],
  ['Knowledge Graph Embeddings'],
  ['Knowledge Graph Embedding'],
  ['Zero-Shot Learning'],
  ['Representation Learning']],
 [['Graph Embedding'],
  ['Question Answering'],
  ['Knowledge Graph Completion'],
  ['Knowledge Graph Embeddings'],
  ['Link Prediction'],
  ['Knowledge Graph Emb

In [5]:
# PDF paths stored in the papers JSON are resolved relative to this directory.
current_dir = Path(".")
grobid = GrobidService(config_path="./Grobid/config.json")

# Authors extracted by GROBID, one list per paper (same order as `papers`).
authors_grobid = []

for paper in papers:
    pdf_path = str(current_dir / paper["Local PDF Path"])
    # Use a dedicated name here: assigning to `authors` would overwrite the
    # ground-truth author lists built when the papers JSON was loaded.
    extracted_authors = grobid.extract_authors_from_pdf(pdf_path)
    authors_grobid.append(extracted_authors)
    print(extracted_authors)

INFO - Loading configuration file from ./Grobid/config.json
INFO - Configuration file loaded successfully
2026-01-06 15:51:56,820 - INFO - Logging configured - Level: INFO, Console: True, File: disabled
2026-01-06 15:51:56,872 - INFO - GROBID server http://localhost:8070 is up and running


['Yuyu Zhang', 'Hanjun Dai', 'Toraman Kamil', 'Le Song']
['Agustinus Kristiadi', 'Mohammad Asif Khan', 'Denis Lukovnikov', 'Jens Lehmann', 'Asja Fischer']
['Avishek Joey Bose', 'Huan Ling', 'Yanshuai Cao', 'Borealis Ai']
['Liwei Cai', 'William Yang Wang']
['Tim Dettmers', 'Pasquale Minervini', 'Pontus Stenetorp', 'Sebastian Riedel']
['Daniel Oñoro-Rubio', 'Mathias Niepert', 'Alberto García-Durán', 'Roberto González-Sánchez', 'Roberto J López-Sastre']
['Tommaso Soru', 'Stefano Ruberto', 'Diego Moussallem', 'André Valdestilhas', 'Alexander Bigerl', 'Edgard Marx', 'Diego Esteves']
['Bhushan Kotnis', 'Vivi Nastase']
['Bhushan Kotnis', 'Vivi Nastase']
['Wenhan Xiong', 'Thien Hoang', 'William Yang Wang']
['Tathagata Sengupta', 'Cibi Pragadeesh', 'Partha Pratim Talukdar']
['Armand Joulin', 'Piotr Bojanowski', 'Maximilian Nickel', 'Tomas Mikolov']
['Théo Trouillon', 'Maximilian Nickel']
['Muhao Chen', 'Yingtao Tian', 'Mohan Yang', 'Carlo Zaniolo']
['He He', 'Anusha Balakrishnan', 'Mihail Eric'

## Dataset

In [6]:
import os
import json
# Load the dataset mentions stored in ./datasets.json.
# The file is decoded explicitly as UTF-8 (as done for the papers JSON) so the
# result does not depend on the platform's default encoding. No placeholder
# value is assigned beforehand: if the file cannot be read, the error surfaces
# instead of leaving an empty list behind for later cells.
with open('./datasets.json', "r", encoding="utf-8") as file:
    dataset_mentions = json.load(file)


## Model task

In [7]:
# Read the NER predictions for the abstracts (JSON Lines: one JSON document per line).
ner_predictions_path = Path(current_dir) / 'SciREX-master/test_outputs/pdfs/ner_predictions_abstract.jsonl'
with open(ner_predictions_path, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]

In [8]:
# Predicted "Task" mentions for each document, keyed by doc_id.
# NOTE(review): the recorded run of this cell printed no task under any doc_id;
# confirm that the predictions file stores its spans under the "ner" key with
# the label "Task".
tasks_per_doc = {}

for entry in data:
    words = entry["words"]
    # dict.fromkeys drops exact duplicates while keeping first-occurrence order.
    # A set would iterate in an order that can change between runs, and the
    # fuzzy deduplication below keeps whichever variant it encounters first.
    task_mentions = dict.fromkeys(
        " ".join(words[start:end])
        for start, end, label in entry.get("ner", [])
        if label == "Task"
    )
    tasks_per_doc[entry["doc_id"]] = deduplicate_fuzzy(list(task_mentions), threshold=80)

# `doc_tasks` (rather than `tasks`) keeps the ground-truth `tasks` list that was
# built from the papers JSON intact.
for doc_id, doc_tasks in tasks_per_doc.items():
    print(f"{doc_id}:")
    for task in doc_tasks:
        print(f"  - {task}")

doc_0000:
doc_0001:
doc_0002:
doc_0003:
doc_0004:
doc_0005:
doc_0006:
doc_0007:
doc_0008:
doc_0009:
doc_0010:
doc_0011:
doc_0012:
doc_0013:
doc_0014:
doc_0015:
doc_0016:
doc_0017:
doc_0018:
doc_0019:
doc_0020:
doc_0021:
doc_0022:
doc_0023:
doc_0024:
doc_0025:
doc_0026:
doc_0027:
doc_0028:
doc_0029:


## Model taxonomy

In [9]:
import json

import pandas as pd

# Load the index of the model-taxonomy survey.
# The path uses a forward slash: the previous "...\index.csv" literal relied on
# "\i" not being a recognised escape sequence and only resolved on Windows.
df = pd.read_csv("model_taxonomy_survey_2022/index.csv")

# Create a mapping from category names to numerical labels
category_mapping = {
    "Semantic matching models": 0,
    "Translation models": 1,
    "Internal side information inside KGs": 2,
    "External extra information outside KGs": 3,
    "Other models": 4
}
# Labels for every row; categories missing from the mapping fall back to 4.
labels = [category_mapping.get(category, 4) for category in df['category']]

# Keep only the rows whose URL occurs exactly once: keep=False marks every
# occurrence of a duplicated URL, so all copies are removed.
mask = ~df['url'].duplicated(keep=False)
df_unique_only = df[mask]

# Labels aligned with the rows of `df_unique_only`.
labels_unique = [category_mapping.get(category, 4) for category in df_unique_only['category']]

In [10]:
import time

# Get the current working directory
current_dir = Path(os.getcwd())
responses_model = []

# `documents` and `document_labels` are appended to together, so they stay
# aligned even when a paper is skipped. The previous `labels_unique.pop(i)`
# used the DataFrame position as an index into an already-shortened list, so
# every skip after the first one removed the wrong label.
documents = []
document_labels = []
grobid = GrobidService(config_path="./Grobid/config.json")
for filename, category in zip(df_unique_only['filename'], df_unique_only['category']):
    paper_filename = "model_taxonomy_survey_2022/" + filename

    print("Processing paper:", paper_filename)
    start = time.time()
    pdf_path = str(current_dir / paper_filename)

    tei = grobid.process_full_text(pdf_path)
    print("Grobid processing took:", time.time() - start, "seconds")

    try:
        raw_text = extract_abstract(tei)
        # if extract_abstract returns None or an empty string, treat as failure
        if not raw_text or not raw_text.strip():
            print("No abstract found, skipping.")
            continue
    except Exception as e:
        print(f"Error extracting abstract ({e!r}), skipping.")
        continue

    # Only the abstract is used as the document text here. The alternatives
    # (best-matching "Experiments"/"Evaluation" section, or the full text via
    # tei_to_full_raw_text) are available in get_model_taxonomy_from_paper.
    documents.append(raw_text)
    document_labels.append(category_mapping.get(category, 4))

# Labels are recomputed from `df_unique_only` rather than popped from the
# existing list, so re-running this cell yields the same result.
labels_unique = document_labels
assert len(documents) == len(labels_unique)



INFO - Loading configuration file from ./Grobid/config.json
INFO - Configuration file loaded successfully


Processing paper: model_taxonomy_survey_2022/model_type_pdfs/1901.09590.pdf
Grobid processing took: 5.154153347015381 seconds
Processing paper: model_taxonomy_survey_2022/model_type_pdfs/0a1bf96b7165e962e90cb14648c9462d-Paper.pdf
Grobid processing took: 2.6073012351989746 seconds
Processing paper: model_taxonomy_survey_2022/model_type_pdfs/1506.00999.pdf
Grobid processing took: 4.542264223098755 seconds
Processing paper: model_taxonomy_survey_2022/model_type_pdfs/liu17d.pdf
Grobid processing took: 2.59675669670105 seconds
Processing paper: model_taxonomy_survey_2022/model_type_pdfs/1412.6575.pdf
Grobid processing took: 2.2232553958892822 seconds
Processing paper: model_taxonomy_survey_2022/model_type_pdfs/trouillon16.pdf
Grobid processing took: 2.336663007736206 seconds
Processing paper: model_taxonomy_survey_2022/model_type_pdfs/1802.04868.pdf
Grobid processing took: 2.3163814544677734 seconds
Processing paper: model_taxonomy_survey_2022/model_type_pdfs/ds-paper-620.pdf
Grobid process

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline

# Stratified 80/20 split of the extracted documents and their labels.
X_train, X_test, y_train, y_test = train_test_split(
    documents,
    labels_unique,
    test_size=0.20,
    stratify=labels_unique,
    random_state=42,
)
print("Number of training samples:", len(X_train))
print("Number of test samples:", len(X_test))

# Baseline: uni-/bigram TF-IDF features fed to a class-balanced logistic regression.
model = make_pipeline(
    TfidfVectorizer(max_features=50000, ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

# Mean macro-F1 over 5-fold cross-validation on the training split.
cv_scores = cross_val_score(model, X_train, y_train, scoring='f1_macro', cv=5)
print(cv_scores.mean())

# Fit on the full training split, then predict once and reuse the predictions
# for every macro-averaged metric on the held-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

average_f1 = f1_score(y_test, y_pred, average='macro')
print("Test F1:", average_f1)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")

Number of training samples: 85
Number of test samples: 22
0.6319047619047619
Test F1: 0.6409803921568628
Average Recall: 0.6129
Average Precision: 0.7200
Average F1: 0.6410


In [18]:
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# 1. Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model.
#    `doc_to_vec` (defined next) uses it to encode the sentences of each document.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Helper to encode one document
def doc_to_vec(doc):
    """Embed one document as the mean of its sentence embeddings.

    Args:
        doc: Document text.

    Returns:
        The mean over axis 0 of the embeddings that ``embedder.encode``
        returns for the document's sentences, i.e. one vector per document.
    """
    # If the tokenizer yields no sentences (e.g. for an empty string), encode
    # the text itself so the mean below is still taken over one embedding
    # instead of over an empty result.
    sentences = sent_tokenize(doc) or [doc]
    # One progress bar per document floods the cell output; the embeddings
    # themselves are unaffected by turning it off.
    sent_embs = embedder.encode(sentences, show_progress_bar=False)   # shape: (n_sentences, dim)
    return sent_embs.mean(axis=0)           # mean pooling → (dim,)

from sklearn.metrics import f1_score, precision_score, recall_score

# 3. Embed every document (one mean-pooled vector each); labels stay aligned by position
X = list(map(doc_to_vec, documents))
y = labels_unique

# 4. Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# 5. Class-balanced logistic regression on the document embeddings
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)

# Macro-averaged metrics on the held-out split
y_pred = clf.predict(X_test)

average_recall = recall_score(y_test, y_pred, average='macro')
average_precision = precision_score(y_test, y_pred, average='macro')
average_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average F1: {average_f1:.4f}")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 33.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 83.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.03it/s

Average Recall: 0.7976
Average Precision: 0.8000
Average F1: 0.7767


## Full process

In [19]:
def get_authors_from_paper(paper_directory):
    """
    Return the authors that GROBID extracts for one paper.

    A new ``GrobidService`` is configured from ./Grobid/config.json on every
    call, and ``paper_directory`` is passed to its ``extract_authors_from_pdf``
    method. NOTE(review): despite its name, the argument appears to be the path
    of a PDF file rather than a directory; confirm against callers.
    """
    service = GrobidService(config_path="./Grobid/config.json")
    return service.extract_authors_from_pdf(paper_directory)

def get_datasets_from_paper(grobid_dataset_filepath):
    """
    Load the dataset mentions stored in a JSON file.

    Args:
        grobid_dataset_filepath: Path of the JSON file to read.

    Returns:
        The parsed JSON content, unchanged.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so that non-ASCII dataset names are read the
    # same way regardless of the platform's default encoding.
    with open(grobid_dataset_filepath, "r", encoding="utf-8") as file:
        return json.load(file)

def get_tasks_from_paper(task_filepath):
    """
    Collect the "Task" mentions of every document in a JSON Lines predictions file.

    Each non-blank line must be a JSON object with a "doc_id", a "words" list
    and, optionally, a "ner" list of ``(start, end, label)`` spans. For every
    span labelled "Task" the words ``words[start:end]`` are joined with spaces.

    Args:
        task_filepath: Path of the JSON Lines file.

    Returns:
        Dict mapping each doc_id to its task mentions, in order of first
        appearance, after exact and fuzzy (threshold 80) deduplication.
    """
    with open(task_filepath, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
        data = [json.loads(line) for line in f if line.strip()]

    tasks_per_doc = {}
    for entry in data:
        words = entry["words"]
        # dict.fromkeys drops exact duplicates while keeping first-occurrence
        # order. A set would iterate in an order that can change between runs,
        # and the fuzzy deduplication keeps whichever variant it sees first.
        mentions = dict.fromkeys(
            " ".join(words[start:end])
            for start, end, label in entry.get("ner", [])
            if label == "Task"
        )
        tasks_per_doc[entry["doc_id"]] = deduplicate_fuzzy(list(mentions), threshold=80)
    return tasks_per_doc

def get_model_taxonomy_from_paper(model, paper_directory, extraction_method='abstract'):
    """
    Predict the model-taxonomy label of a paper from text extracted by GROBID.

    Args:
        model: Fitted classifier exposing ``predict``. It is called with a
            one-element list containing the extracted text, and the first
            prediction is used.
        paper_directory: Path passed to ``GrobidService.process_full_text``.
            NOTE(review): despite its name, this appears to be the path of a
            PDF file; confirm against callers.
        extraction_method: Which text to classify: 'abstract', 'sections'
            (the section whose title best matches "Experiments"/"Evaluation")
            or 'full_text'.

    Returns:
        The integer taxonomy label (0-4), or None when no text could be
        extracted. Predictions that are neither a known category name nor a
        known integer label map to 4 ('Other models').

    Raises:
        ValueError: If ``extraction_method`` is not one of the supported values.
    """
    supported_methods = ('abstract', 'sections', 'full_text')
    # Fail fast, before any GROBID work: an unknown method used to surface only
    # after processing the paper, as an unbound `raw_text` variable.
    if extraction_method not in supported_methods:
        raise ValueError(
            f"Unknown extraction_method {extraction_method!r}; expected one of {supported_methods}."
        )

    grobid = GrobidService(config_path="./Grobid/config.json")
    tei = grobid.process_full_text(paper_directory)
    if extraction_method == 'abstract':
        try:
            raw_text = extract_abstract(tei)
            # if extract_abstract returns None or an empty string, treat as failure
            if not raw_text or not raw_text.strip():
                print("No abstract found, skipping.")
                return None
        except Exception as e:
            print(f"Error extracting abstract ({e!r}), skipping.")
            return None
    elif extraction_method == 'sections':
        try:
            sections = extract_flat_sections_with_subtext(tei)  # extract sections with their text in a dictionary
            section_titles = [sec['title'] for sec in sections]
            # NOTE(review): model=None is passed through unchanged; presumably the
            # helper then falls back to a default similarity model. Confirm.
            ranked_sections = rank_sections_by_semantic_similarity(section_titles, ["Experiments", "Evaluation"], model=None)  # get the most similar sections to the queries
            best_match_section, _ = ranked_sections[0]
            raw_text = sections[section_titles.index(best_match_section)]['text']
        except Exception as e:
            print(f"Error extracting sections ({e!r}), skipping.")
            return None
    else:  # 'full_text'
        try:
            raw_text = tei_to_full_raw_text(tei, remove_ref=True)
        except Exception as e:
            print(f"Error extracting full text ({e!r}), skipping.")
            return None

    # scikit-learn style estimators expect a collection of documents, not a
    # bare string, so wrap the single text and unwrap the single prediction.
    prediction = model.predict([raw_text])[0]
    category_mapping = {
    "Semantic matching models": 0,
    "Translation models": 1,
    "Internal side information inside KGs": 2,
    "External extra information outside KGs": 3,
    "Other models": 4
    }
    if isinstance(prediction, str):
        return category_mapping.get(prediction, 4)  # Default to 'Other models'
    # Classifiers trained on the integer labels predict integers; looking those
    # up by category name would always fall through to the default.
    try:
        label = int(prediction)
    except (TypeError, ValueError):
        return 4
    return label if label in category_mapping.values() else 4


