In [5]:
from citation_index.core.extractors import ExtractorFactory
import pandas as pd
import json
from Levenshtein import ratio as levenshtein_ratio

In [7]:
def get_sample_data(pdf_df, references_data, n_samples=5):
    """Print a small sample of the PDF table and the references data for a visual check.

    Args:
        pdf_df: DataFrame of PDF metadata. Up to ``n_samples`` randomly chosen
            rows are printed; nothing is printed for an empty frame.
        references_data: Mapping of file id to a dict that has a
            ``'references'`` list. The first ``n_samples`` entries (in mapping
            order) are summarised, showing at most three references each.
        n_samples: Maximum number of rows / papers to display.

    Returns:
        None. All output goes to stdout.
    """
    print("\n=== SAMPLE PDF DATA ===")
    if not pdf_df.empty:
        # DataFrame.sample raises ValueError when asked for more rows than the
        # frame holds, so cap the request at the number of available rows.
        n_rows = min(n_samples, len(pdf_df))
        print(pdf_df.sample(n_rows).to_string(index=False))

    print("\n=== SAMPLE REFERENCES DATA ===")
    if references_data:
        sample_keys = list(references_data.keys())[:n_samples]
        for paper_no, file_id in enumerate(sample_keys, start=1):
            refs = references_data[file_id]['references']
            print(f"Paper {paper_no} (ID: {file_id}):")
            print(f"  Number of references: {len(refs)}")
            print("  First few references:")
            for ref_no, ref in enumerate(refs[:3], start=1):
                print(f"    {ref_no}. {ref}")
            if len(refs) > 3:
                print(f"    ... and {len(refs) - 3} more")
            print()


# Gold-standard inputs: the PDF info table and the per-file references JSON.
pdf_df = pd.read_csv("../../EXgoldstandard/Goldstandard_EXparser/pdf_files_info.csv")

# Open the JSON through a context manager so the file handle is closed as soon
# as loading finishes, instead of being left open for the kernel's lifetime.
with open("../../EXgoldstandard/Goldstandard_EXparser/all_references.json", "r", encoding="utf-8") as references_file:
    references_data = json.load(references_file)

# Each value in references_data carries a "references" list; sum their lengths.
total_references = sum(len(data["references"]) for data in references_data.values())
print('Total references:', total_references)

Total references: 10171


In [4]:
from citation_index.llm.prompt_loader import ReferenceExtractionAndParsingPrompt
from citation_index.llm.client import LLMClient

In [1]:
import os  # imported here so this cell can read credentials from the environment

# SECURITY: API keys must never be stored in the notebook. The keys that were
# previously hardcoded in this cell should be treated as leaked and revoked.
# Export DEEPSEEK_API_KEY (or ANTHROPIC_API_KEY for the alternative below)
# before starting the kernel; a missing variable raises KeyError here.
endpoint = 'https://api.deepseek.com/v1'
model = 'deepseek-chat'
api_key = os.environ['DEEPSEEK_API_KEY']  # deepseek

# Alternative provider (uncomment all three lines to switch):
# endpoint = 'https://api.anthropic.com/v1/'
# model = 'claude-sonnet-4-20250514'
# api_key = os.environ['ANTHROPIC_API_KEY']

client = LLMClient(endpoint, model, api_key)

# Single-document smoke test: extract the text of one gold-standard PDF.
filepath = '../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/1181.pdf'

extractor = ExtractorFactory.create("pymupdf")
result = extractor.extract(filepath)

# NOTE(review): the prompt path is absolute and machine-specific; it will not
# resolve on another machine. Consider deriving it from a configurable base dir.
prompts = ReferenceExtractionAndParsingPrompt(input_text=result, prompt = '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing.md')

NameError: name 'LLMClient' is not defined

In [2]:
def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line (which may carry a language tag such as ```json).
    lines = text.split("\n")[1:]
    # Remove the closing fence only when it is really there, whether it sits on
    # its own line or is glued to the end of the last content line.
    if lines and lines[-1].rstrip().endswith("```"):
        lines[-1] = lines[-1].rstrip()[:-3]
    return "\n".join(lines)


def parse(response: str) -> dict:
    """Parse an LLM response into a validated ``{"references": [...]}`` dict.

    The response is expected to be JSON (optionally wrapped in a Markdown code
    fence) of the form ``{"references": [{"reference": {...}}, ...]}``.
    A reference is kept only if its ``"reference"`` value is a dict containing
    both ``"authors"`` and ``"title"``, ``"authors"`` is a list, and at least
    one author is a dict with both ``"first_name"`` and ``"surname"``. Authors
    failing that check are removed from the kept reference.

    Args:
        response: Raw text returned by the model.

    Returns:
        ``{"references": [...]}`` holding the valid references, each wrapped as
        ``{"reference": ref_data}``. On any parse or format failure a message is
        printed and ``{"references": []}`` is returned; no exception propagates.
    """
    if not isinstance(response, str):
        # Previously this raised AttributeError before reaching the try block.
        print("Invalid response format: response is not a string")
        return {"references": []}

    try:
        # Parse the JSON response after removing any Markdown fence around it
        data = json.loads(_strip_code_fence(response))

        # Validate the structure matches our expected schema
        if not isinstance(data, dict) or "references" not in data:
            print("Invalid response format: missing 'references' key")
            return {"references": []}

        references = data["references"]
        if not isinstance(references, list):
            print("Invalid response format: 'references' is not a list")
            return {"references": []}

        # Validate each reference has the expected structure
        valid_references = []
        for ref in references:
            if not isinstance(ref, dict) or "reference" not in ref:
                continue

            ref_data = ref["reference"]
            if not isinstance(ref_data, dict):
                continue

            # Check for required fields
            if not all(key in ref_data for key in ("authors", "title")):
                continue

            authors = ref_data["authors"]
            if not isinstance(authors, list):
                continue

            # Keep only author entries that carry both name parts
            valid_authors = [
                author
                for author in authors
                if isinstance(author, dict)
                and all(key in author for key in ("first_name", "surname"))
            ]

            # References left without any valid author are dropped entirely
            if valid_authors:
                ref_data["authors"] = valid_authors
                valid_references.append({"reference": ref_data})

        return {"references": valid_references}

    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return {"references": []}
    except Exception as e:
        print(f"Error parsing response: {e}")
        return {"references": []}

In [3]:
# response = client.call(prompts.prompt)
# parse(response)

In [6]:
# run on whole excite dataset
from tqdm import tqdm

# run on whole excite dataset
extractor = ExtractorFactory.create("pymupdf")

response_list = []    # raw model response per processed file
references_list = []  # result of parse() per processed file
# matrixs = []
pdf_df.index = pdf_df['file_id']  # index by file id so rows can be looked up with .loc
# NOTE(review): despite its name this is the path of a single prompt file, and
# it is absolute / machine-specific.
prompts_dir = '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing.md'


# The loop variable is `file_id` (not `id`) so the builtin id() is not shadowed
# for the rest of the kernel session.
for file_id in tqdm(pdf_df['file_id']):
    try:
        if pdf_df.loc[file_id, 'page_count'] > 100:
            print(f"Skipping {file_id} because page_count is too large")
            continue
        filepath = f'../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/{file_id}.pdf'
        result = extractor.extract(filepath)

        # The whole extracted text goes into one prompt; no chunking is done here.
        prompts = ReferenceExtractionAndParsingPrompt(input_text=result, prompt = prompts_dir)

        response = client.call(prompts.prompt)

        references = parse(response)

        response_list.append({'id': file_id, 'response': response})
        references_list.append({'id': file_id, 'references': references})

    except Exception as e:
        # Best-effort batch run: report the failing file and carry on.
        print(f"Error processing file {file_id}: {str(e)}")
        continue

# Save response_list with error handling
# (references_list is kept in memory only; it is not written to disk here.)
try:
    with open('response_ref_extparsing_deepseek_pymupdf.json', 'w') as f:
        json.dump(response_list, f)
except Exception as e:
    print(f"Error saving response list: {str(e)}")

NameError: name 'pdf_df' is not defined

In [2]:
# load ground truth from xml
from lxml import etree
from typing import Optional, List
from pathlib import Path
from citation_index.core.models import References, Reference

xml_dir = '../../EXgoldstandard/Goldstandard_EXparser/all_xml'

# Read one gold-standard XML file; the path is built from xml_dir so the
# directory is defined in a single place.
filepath = f'{xml_dir}/1181.xml'
references = References.from_excite_xml(filepath)

In [3]:
# Size of `references` — presumably the object loaded from 1181.xml in the
# preceding cell (confirm, since `references` is also assigned elsewhere).
len(references)

18

In [None]:
def calculate_metrics(references, references_gt):
    """Compare extracted references against ground-truth references.

    TODO: not implemented yet. The cell previously held only the ``def`` line,
    which is a SyntaxError; this stub keeps the notebook parseable and fails
    loudly if it is called before the metric logic is written.

    Args:
        references: Extracted references to evaluate (expected type still to
            be decided).
        references_gt: Ground-truth references to compare against.

    Raises:
        NotImplementedError: Always, until the computation is implemented.
    """
    raise NotImplementedError("calculate_metrics is not implemented yet")

In [None]:
# # run on whole excite dataset
# extractor = ExtractorFactory.create("pymupdf")
# from tqdm import tqdm
# import concurrent.futures
# import json
# import threading

# # Create a lock for PyMuPDF operations
# pymupdf_lock = threading.Lock()

# def process_file(id):
#     try:
#         if pdf_df.loc[id, 'page_count'] > 100:
#             print(f"Skipping {id} because page_count is too large")
#             return None, None
            
#         filepath = f'../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/{id}.pdf'
        
#         # Use lock when extracting with PyMuPDF
#         with pymupdf_lock:
#             result = extractor.extract(filepath)
        
#         # Split text into chunks if needed
#         prompts = ReferenceExtractionAndParsingPrompt(input_text=result, prompt=prompts_dir)
        
#         response = client.call(prompts.prompt)
#         references = parse(response)
        
#         return {'id': id, 'response': response}, {'id': id, 'references': references}
        
#     except Exception as e:
#         print(f"Error processing file {id}: {str(e)}")
#         return None, None

# response_list = []
# references_list = []
# pdf_df.index = pdf_df['file_id']
# prompts_dir = '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing.md'

# # Process files concurrently with max 10 workers
# with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#     # Submit all tasks
#     future_to_id = {executor.submit(process_file, id): id for id in pdf_df['file_id']}
    
#     # Process results as they complete
#     for future in tqdm(concurrent.futures.as_completed(future_to_id), total=len(future_to_id)):
#         response_data, references_data = future.result()
#         if response_data is not None:
#             response_list.append(response_data)
#         if references_data is not None:
#             references_list.append(references_data)

# # Save response_list with error handling
# try:
#     with open('response_ref_extparsing_deepseek_pymupdf.json', 'w') as f:
#         json.dump(response_list, f)
# except Exception as e:
#     print(f"Error saving response list: {str(e)}")

In [8]:
# from pydantic import BaseModel, Field, ValidationError, create_model
# from citation_index.core.models import References, Reference
# def parse(response: str) -> References:
#         # remove markdown
#         if response.startswith("```"):
#             response = "\n".join([line for line in response.split("\n")][1:-1])

#         try:
#             references = References.model_validate_json(response).references
#         except ValidationError as e:
#             print(f"ValidationError: {e}")
#             # _LOGGER.debug(f"ValidationError: {e}")
#             references = []

#         references = [ref for ref in references if ref != Reference()]

#         return References(references=references)