In [5]:
from citation_index.core.extractors import ExtractorFactory
import pandas as pd
import json
from tqdm import tqdm
from time import sleep
import random
from Levenshtein import ratio as levenshtein_ratio

from citation_index.llm.prompt_loader import ReferenceExtractionAndParsingPrompt
from citation_index.core.models.references import References
from citation_index.llm.client import LLMClient

In [6]:
filepath = '../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/12279.pdf'

extractor = ExtractorFactory.create("pymupdf")
result = extractor.extract(filepath)

In [7]:
prompt_path = '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing_pydantic.md'

prompt = ReferenceExtractionAndParsingPrompt(input_text=result.text, prompt=prompt_path)
print(prompt.prompt)

You are an expert in scholarly references and citations. Your task is to extract all full reference entries from scientific works and format them in a specific JSON structure. Here's the text you need to analyze:


### Where to Find References

References may appear in the following locations:
- At the end of the document, under headings such as "References," "Bibliography," or "Works Cited."
- In footnotes at the bottom of pages, or as endnotes at the end of the document or chapter.
- Occasionally, in other sections such as appendices or figure/table captions.

**Do not extract in-text citations (e.g., "(Smith et al., 2020)") unless they are accompanied by a full reference entry.**

### Your Task

Given the provided text, extract all full reference entries and format them according to the following JSON schema:

```json
{
  "$defs": {
    "Organization": {
      "description": "Contains information about an identifiable organization.\n\nThis includes businesses, tribes, or any other g

In [17]:
from langfuse.openai import openai

endpoint = 'http://localhost:8000/v1'
model  = 'google/gemma-3-27b-it'
api_key = 'rpa_BTOUM8PPM4I9XZDM1ASDRRH90K8GVYWTXRQ70IYN3qc0q0'
client = openai.OpenAI(base_url=endpoint, api_key=api_key)

from citation_index.core.models.references import References


# final_prompt = prompt.replace("{{INPUT_TEXT}}", result.text)

# response = client.beta.chat.completions.parse(
#     model=model,
#     messages=[{"role": "user", "content": prompt.prompt}],
#     temperature=0.6,
#     # max_tokens=5000,
#     response_format=References
# )

response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": prompt.prompt}],
    response_format={
        'type': 'json_schema',
        'json_schema': References.schema_without_excluded()
        },
        # max_tokens=100
    )


message = response.choices[0].message

# response.output_parsed

In [18]:
message

ChatCompletionMessage(content='<start>\n{\n    "references": [\n        {\n            "reference": {\n                "authors": [\n                    {\n                        "first_name": "F.-A.",\n                        "middle_name": "",\n                        "surname": "Allaert"\n                    },\n                    {\n                        "first_name": "A.",\n                        "middle_name": "",\n                        "surname": "Blanc"\n                    },\n                    {\n                        "first_name": "Y.",\n                        "middle_name": "",\n                        "surname": "Megard"\n                    },\n                    {\n                        "first_name": "I.",\n                        "middle_name": "",\n                        "surname": "Bertand"\n                    }\n                ],\n                "title": "Parents‚Äô attitudes towards varicella vaccination acceptance in France and Germany: effect of

In [13]:
from citation_index.utils.json_helper import safe_json_parse


def parse(response: str,start_tag: str = "<start>",end_tag: str = "<end>") -> dict:
    # remove markdown
    if response.startswith(start_tag):
        response = response[len(start_tag):]
    if response.endswith(end_tag):
        response = response[:-len(end_tag)]
    if response.startswith("```"):
        response = "\n".join([line for line in response.split("\n")][1:-1])
        # print(response)

    
        # Parse the JSON response
    data = json.loads(response)
    # data = safe_json_parse(response)
        
        # Validate the structure matches our expected schema
    if not isinstance(data, dict) or "references" not in data:
        print("Invalid response format: missing 'references' key")
        return {"references": []}
            
    references = data["references"]
    if not isinstance(references, list):
        print("Invalid response format: 'references' is not a list")
        return {"references": []}
            
    # Validate each reference has the expected structure
    valid_references = []
    for ref in references:
        if not isinstance(ref, dict) or "reference" not in ref:
            continue
                
        ref_data = ref["reference"]
        if not isinstance(ref_data, dict):
            continue
                
        # Check for required fields
        if not all(key in ref_data for key in ["authors", "title"]):
            continue
                
        # Validate authors
        authors = ref_data.get("authors", [])
        if not isinstance(authors, list):
            continue
                
        valid_authors = []
        for author in authors:
            if not isinstance(author, dict):
                continue
        if not all(key in author for key in ["first_name", "surname"]):
                continue
        valid_authors.append(author)
            
        if valid_authors:
            ref_data["authors"] = valid_authors
            valid_references.append({"reference": ref_data})
        
    return {"references": valid_references}
        
    

    
parse(message.content)

{'references': [{'reference': {'authors': [{'first_name': 'I.',
      'middle_name': '',
      'surname': 'Bertand'}],
    'title': 'Parents‚Äô attitudes towards varicella vaccination acceptance in France and Germany: effect of vaccine recommendation and reimbursement (a survey)',
    'journal_title': 'J Public Health',
    'volume': '17',
    'issue': '71‚Äì76',
    'pages': '',
    'publication_date': '2009'}}]}

In [2]:
def get_sample_data(pdf_df, references_data, n_samples=5):
    """Display sample data for verification"""
    print(f"\n=== SAMPLE PDF DATA ===")
    if not pdf_df.empty:
        print(pdf_df.sample(n_samples).to_string(index=False))
    
    print(f"\n=== SAMPLE REFERENCES DATA ===")
    if references_data:
        sample_keys = list(references_data.keys())[:n_samples]
        for i, file_id in enumerate(sample_keys):
            ref_data = references_data[file_id]
            print(f"Paper {i+1} (ID: {file_id}):")
            print(f"  Number of references: {len(ref_data['references'])}")
            print(f"  First few references:")
            for j, ref in enumerate(ref_data['references'][:3]):
                print(f"    {j+1}. {ref}")
            if len(ref_data['references']) > 3:
                print(f"    ... and {len(ref_data['references']) - 3} more")
            print()


pdf_df = pd.read_csv("../../EXgoldstandard/Goldstandard_EXparser/pdf_files_info.csv")
references_data = json.load(open("../../EXgoldstandard/Goldstandard_EXparser/all_references.json", "r", encoding="utf-8"))

total_references = sum(len(data["references"]) for data in references_data.values())
print('Total references:', total_references)

Total references: 10171


In [None]:


# endpoint = 'https://api.anthropic.com/v1/'
# model = 'claude-sonnet-4-20250514'
# api_key = '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'

# client = LLMClient(endpoint, model, api_key)

filepath = '../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/12279.pdf'

extractor = ExtractorFactory.create("pymupdf")
result = extractor.extract(filepath)

prompts = ReferenceExtractionAndParsingPrompt(input_text=result.text, prompt = '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing.md')

In [5]:
from citation_index.utils.json_helper import safe_json_parse


def parse(response: str,start_tag: str = "<start>",end_tag: str = "<end>") -> dict:
    # remove markdown
    if response.startswith(start_tag):
        response = response[len(start_tag):]
    if response.endswith(end_tag):
        response = response[:-len(end_tag)]
    if response.startswith("```"):
        response = "\n".join([line for line in response.split("\n")][1:-1])
        # print(response)

    
        # Parse the JSON response
    data = json.loads(response)
    # data = safe_json_parse(response)
        
        # Validate the structure matches our expected schema
    if not isinstance(data, dict) or "references" not in data:
        print("Invalid response format: missing 'references' key")
        return {"references": []}
            
    references = data["references"]
    if not isinstance(references, list):
        print("Invalid response format: 'references' is not a list")
        return {"references": []}
            
    # Validate each reference has the expected structure
    valid_references = []
    for ref in references:
        if not isinstance(ref, dict) or "reference" not in ref:
            continue
                
        ref_data = ref["reference"]
        if not isinstance(ref_data, dict):
            continue
                
        # Check for required fields
        if not all(key in ref_data for key in ["authors", "title"]):
            continue
                
        # Validate authors
        authors = ref_data.get("authors", [])
        if not isinstance(authors, list):
            continue
                
        valid_authors = []
        for author in authors:
            if not isinstance(author, dict):
                continue
        if not all(key in author for key in ["first_name", "surname"]):
                continue
        valid_authors.append(author)
            
        if valid_authors:
            ref_data["authors"] = valid_authors
            valid_references.append({"reference": ref_data})
        
    return {"references": valid_references}
        
    

    

In [6]:
def call_api_with_backoff(client, prompt, start_tag, end_tag, max_tokens, max_retries=3):
    """Call API with exponential backoff for rate limiting"""
    base_delay = 60  # Start with 60 seconds
    max_delay = 180  # Max delay of 5 minutes
    
    for attempt in range(max_retries):
        try:
            messages, response = client.call_with_continuation(
                prompt=prompt,
                start_tag=start_tag,
                end_tag=end_tag,
                max_tokens=max_tokens
            )
            return messages, response
            
        except Exception as e:
            error_msg = str(e).lower()
            if "rate limit" in error_msg or "429" in error_msg:
                if attempt < max_retries - 1:
                    # Exponential backoff with jitter
                    delay = min(base_delay * (2 ** attempt) + random.uniform(0, 10), max_delay)
                    print(f"Rate limit hit, waiting {delay:.1f} seconds before retry {attempt + 1}/{max_retries}")
                    sleep(delay)
                    continue
                else:
                    print(f"Max retries reached for rate limiting")
                    raise e
            else:
                # Non-rate-limit error, don't retry
                raise e
    
    raise Exception("Max retries exceeded")

In [None]:
# run on whole excite dataset
extractor = ExtractorFactory.create("pymupdf")


# endpoint = 'https://api.deepseek.com/v1'
# model = 'deepseek-chat'
# api_key = 'sk-282f6b9a54b64bd98bfcd85c0c8f5aab' # deepseek
# MAX_OUTPUT_TOKEN = 8192

# endpoint = 'https://api.anthropic.com/v1/'
# model = 'claude-sonnet-4-20250514'
# api_key = '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'
# MAX_OUTPUT_TOKEN = 20000

# gemma put max 50000 context window
endpoint = 'http://localhost:8000/v1'
model  = 'google/gemma-3-27b-it'
api_key = 'rpa_BTOUM8PPM4I9XZDM1ASDRRH90K8GVYWTXRQ70IYN3qc0q0'
MAX_OUTPUT_TOKEN = 5000
client = LLMClient(endpoint, model, api_key)


response_list = []
references_list = []
# matrixs = []
pdf_df.index = pdf_df['file_id']
prompts_path = '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing.md'

pbar = tqdm(pdf_df['file_id'], desc="üìÑ Processing PDFs", unit="file", 
           bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')



for id in pbar:
    try:
        if pdf_df.loc[id, 'page_count'] > 100:
            print(f"Skipping {id} because page_count is too large")
            continue
            
        filepath = f'../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/{id}.pdf'
        result = extractor.extract(filepath)
        
        # Split text into chunks if needed
        prompts = ReferenceExtractionAndParsingPrompt(input_text=result.text, prompt=prompts_path)
        
        # Use the API call with backoff
        try:
            messages, response = client.call_with_continuation(
                prompt=prompts.prompt,
                start_tag='<start>',
                end_tag=['<end>','</end>'],
                max_tokens=MAX_OUTPUT_TOKEN,
                temperature=0.6
            )
        except Exception as e:
            print(f"API call failed for file {id}: {str(e)}")
            response_list.append({'id': id, 'response': None})
            references_list.append({'id': id, 'references': {"references": []}})
            continue
        
        # Parse the response with retry logic for JSON formatting errors
        retry_count = 0
        max_retries = 2
        references = None
        
        while retry_count < max_retries:
            try:
                references = parse(response)
                break  # Success, exit retry loop
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for file {id} (attempt {retry_count + 1}): {e}")
                retry_count += 1
                if retry_count >= max_retries:
                    print(f"Failed to parse JSON for file {id} after {max_retries} attempts")
                    references = {"references": []}  
                    break 
                # Retry API call for JSON parsing errors
                try:
                    messages, response = client.call_with_continuation(
                        prompt=prompts.prompt,
                        start_tag='<start>',
                        end_tag=['<end>','</end>'],
                        max_tokens=MAX_OUTPUT_TOKEN
                    )
                except Exception as api_e:
                    print(f"API retry failed for JSON parsing: {str(api_e)}")
                    references = {"references": []}
                    break
            except Exception as e:
                print(f"Unexpected error parsing file {id}: {str(e)}")
                references = {"references": []}  # Default empty result
                break

        response_list.append({'id': id, 'response': response})
        references_list.append({'id': id, 'references': references})
        
        # Add a small delay between successful requests to be extra safe
        # sleep(random.uniform(5, 15))
        
    except Exception as e:
        print(f"Error processing file {id}: {str(e)}, line {e.__traceback__.tb_lineno}")
        response_list.append({'id': id, 'response': None})
        references_list.append({'id': id, 'references': {"references": []}})
        continue

# Save response_list with error handling
try:
    with open('response_ref_extparsing_gemma_pymupdf.json', 'w') as f:
        json.dump(response_list, f)
except Exception as e:
    print(f"Error saving response list: {str(e)}")

# save parsed references in pickle
import pickle
with open('references_ref_extparsing_gemma_pymupdf.pkl', 'wb') as f:
    pickle.dump(references_list, f) 

## test block

In [1]:
from citation_index.core.extractors import ExtractorFactory
import pandas as pd
import json
from tqdm import tqdm
from time import sleep
import random
from Levenshtein import ratio as levenshtein_ratio

from citation_index.llm.prompt_loader import ReferenceExtractionAndParsingPrompt
from citation_index.llm.client import LLMClient

pdf_df = pd.read_csv("../../EXgoldstandard/Goldstandard_EXparser/pdf_files_info.csv")


In [3]:
response_list = json.load(open('response_ref_extparsing_claude_pymupdf.json'))


{'id': 44404, 'response': '<start>\n{\n    "references": []\n}\n<end>'}

## Evaluation Results


In [2]:
from citation_index.core.extractors import ExtractorFactory
import pandas as pd
import json
from tqdm import tqdm
from time import sleep
import random
import pickle

from citation_index.core.models import References
from citation_index.evaluation.ref_metrics import RefEvaluator


In [3]:
evaluator = RefEvaluator(mode='exact')
from excite_helper import evaluate_whole_dataset


pred_pkl_path = "references_ref_extparsing_deepseek_pymupdf.pkl"
xml_dir = "../../EXgoldstandard/Goldstandard_EXparser/all_xml"
overall_metrics, per_doc_df = evaluate_whole_dataset(pred_pkl_path, xml_dir)

Processing 36325
DEBUG: full_title = Visual thinking | authors = [Person(first_name='Rudolf', middle_name=None, surname='Arnheim', name_link=None, role_name=None)] | editors = []
DEBUG: full_title = How to do things with words: The William James lectures delivered at Harvard University in 1955 | authors = [Person(first_name='John L.', middle_name=None, surname='Austin', name_link=None, role_name=None)] | editors = []
DEBUG: full_title = Games people play: The basic handbook of transactional analysis | authors = [Person(first_name='Eric', middle_name=None, surname='Berne', name_link=None, role_name=None)] | editors = []
DEBUG: full_title = The rainbow of desire: The Boal method of theatre and therapy | authors = [Person(first_name='Augusto', middle_name=None, surname='Boal', name_link=None, role_name=None)] | editors = []
DEBUG: full_title = Gender trouble. Feminism and the subversion of identity | authors = [Person(first_name='Judith', middle_name=None, surname='Butler', name_link=None

In [None]:
overall_metrics, per_doc_df = evaluate_whole_dataset(pred_pkl_path, xml_dir, 
                                                     focus_fields=['authors','analytic_title','publication_date'], 
                                                     fuzzy_threshold=90,
                                                     mode='fuzzy')

In [4]:
per_doc_df

pdf_df = pd.read_csv("../../EXgoldstandard/Goldstandard_EXparser/pdf_files_info.csv")
pdf_df = pdf_df.reset_index(drop=True)
pdf_df['file_id'] = pdf_df['file_id'].astype(str)
pdf_df_deepseek = pd.merge(pdf_df, per_doc_df, on='file_id', how='left')
pdf_df_deepseek.head()

# group by class and lang and calculate avg precision, recall, f1_score, avg_levenshtein_ratio
pdf_df_deepseek.groupby(['class', 'lang']).agg({'precision': 'mean', 'recall': 'mean', 'micro_f1': 'mean', 'macro_f1': 'mean'}).reset_index()


Unnamed: 0,class,lang,precision,recall,micro_f1,macro_f1
0,1,de,0.772643,0.786854,0.766111,0.710629
1,1,en,0.878612,0.871083,0.864509,0.826236
2,2,de,0.614464,0.544346,0.564886,0.382612
3,3,de,0.767324,0.535241,0.61625,0.429896


In [5]:
pred_pkl_path_claude = "references_ref_extparsing_claude_pymupdf.pkl"
overall_metrics, per_doc_df = evaluate_whole_dataset(pred_pkl_path_claude, xml_dir)

No prediction for 38687
No prediction for 41740
No prediction for 34484
No prediction for 27017
No prediction for 27598
No prediction for 20430
No prediction for 44404
No prediction for 47119
No prediction for 31474
No prediction for 39206
No prediction for 32965
No prediction for 31920
No prediction for 1630
No prediction for 28444
No prediction for 32408
No prediction for 28254
No prediction for 46228
No prediction for 45330
No prediction for 12526
No prediction for 35667
No prediction for 27667
No prediction for 48277
No prediction for 27048
No prediction for 38381
No prediction for 24743
No prediction for 22654
No prediction for 12797
No prediction for 34793
No prediction for 54462
No prediction for 23371
No prediction for 54062
No prediction for 35926
No prediction for 24817
No prediction for 39476
No prediction for 4930
No prediction for 6026 
No prediction for 26781
No prediction for 36593
No prediction for 11448
No prediction for 16521
No prediction for 18268 
No prediction for

In [6]:
overall_metrics, per_doc_df = evaluate_whole_dataset(pred_pkl_path_claude, xml_dir, 
                                                     focus_fields=['authors','analytic_title','publication_date'], 
                                                     fuzzy_threshold=90,
                                                     mode='fuzzy')

No prediction for 38687
No prediction for 41740
No prediction for 34484
No prediction for 27017
No prediction for 27598
No prediction for 20430
No prediction for 44404
No prediction for 47119
No prediction for 31474
No prediction for 39206
No prediction for 32965
No prediction for 31920
No prediction for 1630
No prediction for 28444
No prediction for 32408
No prediction for 28254
No prediction for 46228
No prediction for 45330
No prediction for 12526
No prediction for 35667
No prediction for 27667
No prediction for 48277
No prediction for 27048
No prediction for 38381
No prediction for 24743
No prediction for 22654
No prediction for 12797
No prediction for 34793
No prediction for 54462
No prediction for 23371
No prediction for 54062
No prediction for 35926
No prediction for 24817
No prediction for 39476
No prediction for 4930
No prediction for 6026 
No prediction for 26781
No prediction for 36593
No prediction for 11448
No prediction for 16521
No prediction for 18268 
No prediction for

In [7]:
pdf_df_claude = pd.merge(pdf_df, per_doc_df, on='file_id', how='left')
pdf_df_claude.head()

# group by class and lang and calculate avg precision, recall, f1_score, avg_levenshtein_ratio
pdf_df_claude.groupby(['class', 'lang']).agg({'precision': 'mean', 'recall': 'mean', 'micro_f1': 'mean', 'macro_f1': 'mean'}).reset_index()


Unnamed: 0,class,lang,precision,recall,micro_f1,macro_f1
0,1,de,0.598054,0.553976,0.565311,0.535619
1,1,en,0.81363,0.686713,0.738369,0.73371
2,2,de,0.482026,0.439843,0.448853,0.3409
3,3,de,0.434846,0.29518,0.332579,0.269023
