In [1]:
import json
import os
import pickle
import random
from time import sleep

import pandas as pd
from Levenshtein import ratio as levenshtein_ratio
from tqdm import tqdm

from citation_index.core.extractors import ExtractorFactory
from citation_index.llm.client import LLMClient
from citation_index.llm.prompt_loader import ReferenceExtractionAndParsingPrompt

In [2]:
def get_sample_data(pdf_df, references_data, n_samples=5):
    """Print a small sample of the PDF table and of the references for a sanity check.

    Args:
        pdf_df: DataFrame whose rows are printed; at most ``n_samples`` randomly
            sampled rows are shown.
        references_data: Mapping of file id -> dict containing a ``'references'``
            list. The first ``n_samples`` keys (in mapping order) are shown,
            with up to three references each.
        n_samples: Upper bound on the number of rows / papers to display.

    Returns:
        None. Everything is written to stdout.
    """
    print("\n=== SAMPLE PDF DATA ===")
    if not pdf_df.empty:
        # DataFrame.sample raises ValueError when asked for more rows than the
        # frame holds (sampling without replacement), so clamp the request.
        rows_to_show = min(n_samples, len(pdf_df))
        print(pdf_df.sample(rows_to_show).to_string(index=False))

    print("\n=== SAMPLE REFERENCES DATA ===")
    if references_data:
        shown_ids = list(references_data.keys())[:n_samples]
        for paper_no, file_id in enumerate(shown_ids, start=1):
            refs = references_data[file_id]['references']
            print(f"Paper {paper_no} (ID: {file_id}):")
            print(f"  Number of references: {len(refs)}")
            print("  First few references:")
            for ref_no, ref in enumerate(refs[:3], start=1):
                print(f"    {ref_no}. {ref}")
            if len(refs) > 3:
                print(f"    ... and {len(refs) - 3} more")
            print()


# Load the gold-standard PDF metadata table and the per-file reference lists.
pdf_df = pd.read_csv("../../EXgoldstandard/Goldstandard_EXparser/pdf_files_info.csv")

# Use a context manager so the JSON file handle is closed deterministically
# instead of being left open until garbage collection.
with open("../../EXgoldstandard/Goldstandard_EXparser/all_references.json", "r", encoding="utf-8") as references_file:
    references_data = json.load(references_file)

# Each value is expected to carry a "references" list; count them across all files.
total_references = sum(len(data["references"]) for data in references_data.values())
print('Total references:', total_references)

Total references: 10171


In [4]:


# Single-document smoke test: extract the text of one gold-standard PDF and
# build the reference extraction + parsing prompt from it.
#
# SECURITY: the commented-out client setup that used to sit here embedded a
# real Anthropic API key. It has been removed; that key must be considered
# leaked and should be revoked/rotated. Load credentials from the environment
# (e.g. os.environ) rather than pasting them into the notebook.

filepath = '../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/1181.pdf'

extractor = ExtractorFactory.create("pymupdf")
result = extractor.extract(filepath)

# NOTE(review): absolute, machine-specific prompt path - will not resolve on other machines.
prompts = ReferenceExtractionAndParsingPrompt(input_text=result, prompt = '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing.md')

In [5]:
def parse(response: str,start_tag: str = "<start>",end_tag: str = "<end>") -> dict:
    """Parse an LLM response into a validated ``{"references": [...]}`` dict.

    The response may be wrapped in ``start_tag`` / ``end_tag`` and/or in a
    Markdown code fence; both wrappers are removed before JSON decoding.

    A reference is kept only if it has the shape
    ``{"reference": {"authors": [...], "title": ..., ...}}`` and at least one
    author that is a dict containing both ``"first_name"`` and ``"surname"``.
    Authors failing that check are dropped individually.

    Args:
        response: Raw model output.
        start_tag: Optional leading marker to strip.
        end_tag: Optional trailing marker to strip.

    Returns:
        ``{"references": [{"reference": {...}}, ...]}``; the list is empty when
        the top-level structure is not the expected one.

    Raises:
        json.JSONDecodeError: If the unwrapped text is not valid JSON (callers
            use this to trigger a retry).
    """
    # Tolerate whitespace/newlines around the tags and the fence.
    text = response.strip()
    if start_tag and text.startswith(start_tag):
        text = text[len(start_tag):]
    # Guard against an empty end_tag: text[:-0] would wipe the whole string.
    if end_tag and text.endswith(end_tag):
        text = text[:-len(end_tag)]
    text = text.strip()

    # Remove a Markdown code fence (```json ... ```).
    if text.startswith("```"):
        fenced_lines = text.split("\n")[1:]
        # Only drop the last line when it really is the closing fence, so a
        # truncated response does not lose a line of JSON.
        if fenced_lines and fenced_lines[-1].strip().startswith("```"):
            fenced_lines = fenced_lines[:-1]
        text = "\n".join(fenced_lines)

    data = json.loads(text)

    # Validate the structure matches our expected schema
    if not isinstance(data, dict) or "references" not in data:
        print("Invalid response format: missing 'references' key")
        return {"references": []}

    references = data["references"]
    if not isinstance(references, list):
        print("Invalid response format: 'references' is not a list")
        return {"references": []}

    valid_references = []
    for ref in references:
        if not isinstance(ref, dict) or "reference" not in ref:
            continue

        ref_data = ref["reference"]
        if not isinstance(ref_data, dict):
            continue

        # Check for required fields
        if not all(key in ref_data for key in ["authors", "title"]):
            continue

        authors = ref_data.get("authors", [])
        if not isinstance(authors, list):
            continue

        # Validate every author individually. (Previously this check sat
        # outside the author loop, so only the last author was ever kept and
        # an empty author list read an unbound/stale variable.)
        valid_authors = [
            author
            for author in authors
            if isinstance(author, dict)
            and all(key in author for key in ["first_name", "surname"])
        ]

        if valid_authors:
            ref_data["authors"] = valid_authors
            valid_references.append({"reference": ref_data})

    return {"references": valid_references}
        
    

    

In [6]:
def call_api_with_backoff(client, prompt, start_tag, end_tag, max_tokens, max_retries=3):
    """Invoke ``client.call_with_continuation``, retrying on rate-limit errors.

    An error counts as a rate-limit error when its lower-cased message contains
    "rate limit" or "429". Such errors are retried after an exponentially
    growing, jittered pause; every other error is re-raised immediately.

    Args:
        client: Object exposing ``call_with_continuation(prompt=, start_tag=,
            end_tag=, max_tokens=)`` that returns a ``(messages, response)`` pair.
        prompt, start_tag, end_tag, max_tokens: Forwarded unchanged to the client.
        max_retries: Total number of attempts.

    Returns:
        The ``(messages, response)`` tuple from the first successful call.

    Raises:
        Exception: The client's own error (non-rate-limit, or rate-limit on the
            final attempt), or a generic "Max retries exceeded" when the loop
            finishes without an attempt having returned or raised.
    """
    first_wait_s = 60   # pause before the first retry, in seconds
    wait_cap_s = 180    # upper bound on any single pause (3 minutes)
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            messages, response = client.call_with_continuation(
                prompt=prompt,
                start_tag=start_tag,
                end_tag=end_tag,
                max_tokens=max_tokens
            )
        except Exception as err:
            lowered = str(err).lower()
            rate_limited = "rate limit" in lowered or "429" in lowered

            # Anything that is not a rate-limit problem is not worth retrying.
            if not rate_limited:
                raise err

            if attempt >= final_attempt:
                print(f"Max retries reached for rate limiting")
                raise err

            # Exponential growth plus up to 10 s of jitter, capped at wait_cap_s.
            delay = min(first_wait_s * (2 ** attempt) + random.uniform(0, 10), wait_cap_s)
            print(f"Rate limit hit, waiting {delay:.1f} seconds before retry {attempt + 1}/{max_retries}")
            sleep(delay)
        else:
            return messages, response

    raise Exception("Max retries exceeded")

In [None]:
# Run reference extraction + parsing on the whole EXCITE dataset.
extractor = ExtractorFactory.create("pymupdf")

# --- LLM backend configuration -------------------------------------------
# SECURITY: API keys must never be stored in the notebook. The key is read from
# the LLM_API_KEY environment variable. The DeepSeek, Anthropic and RunPod keys
# that were previously hard-coded in this cell must be considered leaked and
# should be revoked/rotated.
#
# Other backends used before (export the matching key as LLM_API_KEY):
#   DeepSeek : endpoint = 'https://api.deepseek.com/v1',   model = 'deepseek-chat',            MAX_OUTPUT_TOKEN = 8192
#   Anthropic: endpoint = 'https://api.anthropic.com/v1/', model = 'claude-sonnet-4-20250514', MAX_OUTPUT_TOKEN = 20000

# gemma put max 50000 context window
endpoint = 'https://cy2uoaiag9ska4-8000.proxy.runpod.net/v1'
model  = 'google/gemma-3-27b-it'
api_key = os.environ.get('LLM_API_KEY')
if not api_key:
    raise RuntimeError("Set the LLM_API_KEY environment variable before running this cell")
MAX_OUTPUT_TOKEN = 3000
client = LLMClient(endpoint, model, api_key)

# Total number of parse attempts per file (1 initial + retries after a JSON error).
MAX_PARSE_ATTEMPTS = 2

response_list = []
references_list = []
# Index by file_id so page_count can be looked up with .loc inside the loop.
pdf_df.index = pdf_df['file_id']
# The default is the original machine-specific location; override it via the
# environment when running elsewhere.
prompts_path = os.environ.get(
    'CITATION_INDEX_PROMPT_PATH',
    '/Users/alex/docs/code/Odoma/citation_index/prompts/reference_extraction_and_parsing.md',
)

pbar = tqdm(pdf_df['file_id'], desc="📄 Processing PDFs", unit="file",
           bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')


def _request_references(prompt_text):
    """Send one extraction request (with rate-limit backoff); return (messages, response)."""
    return call_api_with_backoff(
        client,
        prompt_text,
        '<start>',
        ['<end>', '</end>'],
        MAX_OUTPUT_TOKEN,
    )


# `file_id` rather than `id`, so the builtin id() is not shadowed for later cells.
for file_id in pbar:
    try:
        if pdf_df.loc[file_id, 'page_count'] > 100:
            print(f"Skipping {file_id} because page_count is too large")
            continue

        filepath = f'../../EXgoldstandard/Goldstandard_EXparser/all_pdfs/{file_id}.pdf'
        result = extractor.extract(filepath)

        # Build the prompt from the extracted text (the whole document is sent at once).
        prompts = ReferenceExtractionAndParsingPrompt(input_text=result, prompt=prompts_path)

        # Use the API call with backoff
        try:
            messages, response = _request_references(prompts.prompt)
        except Exception as e:
            print(f"API call failed for file {file_id}: {str(e)}")
            response_list.append({'id': file_id, 'response': None})
            references_list.append({'id': file_id, 'references': {"references": []}})
            continue

        # Parse the response; on malformed JSON, ask the model again and re-parse.
        retry_count = 0
        references = None

        while retry_count < MAX_PARSE_ATTEMPTS:
            try:
                references = parse(response)
                break  # Success, exit retry loop
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for file {file_id} (attempt {retry_count + 1}): {e}")
                retry_count += 1
                if retry_count >= MAX_PARSE_ATTEMPTS:
                    print(f"Failed to parse JSON for file {file_id} after {MAX_PARSE_ATTEMPTS} attempts")
                    references = {"references": []}
                    break
                # Retry API call for JSON parsing errors
                try:
                    messages, response = _request_references(prompts.prompt)
                except Exception as api_e:
                    print(f"API retry failed for JSON parsing: {str(api_e)}")
                    references = {"references": []}
                    break
            except Exception as e:
                print(f"Unexpected error parsing file {file_id}: {str(e)}")
                references = {"references": []}  # Default empty result
                break

        response_list.append({'id': file_id, 'response': response})
        references_list.append({'id': file_id, 'references': references})

    except Exception as e:
        # Keep the batch going: record an empty result for this file and move on.
        print(f"Error processing file {file_id}: {str(e)}")
        response_list.append({'id': file_id, 'response': None})
        references_list.append({'id': file_id, 'references': {"references": []}})
        continue

# Save response_list with error handling
try:
    with open('response_ref_extparsing_gemma_pymupdf.json', 'w') as f:
        json.dump(response_list, f)
except Exception as e:
    print(f"Error saving response list: {str(e)}")

# save parsed references in pickle
with open('references_ref_extparsing_gemma_pymupdf.pkl', 'wb') as f:
    pickle.dump(references_list, f)

## Evaluation Results


In [1]:
from citation_index.core.extractors import ExtractorFactory
import pandas as pd
import json
from tqdm import tqdm
from time import sleep
import random
import pickle

from citation_index.core.models import References
from citation_index.evaluation.ref_metrics import RefEvaluator


In [2]:
from excite_helper import evaluate_whole_dataset

# Exact-match evaluator instance; note it is not passed to the call below.
evaluator = RefEvaluator(mode="exact")

# Inputs for the dataset-level evaluation: the pickled predictions and the
# directory of XML files they are scored against.
xml_dir = "../../EXgoldstandard/Goldstandard_EXparser/all_xml"
pred_pkl_path = "references_ref_extparsing_deepseek_pymupdf.pkl"

# Evaluate with the helper's default settings.
overall_metrics, per_doc_df = evaluate_whole_dataset(pred_pkl_path, xml_dir)

No prediction for 32965
No prediction for 28444
No prediction for 12526
No prediction for 27667
No prediction for 26236
No prediction for 22654
No prediction for 4930
No prediction for 6026 
No prediction for 35267
No prediction for 18268 
No prediction for 44849
No prediction for 32707
No prediction for 42768
No prediction for 48208
No prediction for 42309
Total files: 350
Missing predictions: 15
Missing GTs: 0
Reference eval (exact): {'precision': 0.7046529968454258, 'recall': 0.5087091895738813, 'micro_f1': 0.5908597761957408, 'macro_f1': 0.5243078285633297, 'per_class_f1': {'monographic_title': 0.6608767576509513, 'authors': 0.6010647432594883, 'publisher': 0.467693193265218, 'publication_date': 0.8245757900712798, 'publication_place': 0.0, 'analytic_title': 0.6730746688547649, 'journal_title': 0.6605086098756985, 'volume': 0.7485127095727419, 'pages': 0.5799196787148594, 'editors': 0.0, 'issue': 0.7449127906976745, 'refs': 0.0, 'footnote_number': 0.0, 'translator': 0.0, 'cited_ran

In [4]:
# Re-run the evaluation in fuzzy mode, restricted to a handful of fields and
# using a fuzzy threshold of 90.
fuzzy_eval_options = {
    "focus_fields": ['authors', 'title', 'year', 'doi'],
    "fuzzy_threshold": 90,
    "mode": 'fuzzy',
}
overall_metrics, per_doc_df = evaluate_whole_dataset(
    pred_pkl_path,
    xml_dir,
    **fuzzy_eval_options,
)

No prediction for 32965
No prediction for 28444
No prediction for 12526
No prediction for 27667
No prediction for 26236
No prediction for 22654
No prediction for 4930
No prediction for 6026 
No prediction for 35267
No prediction for 18268 
No prediction for 44849
No prediction for 32707
No prediction for 42768
No prediction for 48208
No prediction for 42309
Total files: 350
Missing predictions: 15
Missing GTs: 0
Reference eval (exact): {'precision': 0.8787384448069603, 'recall': 0.7909936368086148, 'micro_f1': 0.8325605358062854, 'macro_f1': 0.7000391143233026, 'per_class_f1': {'authors': 0.8325605358062854}}
   precision    recall  micro_f1  macro_f1 file_id
0   0.822581  0.980769  0.894737  0.909091   36325
1   0.886792  0.959184  0.921569  0.918182    9082
2   1.000000  0.957447  0.978261  0.963964   38687
3   0.924528  0.907407  0.915888  0.816667   20786
4   0.869565  0.769231  0.816327  0.711770   18437


In [5]:
# Join the per-document scores onto the PDF metadata, then average every
# metric within each (class, lang) group.
metric_columns = ['precision', 'recall', 'micro_f1', 'macro_f1']

pdf_df = pd.read_csv("../../EXgoldstandard/Goldstandard_EXparser/pdf_files_info.csv").reset_index(drop=True)
# Cast the join key to str before merging - presumably per_doc_df['file_id']
# holds strings; verify against evaluate_whole_dataset.
pdf_df['file_id'] = pdf_df['file_id'].astype(str)

# Left join keeps every PDF, including those without a prediction row.
pdf_df_deepseek = pdf_df.merge(per_doc_df, on='file_id', how='left')

# Last expression of the cell: the grouped means are what gets displayed.
(
    pdf_df_deepseek
    .groupby(['class', 'lang'])
    .agg({column: 'mean' for column in metric_columns})
    .reset_index()
)


Unnamed: 0,class,lang,precision,recall,micro_f1,macro_f1
0,1,de,0.766069,0.825987,0.778546,0.692326
1,1,en,0.875189,0.88874,0.870518,0.821393
2,2,de,0.625801,0.687841,0.633569,0.370871
3,3,de,0.791721,0.567967,0.647094,0.435181
