In [24]:
from src.pdf_extractors import ExtractorFactory
import pandas as pd
import json
from Levenshtein import distance as levenshtein_distance

# EXCITE data

In [8]:
def get_sample_data(pdf_df, references_data, n_samples=5):
    """Display sample data for verification"""
    print(f"\n=== SAMPLE PDF DATA ===")
    if not pdf_df.empty:
        print(pdf_df.sample(n_samples).to_string(index=False))
    
    print(f"\n=== SAMPLE REFERENCES DATA ===")
    if references_data:
        sample_keys = list(references_data.keys())[:n_samples]
        for i, file_id in enumerate(sample_keys):
            ref_data = references_data[file_id]
            print(f"Paper {i+1} (ID: {file_id}):")
            print(f"  Number of references: {len(ref_data['references'])}")
            print(f"  First few references:")
            for j, ref in enumerate(ref_data['references'][:3]):
                print(f"    {j+1}. {ref}")
            if len(ref_data['references']) > 3:
                print(f"    ... and {len(ref_data['references']) - 3} more")
            print()


pdf_df = pd.read_csv("EXgoldstandard/Goldstandard_EXparser/pdf_files_info.csv")
references_data = json.load(open("EXgoldstandard/Goldstandard_EXparser/all_references.json", "r", encoding="utf-8"))

total_references = sum(len(data["references"]) for data in references_data.values())
print('Total references:', total_references)

Total references: 10171


In [9]:
get_sample_data(pdf_df, references_data)


=== SAMPLE PDF DATA ===
 file_id  filename  class lang                                                                                                                          file_path
   34481 34481.pdf      1   de      EXgoldstandard/Goldstandard_EXparser/1-German_papers/1-German_papers(with_reference_section_at_end_of_paper)/1-pdfs/34481.pdf
   20011 20011.pdf      1   de      EXgoldstandard/Goldstandard_EXparser/1-German_papers/1-German_papers(with_reference_section_at_end_of_paper)/1-pdfs/20011.pdf
   39854 39854.pdf      1   de      EXgoldstandard/Goldstandard_EXparser/1-German_papers/1-German_papers(with_reference_section_at_end_of_paper)/1-pdfs/39854.pdf
   19396 19396.pdf      3   de EXgoldstandard/Goldstandard_EXparser/1-German_papers/3-German_papers(with_reference_in_footnote_and_end_of_paper)/1-pdfs/19396.pdf
   52971 52971.pdf      1   en    EXgoldstandard/Goldstandard_EXparser/2-English_papers/1-English_papers(with_reference_section_at_end_of_paper)/1-pdfs/52971.pdf

==

In [11]:
REFERENCE_EXTRACTION_PROMPT_SYS = """You are an expert in scholarly references and citations. You help the user to extract citation data from scientific works."""

REFERENCE_EXTRACTION_PROMPT_USER = """
Extract all references from the given text. Output each reference as plain text, one reference per line. Only output the reference text, nothing else. Do not include any explanations, numbering, or additional formatting.

TEXT: <<<{input_text}>>>

## Example Usage
**Input Text:**
This paper builds on previous work (Smith et al., 2020; Jones, 2019). According to recent studies...

References:
1. Smith, J., Brown, A., & Wilson, C. (2020). Machine learning approaches in natural language processing. Journal of AI Research, 15(3), 245-267.
2. Jones, M. (2019). Deep learning fundamentals. MIT Press.
3. Davis, R., & Lee, S. (2021). Neural networks and their applications. Nature Machine Intelligence, 3(2), 112-125.

**Expected Output:**
Smith, J., Brown, A., & Wilson, C. (2020). Machine learning approaches in natural language processing. Journal of AI Research, 15(3), 245-267.
Jones, M. (2019). Deep learning fundamentals. MIT Press.
Davis, R., & Lee, S. (2021). Neural networks and their applications. Nature Machine Intelligence, 3(2), 112-125.
"""

In [12]:
from openai import OpenAI

endpoint = 'https://api.deepseek.com/v1'
model = 'deepseek-chat'
api_key = 'sk-282f6b9a54b64bd98bfcd85c0c8f5aab' # deepseek

client = OpenAI(
    api_key=api_key,
    base_url=endpoint,
)

filepath = 'EXgoldstandard/Goldstandard_EXparser/all_pdfs/1181.pdf'

extractor = ExtractorFactory.create("pymupdf")
result = extractor.extract(filepath)


response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": REFERENCE_EXTRACTION_PROMPT_SYS},
        {"role": "user", "content": REFERENCE_EXTRACTION_PROMPT_USER.format(input_text=result)}
    ]
)

print(response.choices[0].message.content)

Aron, Raymond/Dominique Schnapper (1988): Power, modernity, and sociology : selected sociological writings. Aldershot, Hants, England Brookfield, Vt., USA: E. Elgar ;
Collins, Harry (2004): Gravity's shadow : the search for gravitational waves. Chicago: University of Chicago Press.
Collins, Harry M. (1981): Stages in the Empirical Programme of Relativism. In: Social Studies of Science, 11 S. 3-10.
Collins, Harry M. (1983): An Empirical Relativist Programme in the Sociology of Scientific Knowledge. In: K.D. Knorr-Cetina/M. Mulkay (Hrsg.): Science observed. Perspectives on the social study of science. London. Sage: S: 85-113
Collins, Harry M. (1985): Changing order : replication and induction in scientific practice, London u.a.: Sage.
Collins, Harry/Trevor Pinch (1999): Golem der Forschung (Der). Wie unsere Wissenschaft die Natur erfindet. Berlin: Berlin Verlag.
Dosi, Giovanni (1982): Technological Paradigms and Technological Trajectories. A Suggested Interpretation of the Determinants a

In [15]:
references_data['1181']['references']

['Aron, Raymond/Dominique Schnapper (1988): Power, modernity, and sociology : selected sociological writings. Aldershot, Hants, England Brookfield, Vt., USA: E. Elgar ;',
 "Collins, Harry (2004): Gravity's shadow : the search for gravitational waves. Chicago: University of Chicago Press.",
 'Collins, Harry M. (1981): Stages in the Empirical Programme of Relativism. In: Social Studies of Science, 11 S. 3-10.',
 'Collins, Harry M. (1983): An Empirical Relativist Programme in the Sociology of Scientific Knowledge. In: K.D. Knorr-Cetina/M. Mulkay (Hrsg.): Science observed. Perspectives on the social study of science. London. Sage: S: 85-113',
 'Collins, Harry M. (1985): Changing order : replication and induction in scientific practice, London u.a.: Sage.',
 'Collins, Harry/Trevor Pinch (1999): Golem der Forschung (Der). Wie unsere Wissenschaft die Natur erfindet. Berlin: Berlin Verlag.',
 'Dosi, Giovanni (1982): Technological Paradigms and Technological Trajectories. A Suggested Interpreta

In [27]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def calculate_matrix(references_data, response_list):
    """
    Calculate Precision, Recall, F1 and Levenshtein distance for PDF reference extraction.
    
    Args:
        references_data: List of ground truth references (strings)
        response_list: List of extracted references (strings)
    
    Returns:
        Dict containing precision, recall, f1_score, and avg_levenshtein_distance
    """

    
    def normalize_text(text):
        """Basic text normalization"""
        if not isinstance(text, str):
            text = str(text)
        return text.strip()
    
    # Normalize inputs and remove empty strings
    gt_refs = [normalize_text(ref) for ref in references_data if normalize_text(ref)]
    pred_refs = [normalize_text(ref) for ref in response_list if normalize_text(ref)]
    
    n_gt = len(gt_refs)
    n_pred = len(pred_refs)
    
    # Handle edge cases
    if n_gt == 0 and n_pred == 0:
        return {
            'precision': 1.0,
            'recall': 1.0,
            'f1_score': 1.0,
            'avg_levenshtein_distance': 0.0
        }
    
    if n_gt == 0:
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0,
            'avg_levenshtein_distance': float('inf')
        }
    
    if n_pred == 0:
        return {
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0,
            'avg_levenshtein_distance': float('inf')
        }
    
    # Create distance matrix
    distance_matrix = np.zeros((n_gt, n_pred))
    
    for i, gt_ref in enumerate(gt_refs):
        for j, pred_ref in enumerate(pred_refs):
            distance_matrix[i, j] = levenshtein_distance(gt_ref, pred_ref)
    
    # Find optimal assignment using Hungarian algorithm (minimize total distance)
    row_indices, col_indices = linear_sum_assignment(distance_matrix)
    
    # Define similarity threshold based on normalized distance
    SIMILARITY_THRESHOLD = 0.8  # Accept matches with normalized similarity >= 0.8
    
    matched_pairs = 0
    total_distance = 0
    
    for i, j in zip(row_indices, col_indices):
        distance = distance_matrix[i, j]
        max_len = max(len(gt_refs[i]), len(pred_refs[j]))
        normalized_similarity = 1 - (distance / max_len) if max_len > 0 else 1.0
        
        if normalized_similarity >= SIMILARITY_THRESHOLD:
            matched_pairs += 1
            total_distance += distance
    
    # Calculate metrics
    precision = matched_pairs / n_pred
    recall = matched_pairs / n_gt
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    avg_levenshtein_distance = total_distance / matched_pairs if matched_pairs > 0 else float('inf')
    
    return {
        'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1_score': round(f1_score, 4),
        'avg_levenshtein_distance': round(avg_levenshtein_distance, 2) if avg_levenshtein_distance != float('inf') else float('inf')
    }


# Example usage

calculate_matrix(references_data['1181']['references'], response.choices[0].message.content.split('\n'))

{'precision': 1.0,
 'recall': 1.0,
 'f1_score': 1.0,
 'avg_levenshtein_distance': np.float64(0.0)}

In [None]:
# run on whole excite dataset
extractor = ExtractorFactory.create("pymupdf")
from tqdm import tqdm

response_list = []
matrix = []
for id in tqdm(pdf_df['file_id']):
    filepath = f'EXgoldstandard/Goldstandard_EXparser/all_pdfs/{id}.pdf'
    result = extractor.extract(filepath)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": REFERENCE_EXTRACTION_PROMPT_SYS},
            {"role": "user", "content": REFERENCE_EXTRACTION_PROMPT_USER.format(input_text=result)}
        ]
    )
    response_list.append({'id': id, 'response': response.choices[0].message.content})
    matrix.append(calculate_matrix(references_data[str(id)]['references'], response.choices[0].message.content.split('\n')))


# save response_list
with open('response_list.json', 'w') as f:
    json.dump(response_list, f)


# summarize matrix and print precision, recall, f1_score, avg_levenshtein_distance
matrix_df = pd.DataFrame(matrix)
print(matrix_df.describe())


  1%|▏         | 5/351 [02:16<2:37:58, 27.39s/it]

MuPDF error: format error: No default Layer config



  2%|▏         | 6/351 [03:39<4:25:13, 46.13s/it]