# # Step 1: Setup and Configuration
Import all necessary libraries and configure input/output paths.
The `sample` flag can be set to `True` to run the pipeline on a smaller dataset for testing purposes.

In [1]:
import pandas as pd
import os
import json
import re
from lxml import etree as ET
from uuid import uuid4
import nltk
from nltk.tokenize import MWETokenizer
import random

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt')

# --- Configuration ---
BASE_DATA_DIR = './'
RAW_XML_DIR = os.path.join(BASE_DATA_DIR, 'grobid_full_text')
TEST_TRAIN_DIR = os.path.join(BASE_DATA_DIR, 'test_train_data')
FINAL_OUTPUT_DIR = os.path.join(BASE_DATA_DIR, 'database_ready')
    
# Create output directories if they don't exist
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)

# Define XML Namespaces for TEI parsing
NS = {
    'tei': 'http://www.tei-c.org/ns/1.0',
    'xml': 'http://www.w3.org/XML/1998/namespace'
}
print(f"Raw XML Input Directory: {RAW_XML_DIR}")
print(f"Final Output Directory: {FINAL_OUTPUT_DIR}")

Raw XML Input Directory: ./grobid_full_text
Final Output Directory: ./database_ready


# # Step 2: Helper Functions
This section contains all the helper functions used throughout the pipeline for parsing, cleaning, and structuring data.

In [2]:

# --- XML Parsing and Cleaning Functions (from 1__preprocess.ipynb & 2__data_cleaning.ipynb) ---

def convert_ref_tags(text):
    """Converts various <ref> tag formats into a standardized one."""
    # Convert ref tags without a target
    if re.search(r'<ref type="bibr">(.*?)</ref>', text):
        pattern = r'<ref type="bibr">(.*?)</ref>'
        replacement = r'<ref type="single" target="#n999">\1</ref>'
        text = re.sub(pattern, replacement, text)
    # Standardize targeted ref tags
    pattern = r'<ref type="bibr" target="#(.*?)">(.*?)</ref>'
    replacement = r'<ref type="single" target="#\1">\2</ref>'
    return re.sub(pattern, replacement, text)

def replace_with_gref(match):
    """Groups consecutive single reference tags into one group reference tag."""
    all_refs = match.group(0)
    matches = re.findall(r'<ref type="single" target="(#[a-z]\d+)">(.*?)</ref>', all_refs)
    ids = ';'.join([match[0] for match in matches])
    content = ' '.join([match[1] for match in matches])
    return f'<ref type="group" target="{ids}">{content}</ref>'

def process_paragraphs(paragraphs):
    """Cleans and processes a list of paragraphs."""
    processed_paragraphs = []
    for paragraph in paragraphs:
        processed_paragraph = convert_ref_tags(paragraph)
        # Pattern for consecutive single ref tags
        pattern = r'(?:<ref type="single" target="#[a-z]\d+">[^<]+</ref>\s*?)+<ref type="single" target="#[a-z]\d+">([^<]+)</ref>'
        clean_paragraph = re.sub(pattern, replace_with_gref, processed_paragraph)
        # Remove paragraph tags and other unwanted tags
        clean_paragraph = re.sub(r'<p[^>]+>(.*?)</p>', r'\1', clean_paragraph)
        clean_paragraph = re.sub(r'<(?=(?!/))(?!ref)[^>]+>[^<]+<[^>]+>', '', clean_paragraph)
        processed_paragraphs.append(clean_paragraph)
    return processed_paragraphs

def clean_div(div):
    """Cleans a section (div) by removing unwanted tags and structuring paragraphs."""
    # Remove non-bibr ref tags but keep content
    cleaned_div = re.sub(r'<ref type="(?!bibr)[^>]+>([^<]+)</ref>', r'\1', div)
    # Remove p tags around formulas
    cleaned_div = re.sub(r'(</p>)?<formula[^>]+>([^<]+)<.*?/formula>(<p>)?', r' \2 \3\1', cleaned_div)
    cleaned_div = re.sub(r'<p></p>', '', cleaned_div)
    # Extract content within <p> tags
    cleaned_div = re.findall(r'<p>(.*?)</p>', cleaned_div)
    return cleaned_div

# --- Data Structuring Functions (from 3__structure_data.ipynb) ---

def concat_auth(authors):
    """Concatenates author names into a single string."""
    if not authors:
        return 'unknown'
    return ' ,'.join([' '.join([v if v is not None else 'unk' for v in auth.values()]) for auth in authors])

def get_uuid_of_doc(title, pub_year, authors, abstract, doc_df, lookup_dict):
    """
    Checks if a document already exists in the DataFrame.
    If yes, returns its UUID. If no, adds it and returns the new UUID.
    Uses a lookup dictionary for efficiency.
    """
    title = title if title else 'unknown'
    pub_year = pub_year if pub_year else 'unknown'
    
    lookup_key = (title.lower(), str(pub_year))
    
    if lookup_key in lookup_dict:
        return lookup_dict[lookup_key]
    else:
        new_id = str(uuid4())
        new_row = {'id': new_id, 'title': title, 'pub_year': pub_year, 'authors': authors, 'abstract': abstract}
        doc_df.loc[len(doc_df)] = new_row
        lookup_dict[lookup_key] = new_id
        return new_id


# # Step 3: Parse Raw XML Data
Iterates through the raw XML files, parses them using lxml, and extracts relevant information into a list of dictionaries.

In [3]:
print("Starting XML parsing...")
parsed_documents = []
error_files = []
xml_files = [f for f in os.listdir(RAW_XML_DIR) if f.endswith('.xml')]

for i, filename in enumerate(xml_files):
    xml_file_path = os.path.join(RAW_XML_DIR, filename)
    print(f"Processing file {i+1}/{len(xml_files)}: {filename}", end='\\r')
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        # Extract metadata
        title = root.find(".//tei:teiHeader//tei:title[@type='main']", NS)
        abstract = root.find('.//tei:teiHeader//tei:abstract//tei:p', NS)
        text_lang = root.find('tei:text', NS).get(f'{{{NS["xml"]}}}lang')
        
        authors = []
        for author in root.findall('.//tei:teiHeader//tei:author/tei:persName', NS):
            first = author.find('tei:forename', NS)
            last = author.find('tei:surname', NS)
            authors.append({'first_name': first.text if first is not None else None, 
                            'last_name': last.text if last is not None else None})

        doc_pub_year_str = filename.split('.')[0]
        doc_pub_year = int(doc_pub_year_str) if doc_pub_year_str.isnumeric() else None

        # Extract sections and paragraphs
        sections = []
        for div in root.findall('.//tei:body/tei:div', NS):
            section_name = div.find('.//tei:head', NS)
            paragraphs_raw = clean_div(ET.tostring(div, encoding='unicode', method='xml'))
            processed_paragraphs = process_paragraphs(paragraphs_raw)
            sections.append({'section_name': section_name.text if section_name is not None else None, 
                             'paragraphs': processed_paragraphs})
        # Extract references
        references = []
        for bibl in root.findall('.//tei:listBibl/tei:biblStruct', NS):
            ref_id = bibl.get(f'{{{NS["xml"]}}}id', "Unknown ID")
            ref_title = bibl.find(".//tei:title[@type='main']", NS)
            if not ref_title:
                ref_title = bibl.find(".//tei:title[@level='a']", NS)
            if not ref_title:
                ref_title = bibl.find(".//tei:title", NS)
            pub_date = bibl.find('.//tei:date[@type="published"]', NS)
            
            ref_authors = []
            for author in bibl.findall('.//tei:author/tei:persName', NS):
                first = author.find('tei:forename', NS)
                last = author.find('tei:surname', NS)
                ref_authors.append({'first_name': first.text if first is not None else None, 
                                    'last_name': last.text if last is not None else None})
            
            pub_year = None
            if pub_date is not None and pub_date.get('when'):
                match = re.search(r'[1-2][0,9]\d{2}', pub_date.get('when'))
                if match:
                    pub_year = int(match.group(0))
            
            references.append({
                'id': ref_id,
                'title': ref_title.text if ref_title is not None else None,
                'authors': ref_authors,
                'pub_year': pub_year,
            })

        parsed_documents.append({
            'filename': filename,
            'title': title.text if title is not None else None,
            'authors': authors,
            'pub_year': doc_pub_year,
            'lang': text_lang,
            'abstract': abstract.text if abstract is not None else None,
            'sections': sections,
            'references': references
        })

    except ET.ParseError:
        error_files.append(filename)
    except Exception as e:
        print(f"Unexpected error processing {filename}: {e}")
        error_files.append(filename)

print(f"\\nParsing complete. Successfully parsed {len(parsed_documents)} files. Encountered {len(error_files)} errors.")

Starting XML parsing...
Processing file 1/1: 2025.findings-acl.1259.pdf.tei.xml\r\nParsing complete. Successfully parsed 1 files. Encountered 0 errors.


  if not ref_title:
  if not ref_title:


# # Step 4: Filter and Clean Parsed Data
This step applies quality filters to the parsed data. It removes documents that are likely to be of low quality or irrelevant.

In [4]:
print("Filtering and cleaning documents...")
cleaned_documents = []
total_paragraphs_count = 0

for doc in parsed_documents:
    # --- Apply Filters ---
    # Skip documents with faulty metadata or insufficient size
    if (doc['title'] is None or
        len(doc['authors']) < 1 or
        any(auth.get('last_name') is None for auth in doc['authors']) or
        doc['pub_year'] is None or not (1950 <= doc['pub_year'] <= 2025) or
        len(doc['references']) < 5 or
        len(doc['sections']) < 3 or
        doc['lang'] != 'en'):
        continue
    
    # Filter out sections that contain no reference tags
    filtered_sections = []
    for section in doc.get('sections', []):
        pars_joined = ''.join(section.get('paragraphs', []))
        if re.search(r'<ref.*?</ref>', pars_joined):
            filtered_sections.append(section)
            total_paragraphs_count += len(section.get('paragraphs', []))
    
    doc['sections'] = filtered_sections

    # Re-check for insufficient length after filtering sections
    if len(doc['references']) < 5 or len(doc['sections']) < 3:
        continue
        
    cleaned_documents.append(doc)

print(f"Cleaning complete. {len(cleaned_documents)} documents remain after filtering.")
print(f"Total paragraphs in cleaned documents: {total_paragraphs_count}")


Filtering and cleaning documents...
Cleaning complete. 1 documents remain after filtering.
Total paragraphs in cleaned documents: 68


# # Step 5: Structure Data into Relational Format
The cleaned data is now transformed into a relational structure using Pandas DataFrames, with unique IDs for documents, paragraphs, and references.

In [5]:

print("Structuring data into relational tables...")

# Define DataFrame schemas
citation_columns = ['id', 'marker_location', 'type', 'cited_doc_id', 'paragraph_id']
paragraph_columns = ['id', 'text', 'section_id']
section_columns = ['id', 'section_title', 'document_id']
doc_columns = ['id', 'title', 'pub_year', 'authors', 'abstract']

citation_df = pd.DataFrame(columns=citation_columns)
parargraph_df = pd.DataFrame(columns=paragraph_columns)
section_df = pd.DataFrame(columns=section_columns)
document_df = pd.DataFrame(columns=doc_columns)

unknown_doc_id = str(uuid4())
document_df.loc[0] = [unknown_doc_id, 'unknown', 'unknown', 'unknown', 'unknown']

# Initialize a lookup dictionary for faster document checks
doc_lookup_dict = {}

# Initialize MWE Tokenizer for locating reference tags
tokenizer = MWETokenizer(separator='')
tokenizer.add_mwe(('[', 'REF', ']'))

for doc_data in cleaned_documents:
    # Get/create UUID for the main document
    doc_id = get_uuid_of_doc(doc_data['title'], doc_data['pub_year'], concat_auth(doc_data['authors']), doc_data['abstract'], document_df, doc_lookup_dict)
    
    # Create a dictionary mapping reference IDs (e.g., 'b23') to cited document UUIDs
    ref_id_targets = {}
    for ref in doc_data['references']:
        cited_doc_uuid = get_uuid_of_doc(ref['title'], ref['pub_year'], concat_auth(ref['authors']), None, document_df, doc_lookup_dict)
        ref_id_targets[ref['id']] = cited_doc_uuid

    # Process paragraphs and references
    for section in doc_data['sections']:
        sec_id = str(uuid4())
        section_df.loc[len(section_df)] = [sec_id, section['section_name'], doc_id]
        for paragraph_text in section['paragraphs']:
            par_id = str(uuid4())
            
            # Find all original <ref> tags and replace them with a placeholder
            references_xml = re.findall(r'(<ref.*?</ref>)', paragraph_text)
            cleaned_text = re.sub(r'(<ref.*?</ref>)', ' [REF] ', paragraph_text)
            cleaned_text = re.sub(r';', ',', cleaned_text) # Normalize semicolons
            
            # Tokenize and find the locations of the placeholders
            tokenized_text = tokenizer.tokenize(cleaned_text.split())
            marker_locations = [i for i, token in enumerate(tokenized_text) if '[REF]' in token]

            # If counts match, replace placeholders with cleaned ref tags
            if len(marker_locations) == len(references_xml):
                for i, loc in enumerate(marker_locations):
                    tokenized_text[loc] = re.sub(r' target=".*?"', '', references_xml[i])
            
            # Add paragraph to DataFrame
            parargraph_df.loc[len(parargraph_df)] = [par_id, ';'.join(tokenized_text), sec_id]

            # Add references to DataFrame
            for i, ref_xml in enumerate(references_xml):
                match = re.search(r'type="(.*?)" target="(.*?)"', ref_xml)
                if match:
                    ref_type = match.group(1)
                    target_ids = [m.replace('#', '') for m in match.group(2).split(';')]
                    cited_doc_uuids = [ref_id_targets.get(ref_id, unknown_doc_id) for ref_id in target_ids]
                    
                    new_ref = {
                        'id': str(uuid4()),
                        'marker_location': marker_locations[i],
                        'type': ref_type,
                        'cited_doc_id': ';'.join(cited_doc_uuids),
                        'paragraph_id': par_id
                    }
                    citation_df.loc[len(citation_df)] = new_ref

print("Structuring complete.")
print(f"Created {len(document_df)} document entries.")
print(f"Created {len(parargraph_df)} paragraph entries.")
print(f"Created {len(citation_df)} reference entries.")

Structuring data into relational tables...
Structuring complete.
Created 37 document entries.
Created 68 paragraph entries.
Created 58 reference entries.


# # Step 7: Normalize Tables for Database Import
This final step creates a dedicated linking table for cited documents to ensure the data is in a normalized form, which is better for relational databases.

In [6]:
print("Normalizing tables...")

cited_doc_df = pd.DataFrame(columns=['citation_id', 'document_id'])
rows_to_add = []

for _, row in citation_df.iterrows():
    for cited_doc_id in row['cited_doc_id'].split(';'):
        rows_to_add.append({'citation_id': row['id'], 'document_id': cited_doc_id})

if rows_to_add:
    cited_doc_df = pd.DataFrame(rows_to_add)

cited_doc_df.drop_duplicates(inplace=True)

# Drop the now-redundant column from the original ref_df
citation_df.drop(columns=['cited_doc_id'], inplace=True)

print(f"Created linking table with {len(cited_doc_df)} entries.")

Normalizing tables...
Created linking table with 89 entries.


# # Step 8: Create Annotation Tasks

In [10]:
annotations_df = pd.DataFrame(columns=['user_id', 'citation_id'])
user_ids = {
    'max':'17a7019f-4eb1-40fb-8c66-e2953af5f4be',
    'erika':'907f4be7-ac0e-480c-8e51-a99f52011653',
    }

user_id_list = user_ids.values()

for par_id in parargraph_df['id'].to_list():
    par_cit_ids = citation_df[citation_df['paragraph_id'] == par_id]['id'].to_list()
    if not par_cit_ids:continue
    ref_id = random.sample(par_cit_ids,1)[0]
    annotations_df.loc[len(annotations_df)] = [user_ids['max'], ref_id]
    annotations_df.loc[len(annotations_df)] = [user_ids['erika'], ref_id]

# # Step 9: Save Final Output
The processed and structured DataFrames are saved to CSV files in the output directory.

In [None]:

print(f"Saving final CSV files to {FINAL_OUTPUT_DIR}...")

citation_df.to_csv(os.path.join(FINAL_OUTPUT_DIR, 'db_citations.csv'), index=False)
parargraph_df.to_csv(os.path.join(FINAL_OUTPUT_DIR, 'db_paragrpahs.csv'), index=False)
section_df.to_csv(os.path.join(FINAL_OUTPUT_DIR, 'db_sections.csv'), index=False)
document_df.to_csv(os.path.join(FINAL_OUTPUT_DIR, 'db_documents.csv'), index=False)
cited_doc_df.to_csv(os.path.join(FINAL_OUTPUT_DIR, 'db_cited_documents.csv'), index=False)
annotations_df.to_csv(os.path.join(FINAL_OUTPUT_DIR, 'db_annotations.csv'), index=False)

print("Pipeline finished successfully!")

Saving final CSV files to ./database_ready...
Pipeline finished successfully!
