In [3]:
import os
import base64
import pandas as pd
from dotenv import load_dotenv
from typing import List, Optional
from pydantic import BaseModel, Field, ConfigDict
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [4]:
# Load variables from a local .env file (if present) into the process environment.
# NOTE(review): presumably this supplies the Google API key used by the Gemini client — confirm.
load_dotenv()

# Project root. Set SERVICE_RECORD_BASE_DIR (in the environment or in .env) to run
# the notebook on another machine; the fallback keeps the original location.
BASE_DIR = os.environ.get("SERVICE_RECORD_BASE_DIR", r"D:\mridul\Scraping Assessment\Task 2")
# Poppler binaries handed to pdf2image via `poppler_path`.
POPPLER_BIN = os.path.join(BASE_DIR, "poppler", "Library", "bin")
# Source PDF to extract from.
PDF_PATH = os.path.join(BASE_DIR, "Service Record.pdf")
# Scratch folder for rendered page images and OCR JSON dumps.
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# OCR engine: the document-orientation, unwarping and text-line-orientation
# options are all switched off, and inference is requested on the GPU.
ocr_engine = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
    device="gpu",
)

# Gemini chat model; temperature=0 to minimise sampling randomness.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\mridu\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\mridu\.paddlex\official_models\PP-OCRv5_server_rec`.[0m


In [6]:
class ServiceEntry(BaseModel):
    """One row of the service-record table.

    Every attribute carries an alias equal to the exact CSV header it is
    exported under. ``populate_by_name=True`` allows the model to be populated
    with either the Python attribute name or that alias.
    """

    Full_Name: str = Field(alias="Full Name")
    # Optional columns default to None: in Pydantic v2 an ``Optional[...]``
    # annotation without a default is still a *required* field, so a row that
    # simply omits the key would otherwise fail validation.
    Educational_Qualification: Optional[str] = Field(default=None, alias="Educational Qualification")
    Honorific_Title: Optional[str] = Field(default=None, alias="Honorific/Title")
    Date_of_Birth: str = Field(alias="Date of Birth")
    Date_of_Joining_Service: str = Field(alias="Date of Joining Service")
    Date_of_Arrival: str = Field(alias="Date of Arrival")
    Voted_Non_voted: str = Field(alias="Voted/Non-voted")
    Domicile: str = Field(alias="Domicile")
    Station: str = Field(alias="Station")
    Substantive_Appointment: str = Field(alias="Substantive Appointment")
    Subst_Date: str = Field(alias="Subst. Date")
    Officiating_Appointment: Optional[str] = Field(default=None, alias="Officiating Appointment")
    Off_Date: Optional[str] = Field(default=None, alias="Off. Date")

    model_config = ConfigDict(populate_by_name=True)

In [7]:
class ServiceRecordDataset(BaseModel):
    """Container schema for the structured LLM output: a list of ServiceEntry rows."""

    entries: List[ServiceEntry]

In [8]:
# Wrap the chat model so that its responses are returned as ServiceRecordDataset objects.
structured_llm = llm.with_structured_output(ServiceRecordDataset)

In [26]:
def get_page_ocr_text(img_path):
    """Run the OCR engine on one image and return its text as a single string.

    For every result yielded by ``ocr_engine.predict`` the strings found under
    the ``'rec_texts'`` key (when that key holds a list) are collected, and the
    result is also dumped as JSON into ``OUTPUT_DIR``. The number of collected
    lines is printed, and the lines are returned joined by newlines.
    """
    collected_lines = []

    for page_result in ocr_engine.predict(img_path):
        has_text_list = 'rec_texts' in page_result and isinstance(page_result['rec_texts'], list)
        if has_text_list:
            collected_lines += page_result['rec_texts']

        # Keep the raw OCR result on disk for debugging.
        page_result.save_to_json(OUTPUT_DIR)

    joined_text = "\n".join(collected_lines)
    print(f"Extracted {len(collected_lines)} lines of text.")
    return joined_text

In [35]:
def _format_continuation_context(previous_context):
    """Render the previous page's last row as the continuation note placed in the prompt."""
    if not previous_context:
        return "START OF NEW RECORD: No previous context."
    # Identity signature of the officer whose record may continue on this page.
    return (
        f"CONTINUING RECORD: The current officer is '{previous_context.get('Full Name')}'.\n"
        f"- Education: '{previous_context.get('Educational Qualification')}'\n"
        f"- Title: '{previous_context.get('Honorific/Title')}'\n"
        f"- DOB: '{previous_context.get('Date of Birth')}'\n"
        f"- Joining Date: '{previous_context.get('Date of Joining Service')}'\n"
        f"- Arrival Date: '{previous_context.get('Date of Arrival')}'\n"
        f"- Voted Status: '{previous_context.get('Voted/Non-voted')}'\n"
        f"- Domicile: '{previous_context.get('Domicile')}'"
    )


def process_single_page(page_index, previous_context=None, include_image=False):
    """Extract the service-record rows of one PDF page.

    The page is rendered to a PNG in ``OUTPUT_DIR``, OCR'd, and the OCR text is
    sent to the structured LLM together with the context of the previous page
    so that records spanning a page break keep their officer details.

    Args:
        page_index: Zero-based page index (page ``page_index + 1`` of the PDF).
        previous_context: Dict of the last row extracted from the previous
            page, keyed by CSV header, or None when there is nothing to carry over.
        include_image: When True, the rendered PNG is additionally attached to
            the message as a base64 data URL. Defaults to False (text-only
            prompt), so the file is no longer read and encoded for nothing.

    Returns:
        A list of dicts keyed by CSV header (field aliases). An empty list is
        returned when no image could be rendered or when the LLM call raised.
    """
    images = convert_from_path(PDF_PATH, dpi=300, first_page=page_index + 1,
                               last_page=page_index + 1, poppler_path=POPPLER_BIN)
    if not images:
        return []

    img_path = os.path.join(OUTPUT_DIR, f"temp_p{page_index}.png")
    images[0].save(img_path)

    raw_text = get_page_ocr_text(img_path)
    context_str = _format_continuation_context(previous_context)

    # The prompt no longer names specific row numbers: that reference was only
    # valid for one particular page and was misleading on every other page.
    # NOTE(review): the '#' markers in front of the OCR section are sent to the
    # model verbatim (they are not Python comments) — confirm they are intended.
    prompt = f"""
    TASK: Extract service record entries into a structured dataset.
    
    IMPORTANT - PAGE CONTINUATION CONTEXT:
    {context_str}

    EXTRACTION INSTRUCTIONS FOR PAGE BREAKS:
    1. If this page starts with table rows without a new name header, you MUST use the 
       'CONTINUING RECORD' details provided above for every row.
    2. Fill 'Educational Qualification', 'Honorific/Title', 'Domicile', and 'Voted/Non-voted' 
       using the context above for all continued rows.
    3. Do NOT leave these fields empty for the continuation rows.
    4. Only change these values if a clear NEW officer header (different name) appears on the page.

    MANDATORY RULES:
    - Replace 'Do.', '..', or '"' with the value from the row directly above it.
    - Normalize all dates to DD-MM-YYYY (e.g., 8-4-80 -> 08-04-1880).
    - Ensure 'Full Name' is present in every single row.

    # OCR TEXT FOR SPATIAL REFERENCE:
    # {raw_text}
    """

    content = [{"type": "text", "text": prompt}]
    if include_image:
        with open(img_path, "rb") as f:
            img_base64 = base64.b64encode(f.read()).decode("utf-8")
        # The page is saved as PNG above, so the data URL must say image/png.
        content.append(
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}}
        )
    message = HumanMessage(content=content)

    try:
        response = structured_llm.invoke([message])
        return [e.model_dump(by_alias=True) for e in response.entries]
    except Exception as e:
        # Best-effort: a failing page is reported and skipped so the run can continue.
        print(f"Error on page {page_index+1}: {e}")
        return []

In [36]:
def run_extraction(start_p, end_p, output_filename="Service_Record_Dataset.csv"):
    """Extract pages ``start_p``..``end_p`` (1-based, inclusive) and save them as one CSV.

    Pages are processed in order; the last row of each page that produced rows
    is handed to the next page as context, so records spanning a page break
    keep their officer details.

    Args:
        start_p: First page to process (1-based).
        end_p: Last page to process (1-based, inclusive).
        output_filename: Name of the CSV written into ``BASE_DIR``.

    Returns:
        A DataFrame with the 13 CSV columns in their required order. When no
        rows were extracted at all, an empty DataFrame with those same columns
        is returned and no file is written.
    """
    # Final CSV column order.
    required_cols = [
        "Full Name", "Educational Qualification", "Honorific/Title", "Date of Birth",
        "Date of Joining Service", "Date of Arrival", "Voted/Non-voted", "Domicile",
        "Station", "Substantive Appointment", "Subst. Date", "Officiating Appointment", "Off. Date"
    ]

    all_rows = []
    last_page_context = None

    for i in range(start_p - 1, end_p):
        print(f"Processing Page {i+1}...")

        # Pass the context from the most recent page that produced rows.
        rows = process_single_page(i, previous_context=last_page_context)

        if not rows:
            # Make skipped pages visible instead of dropping them silently;
            # the previous context is kept for the next page.
            print(f"Warning: no rows extracted from page {i+1}.")
            continue

        all_rows.extend(rows)
        # The last entry of this page becomes the context for the next one.
        last_page_context = rows[-1]

    if not all_rows:
        print("No rows extracted; nothing was written.")
        # Keep the schema stable so callers can still select columns by header.
        return pd.DataFrame(columns=required_cols)

    df = pd.DataFrame(all_rows)[required_cols]

    df.to_csv(os.path.join(BASE_DIR, output_filename), index=False)
    print(f"Extraction Complete. Saved to {output_filename}")
    return df

In [37]:
# Trial run on pages 8-9 to check the context carry-over between consecutive pages.
test_results = run_extraction(start_p=8, end_p=9, output_filename="Page_8_9_ContextAware_Text_only.csv")
# A bare last expression gives the notebook's rich table display instead of a plain-text print.
test_results.head()

Processing Page 8...
Extracted 147 lines of text.
Processing Page 9...
Extracted 177 lines of text.
Extraction Complete. Saved to Page_8_9_ContextAware_Text_only.csv
                       Full Name Educational Qualification Honorific/Title  \
0  Abraham, Edgar Garton Furtado                      B.A.       Oon. C.B.   
1  Abraham, Edgar Garton Furtado                      B.A.       Oon. C.B.   
2  Abraham, Edgar Garton Furtado                      B.A.       Oon. C.B.   
3  Abraham, Edgar Garton Furtado                      B.A.       Oon. C.B.   
4  Abraham, Edgar Garton Furtado                      B.A.       Oon. C.B.   

  Date of Birth Date of Joining Service Date of Arrival Voted/Non-voted  \
0    08-04-1880              20-10-1904      28-11-1904       Non-voted   
1    08-04-1880              20-10-1904      28-11-1904       Non-voted   
2    08-04-1880              20-10-1904      28-11-1904       Non-voted   
3    08-04-1880              20-10-1904      28-11-1904       Non