In [173]:
import json
import time
from pathlib import Path

import yaml

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

sources = [
    # "tests/data/2305.03393v1-pg9-img.png",
    "data/mah.pdf",
]

## Use experimental VlmPipeline
pipeline_options = VlmPipelineOptions()
# If force_backend_text = True, text from backend will be used instead of generated text
pipeline_options.force_backend_text = True

## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
pipeline_options.vlm_options = smoldocling_vlm_conversion_options

#pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
        InputFormat.IMAGE: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)

out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)

for source in sources:
    start_time = time.time()
    print("================================================")
    print("Processing... {}".format(source))
    print("================================================")
    print("")

    res = converter.convert(source)
    print("")
    print(res.document.export_to_markdown())
    res.document.save_as_html(
        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
        image_mode=ImageRefMode.REFERENCED,
        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
    )
    res.document.save_as_markdown(
        out_path / f"{res.input.file.stem}.md",
        image_mode=ImageRefMode.PLACEHOLDER,
    )

    pg_num = res.document.num_pages()
    print("")
    inference_time = time.time() - start_time
    print(
        f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
    )

Processing... data/mah.pdf



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



## 	 Education	

## 

## 

## 

## 

## 

## 

## 

## 

## 

## 

## 

## 

## 

## 	 Education	

  ●    B.Tech( Computer Science and Engineering)  -8.34 CGPA   (2020) 

Predicted page in DOCTAGS:
Assistant: <doctag><section_header_level_1><loc_58><loc_63><loc_225><loc_77>Kapuluru Mahendra Reddy</section_header_level_1>
<text><loc_58><loc_80><loc_196><loc_90>Machine Learning Engineer</text>
<text><loc_58><loc_93><loc_236><loc_102>www.linkedin.com/in/mahendrareddykapuluru</text>
<section_header_level_1><loc_58><loc_118><loc_169><loc_127>Professional Summary</section_header_level_1>
<unordered_list><list_item><loc_74><loc_135><loc_437><loc_163>· Results-driven ml Engineer with 4+ years of experience in automation, data science, machine learning , and networking solutions. Strong background in designing end-to-end ML solutions, MLOps , and cybersecurity threat detection.</list_item>
<list_item><loc_74><loc_164><loc_437><loc_192>· Expertise in Python automation, data engineering, and sca

In [5]:
from dotenv import load_dotenv

load_dotenv('../.env')

True

In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm_gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [10]:
from prompts import *

In [28]:
text_ = ''
for i in clean_text.split('\n'):
    j = i
    if j.strip() != '':
        text_ += i +'\n'
print(text_)
        

## Mohan Reddy Pallavula
linkedin/mohanreddypallavula github/mohanreddypallavula
Experience
## · Matdun Labs India Pvt. Ltd
Remote
Dec 2021 - present
- AI Engineer (Full-time)
- ◦ Face Recognition system : Developed an advanced face recognition system utilizing SOTA based models for detecting and recognition the face and deployed on edge device (jetson nano) and kubernetes cluster (Nvidia Gpus). Optimized the models using tensorrt to reduce latency and Used Nvidia Triton server for dynamic batching, GPU and CPU optimization, and robust scalability, ensuring efficient and high-performance inference for deployed AI models. Tech: Tensorrt , Nvidia Jetson Nano , Kubernetes , Docker , FastAPI , Django , Web sockets , Grpc ,Pytorch , Opencv , Scikit-learn , PostgresSQL , Azure blob , GIT
- ◦ Video Analytics System : Delevoped an advanced AI-powered solution designed for real-time monitoring and analysis of video streams. It offers features such as person tracking, which enables precise ident

In [77]:
def get_resume_chunking_prompt(resume_text: str) -> str:
    prompt = f"""
You are an intelligent document parser designed to preprocess resumes for a Retrieval-Augmented Generation (RAG) system.
Your task is to intelligently chunk the given raw resume content into well-defined sections for semantic retrieval. Focus on structuring the data meaningfully and consistently so that downstream models can retrieve relevant information efficiently.
Specifically, divide the resume into the following labeled sections:
1. Personal Details
   - Full name, email, phone number, LinkedIn, GitHub, portfolio links, address (if present).
2. Education
   - Degrees, institutions, years of study, relevant coursework or achievements.
3. Work Experience
   - Job titles, company names, duration, responsibilities, technologies used, accomplishments.
4. Skills
   - List of technical, soft, or domain-specific skills.
5. Projects
   - Project titles, descriptions, tech stack, roles played, outcomes.
6. Certifications / Awards / Achievements (optional section)
   - Any notable recognitions, certificates, honors.
7. Publications / Research (optional section)
   - Any academic publications or relevant research contributions.
---
🔍 Guidelines for Output Formatting:
- Label each section clearly with headers.
- Preserve bullet points, dates, and formatting that help maintain semantic meaning.
- If a section is missing or not clearly identifiable, note it as "Not Found".
- Ensure each chunk is semantically meaningful and self-contained for downstream use in retrieval.
- Prefer consistent formatting, such as bullet points or structured lists where appropriate.
---
✅ Output Format Example:
### Personal Details:
Name: Jane Doe  
Email: jane.doe@email.com  
Phone: +1-123-456-7890  
LinkedIn: linkedin.com/in/janedoe  
GitHub: github.com/janedoe
### Education:
- B.Sc. in Computer Science, MIT (2016–2020)  
  Relevant Coursework: Algorithms, Machine Learning  
### Work Experience:
- Software Engineer at Google (2021–Present)  
  • Built scalable microservices in Go and Python  
  • Improved system latency by 30%
### Skills:
Python, Java, Docker, AWS, SQL, Leadership, Agile
### Projects:
- Resume Parser App  
  • Built using Python and spaCy  
  • Extracted structured data from raw resume PDFs
### Certifications / Awards / Achievements:
- AWS Certified Solutions Architect  
- Winner, HackMIT 2019
### Publications / Research:
Not Found
---
Now parse the following resume content accordingly:
{resume_text} 
"""
    return prompt


In [185]:

f = open('scratch/kr.md', 'r')

k1 = f.read()

In [186]:
text_ = ''
for i in k1.split('\n'):
    j = i
    if j.strip() != '':
        text_ += i +'\n'
print(text_)
        

## Koushik M A
9742836226  
koushikma62@gmail.com
linkedin.com/in/koushik-ma-6378001b7  
## PROFILE
Experienced Data Analytics Consultant with 4+ years in data visualization, product development, and some  exposure to data engineering and data science. Skilled at turning data into clear insights and building easy- to-use visual tools to support business goals. Strong problem-solver with a focus on delivering effective,  data-driven solutions.
## PROFESSIONAL EXPERIENCE
Mathco (TheMathCompany), 
Product Engineer - II (Data Visualisation Using Python)
- • Guided multiple teams through the product development process, focusing on data visualization and  simplifying complex data science methodologies for better understanding.
- • Collaborated closely with customers to ensure that the products aligned with their business needs and  were easy to interpret.
- • Successfully converted 2 proof-of-concept (POC) projects into full-time engagements for clients within a  short timeframe, driving si

In [175]:
msg = get_resume_chunking_prompt(text_)

In [187]:
print(msg)


You are an intelligent document parser designed to preprocess resumes for a Retrieval-Augmented Generation (RAG) system.
Your task is to intelligently chunk the given raw resume content into well-defined sections for semantic retrieval. Focus on structuring the data meaningfully and consistently so that downstream models can retrieve relevant information efficiently.
Specifically, divide the resume into the following labeled sections:
1. Personal Details
   - Full name, email, phone number, LinkedIn, GitHub, portfolio links, address (if present).
2. Education
   - Degrees, institutions, years of study, relevant coursework or achievements.
3. Work Experience
   - Job titles, company names, duration, responsibilities, technologies used, accomplishments.
4. Skills
   - List of technical, soft, or domain-specific skills.
5. Projects
   - Project titles, descriptions, tech stack, roles played, outcomes.
6. Certifications / Awards / Achievements (optional section)
   - Any notable recogniti

In [176]:
res1 = llm_gemini.invoke(msg)

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised DeadlineExceeded: 504 Deadline Exceeded.


In [177]:
print(res1.content)

### Personal Details:
Name: Koushik M A
Email: koushikma62@gmail.com
Phone: 9742836226
LinkedIn: linkedin.com/in/koushik-ma-6378001b7

### Education:
- University Visvesvaraya College of Engineering, BE in Computer Science & Engineering; 72.07 Aggregate (Aug 2016 - Sep 2020)

### Work Experience:
- Mathco (TheMathCompany), Product Engineer - II (Data Visualisation Using Python) (Jul 2024 - present)
  • Guided multiple teams through the product development process, focusing on data visualization and simplifying complex data science methodologies for better understanding.
  • Collaborated closely with customers to ensure that the products aligned with their business needs and were easy to interpret.
  • Successfully converted 2 proof-of-concept (POC) projects into full-time engagements for clients within a short timeframe, driving significant business growth to our organization.
- Mathco (TheMathCompany), Associate (Jul 2022 - Jun 2024)
  • Worked with Fortune 500 clients to demonstrate 

In [178]:
from typing import List

resume_info_meta = {  'personal_details': 'personal',
                 'education': 'education',
                 'work_experience:': 'work experience',
                 'skills': 'skills',
                 'projects': 'projects',
                 'certifications_awards_achievements': ['certifications' ,'awards','achievements'],
                 'publications_research': ['publications', 'research']}
resume_info_actual = {}
for i in res1.content.split('###'):
    meta_ = i.strip().split('\n')[0]
    find = "none"
    for info in resume_info_meta:
        if isinstance(resume_info_meta[info],List) :
            for key_meta in resume_info_meta[info]:
                if key_meta in meta_.lower():
                    find = info
                    print(find)
                    break
            if find != "none":
                break
        elif isinstance(resume_info_meta[info],str):
            key_meta = resume_info_meta[info]
            if key_meta in meta_.lower():
                find = info
        if find != "none":
            break
    if find in resume_info_meta:
        resume_info_actual[find] = i.strip()

certifications_awards_achievements
publications_research


In [179]:
for i in resume_info_actual:
    print(i,'^'*10)
    print(resume_info_actual[i])
    print('-'*100)

personal_details ^^^^^^^^^^
Personal Details:
Name: Koushik M A
Email: koushikma62@gmail.com
Phone: 9742836226
LinkedIn: linkedin.com/in/koushik-ma-6378001b7
----------------------------------------------------------------------------------------------------
education ^^^^^^^^^^
Education:
- University Visvesvaraya College of Engineering, BE in Computer Science & Engineering; 72.07 Aggregate (Aug 2016 - Sep 2020)
----------------------------------------------------------------------------------------------------
work_experience: ^^^^^^^^^^
Work Experience:
- Mathco (TheMathCompany), Product Engineer - II (Data Visualisation Using Python) (Jul 2024 - present)
  • Guided multiple teams through the product development process, focusing on data visualization and simplifying complex data science methodologies for better understanding.
  • Collaborated closely with customers to ensure that the products aligned with their business needs and were easy to interpret.
  • Successfully converted 2

In [180]:
import json
with open('data/kr.json', 'w') as f:
    json.dump(resume_info_actual, f)

In [188]:
json.load(open('data/kr.json'))

{'personal_details': 'Personal Details:\nName: Koushik M A\nEmail: koushikma62@gmail.com\nPhone: 9742836226\nLinkedIn: linkedin.com/in/koushik-ma-6378001b7',
 'education': 'Education:\n- University Visvesvaraya College of Engineering, BE in Computer Science & Engineering; 72.07 Aggregate (Aug 2016 - Sep 2020)',
 'work_experience:': 'Work Experience:\n- Mathco (TheMathCompany), Product Engineer - II (Data Visualisation Using Python) (Jul 2024 - present)\n  • Guided multiple teams through the product development process, focusing on data visualization and simplifying complex data science methodologies for better understanding.\n  • Collaborated closely with customers to ensure that the products aligned with their business needs and were easy to interpret.\n  • Successfully converted 2 proof-of-concept (POC) projects into full-time engagements for clients within a short timeframe, driving significant business growth to our organization.\n- Mathco (TheMathCompany), Associate (Jul 2022 - Ju