In [22]:
import json
import os
import re
from uuid import uuid4

from docx import Document

In [2]:
!pwd

/c/projects/ask_my_resume


In [8]:
file_path = "Resume_Nida Madina Khan.docx"
# file_path = "../data/resume.docx"


def extract_text_from_docx(file_path):
    """Return the text of a .docx file as a single newline-joined string.

    Opens the file with python-docx's `Document` and concatenates the `.text`
    of every entry in `doc.paragraphs`, in document order.

    NOTE(review): only `doc.paragraphs` is read; text stored elsewhere in the
    document (e.g. inside tables) is presumably not included - confirm
    against the python-docx documentation if the resume layout uses them.
    """
    paragraphs = Document(file_path).paragraphs
    lines = []
    for paragraph in paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)


resume_txt = extract_text_from_docx(file_path)

In [20]:
# Preview the raw text extracted from the .docx before any sectioning.
print(resume_txt)

Nida Madina Khan

Work Experience
Securiti, Karachi, Pakistan	11/2024 – Current
Data Scientist	
• Developed AI and Machine Learning solutions to process sensitive data securely, managing the full ML lifecycle from data collection and preparation to model training, evaluation, and production-readiness.
• Built a CNN-based model to detect the header row location within semi-structured tabular files leveraging sentence embeddings and cosine similarity, achieving 96% accuracy, <250ms inference latency, and a broader context window compared to the previous system.
• Integrated PyTorch models into a Java-based production pipeline for performance testing, collaborating closely with engineering teams to ensure seamless deployment.
• Fine-tuned LLaMA 3 using PEFT (LoRA) to classify personal data (PD) types of columns in structured tables, achieving ~92% accuracy across 15+ PD types, and enabling classification of newly added PD types during inference.
• Iteratively improved model accuracy throu

In [23]:
# Step 2: Split text into sections using known headings
def split_sections(text):
    """Split resume text into sections keyed by a fixed list of headings.

    A line opens a new section only when, after stripping surrounding
    whitespace, it is exactly one of the known headings. Matching whole lines
    (rather than searching for the heading words anywhere in the text) keeps a
    word such as "Projects" or "Education" inside a sentence from cutting a
    section apart.

    Args:
        text: Full resume text, with lines separated by newline characters.

    Returns:
        A dict mapping each heading that was found to its content string.
        The content begins with the heading line itself, has leading and
        trailing whitespace stripped, and ends with a single newline.
        Text that appears before the first heading is discarded. If the same
        heading occurs more than once, the contents are concatenated in order
        of appearance instead of the later one replacing the earlier one.
    """
    section_headers = ["Work Experience", "Projects", "Skills", "Education"]

    # Each segment is (heading, [lines]); the heading line is kept as the
    # first line so the content format matches what downstream chunking
    # already receives.
    segments = []
    for line in text.split("\n"):
        heading = line.strip()
        if heading in section_headers:
            segments.append((heading, [line]))
        elif segments:
            # Lines before the first heading have no section and are dropped.
            segments[-1][1].append(line)

    section_map = {}
    for heading, lines in segments:
        body = "\n".join(lines).strip() + "\n"
        # Append rather than overwrite so a repeated heading loses nothing.
        section_map[heading] = section_map.get(heading, "") + body

    return section_map

In [32]:
# Sanity check: show the content captured under the 'Projects' heading.
print(split_sections(resume_txt)['Projects'])

Projects
• End-to-end PySpark data pipeline to process large-scale taxi trip data. Used Spark local mode for prototyping and deployed the final pipeline on a single-node Dataproc cluster on GCP. Ingested Parquet-formatted data from GCS, standardized schemas, and wrote monthly aggregated metrics to BigQuery for reporting and analysis.
• Machine Learning Pipeline Automation: Deployed an automated, end-to-end machine learning pipeline in R, encompassing data preprocessing, model training, validation, and deployment phases, greatly enhancing productivity.



In [33]:
# Step 3: Chunk content inside each section
def chunk_section_content(section_name, content, min_length=30):
    """Break one section's text into chunk records.

    The content is split wherever two or more consecutive newlines occur or a
    bullet character ("•") appears. Each piece is whitespace-stripped and kept
    only if it is strictly longer than `min_length` characters.

    Args:
        section_name: Heading the content belongs to; stored in each chunk's
            metadata under "section".
        content: Text of the section. Empty or None yields no chunks.
        min_length: Pieces whose stripped length is less than or equal to this
            value are skipped. Defaults to 30.

    Returns:
        A list of dicts with keys "id" (a fresh uuid4 string), "text" (the
        stripped piece) and "metadata" ({"section": section_name}), in the
        order the pieces appear in `content`.
    """
    if not content:
        return []

    chunks = []
    # Split by double newlines or bullet points
    for raw in re.split(r"\n{2,}|•", content):
        text = raw.strip()
        if len(text) <= min_length:
            # Skip very short pieces (headings, stray fragments).
            continue
        chunks.append({
            "id": str(uuid4()),
            "text": text,
            "metadata": {
                "section": section_name
            }
        })
    return chunks

In [34]:
# Step 4: Combine everything
def parse_resume_to_chunks(file_path):
    """Run the full pipeline on a .docx resume and return a flat chunk list.

    Extracts the document text, splits it into sections, chunks each
    section's content, and concatenates the per-section chunk lists in the
    order the sections appear in the section map.
    """
    sections = split_sections(extract_text_from_docx(file_path))
    return [
        chunk
        for name, body in sections.items()
        for chunk in chunk_section_content(name, body)
    ]

In [46]:
# Step 5: Run the pipeline and preview the result
file_path = "Resume_Nida Madina Khan.docx"
resume_chunks = parse_resume_to_chunks(file_path)

# Print only the first three chunks as a sample of the output.
for chunk in resume_chunks[:3]:
    section_label = chunk["metadata"]["section"]
    print(f"\n--- Chunk from {section_label} ---")
    print(chunk["text"])


--- Chunk from Work Experience ---
Work Experience
Securiti, Karachi, Pakistan	11/2024 – Current
Data Scientist

--- Chunk from Work Experience ---
Developed AI and Machine Learning solutions to process sensitive data securely, managing the full ML lifecycle from data collection and preparation to model training, evaluation, and production-readiness.

--- Chunk from Work Experience ---
Built a CNN-based model to detect the header row location within semi-structured tabular files leveraging sentence embeddings and cosine similarity, achieving 96% accuracy, <250ms inference latency, and a broader context window compared to the previous system.


In [47]:
# Persist the chunks as JSON. `json` is imported in the notebook's top import
# cell so that all dependencies are declared in one place.
output_file = "resume.json"
# output_file = "../data/resume.json"

# ensure_ascii=False writes non-ASCII characters from the resume (e.g. "–")
# as-is instead of \uXXXX escapes; the file is opened as UTF-8 to match.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(resume_chunks, f, ensure_ascii=False, indent=2)

print(f"Saved {len(resume_chunks)} chunks to {output_file}")

Saved 19 chunks to resume.json


In [None]:
# # To load the saved chunks back later, uncomment the lines below
# # (the file name matches the one written by the save cell above):
# with open("resume.json", "r", encoding="utf-8") as f:
#     loaded_chunks = json.load(f)

In [38]:
# Bare expression: display the full list of chunk dicts as the cell output.
resume_chunks

[{'id': '79d65981-8a13-477c-aae8-78c33b5f3dbb',
  'text': 'Work Experience\nSecuriti, Karachi, Pakistan\t11/2024 – Current\nData Scientist',
  'metadata': {'section': 'Work Experience'}},
 {'id': 'e6765195-3a24-4850-9f2c-4e7b62a8b680',
  'text': 'Developed AI and Machine Learning solutions to process sensitive data securely, managing the full ML lifecycle from data collection and preparation to model training, evaluation, and production-readiness.',
  'metadata': {'section': 'Work Experience'}},
 {'id': 'e4fe21c0-43f7-451a-a323-5a5b26468bcd',
  'text': 'Built a CNN-based model to detect the header row location within semi-structured tabular files leveraging sentence embeddings and cosine similarity, achieving 96% accuracy, <250ms inference latency, and a broader context window compared to the previous system.',
  'metadata': {'section': 'Work Experience'}},
 {'id': 'a3ebe70f-0779-482a-b4af-208cec3e20a4',
  'text': 'Integrated PyTorch models into a Java-based production pipeline for per