In [None]:
# Importing main.py functions
from Rough.main import (
    setup_environment,
    process_resume,
    split_resume_into_sections,
    process_job_posting,
    extract_keywords,
    calculate_keyword_match,
    filter_relevant_keywords,
    enhance_section
)

from parsing_module import extract_headers_with_pdfplumber

setup_environment()


✅ Loaded NEW llm_api with OpenAI SDK v1.x syntax
✅ Environment variables loaded successfully.


In [2]:
resume_file = "docs/sample_resume.pdf"
job_input = """
[Big Data Tools Developer

We build, improve, and maintain one of the highest scaling platforms in the world. Our amazing team of Engineers work on next generation Big Data Platforms that transform how users connect with each other every single day. Yahoo's Big Data Platform drives some of the most demanding applications in the industry. The system handles billions of requests a day and runs on some of the largest Hadoop clusters ever built! 50,000 nodes strong and several multi-thousand node clusters bring scalable computing to a whole new level. We work on problems that cover a wide spectrum - from web services to operating systems and networking layers. Our biggest challenges ahead are designing efficient cloud native big data platforms.

Responsibilities:

Job Monitoring: Overseeing the execution of various data jobs, ensuring they adhere to SLAs and do not encounter issues.
Data Orchestration: Utilizing tools like Airflow to manage the scheduling, execution, and monitoring of data workflows across cloud platforms such as AWS and GCP.
Query Execution and Optimization: Designing and optimizing queries to run efficiently on platforms such as BigQuery, Hive, Pig, and Spark, ensuring high performance and scalability.
Integration and Support: Collaborating with different teams to integrate data flows, provide support for query executions, and handle credentials for secure data operations.
Feature Development: Implementing new features to support advanced query capabilities, including federated queries and lineage tracking.

Required Skills and Qualifications:

Educational Background: A Bachelor's or Master’s degree in Computer Science or equivalent work experience.
Programming Languages: Proficiency in Python is essential for scripting and workflow management; experience with Java and C++ is preferred for backend data operations.
Data Management: Knowledge of data structures, algorithms, and database management systems like SQL, HBase, and BigQuery.
Cloud Technologies: Experience with cloud services, especially AWS (EMR, Glue, S3) and GCP (Dataproc, BigQuery).
Agile Methodology: Comfortable working in an Agile environment with regular sprints, planning, and retrospectives.
System Design: Ability to design large-scale, distributed systems that are highly available and resilient.
OS: Some experience working with Linux/Unix operating systems

Preferred Qualifications:

Experience with development and deployment on public cloud platforms such as AWS, GCP, Azure, or others
Experiencing developing containerized applications and working with container orchestration services
Experience with Apache Hadoop, Presto, Hive, Oozie, Pig, Storm, Spark, Jupyter
Understanding of data structures & algorithms
Knowledge of JVM internals and its performance tuning
Excellent debugging/testing skills, and excellent analytical and problem solving skills
Experience with continuous integration tools such as Jenkins and Hudson
Strong verbal and written communication skills to collaborate effectively with cross-functional teams.]
"""

# Extract text from resume and job posting.
# Both results are kept as notebook-level names and reused further down in this cell.
resume_text = process_resume(resume_file)
job_text = process_job_posting(job_input)

# Parse sections from resume (the PDF path is handed over together with the extracted text).
# NOTE(review): the "[Fallback] ..." lines in this cell's captured output presumably
# originate from this call -- confirm against the splitter's implementation.
sections = split_resume_into_sections(resume_text, pdf_path=resume_file)

# DEBUG: Print pdfplumber headers directly
# In the captured run the headers came back as individual words
# ('Professional', 'Summary', ...) rather than whole heading lines.
pdf_headers = extract_headers_with_pdfplumber(resume_file)
print("\n[DEBUG] Headers from pdfplumber:", pdf_headers)


# Extract overall keywords
# NOTE: `job_keywords` is rebound to hand-written sample lists by later cells,
# so the value computed here only survives until one of those cells runs.
resume_keywords = extract_keywords(resume_text)
job_keywords = extract_keywords(job_text)

# Final status lines: confirm completion and list the section names that were found.
print("✅ Resume and job posting processed successfully.")
print(f"Found Resume Sections: {list(sections.keys())}")




[Fallback] No 'education' section found. Trying raw text extraction for education section...
[Fallback] No 'skills' section found. Trying raw text extraction for skills section...
[Fallback] No 'certifications' section found. Trying raw text extraction for certifications section...
[Fallback] No 'experience' section found. Trying raw text extraction for experience section...
[Fallback] No 'projects' section found. Trying raw text extraction for projects section...

[DEBUG] Headers from pdfplumber: ['Neal', 'Iyer', 'Professional', 'Summary', 'Skills', 'Experience', 'Experience', 'Education', 'and', 'Certifications', 'Projects']
✅ Resume and job posting processed successfully.
Found Resume Sections: ['summary', 'education', 'skills', 'certifications', 'experience', 'projects']


In [3]:
# === Mock GPT Function ===
def mock_gpt_enhancement(section_name, section_text, job_keywords):
    """
    Offline stand-in for the GPT rewrite step.

    Prints a short progress banner plus the keyword list, then returns the
    untouched section text prefixed with a bracketed marker line naming the
    section and the keywords. No model is called; swap in the real OpenAI
    request once it is available.
    """
    # Progress output: which section is being handled, and with which keywords.
    print(f"\n--- Enhancing: {section_name.upper()} ---")
    print(f"Relevant Keywords: {', '.join(job_keywords)}")

    # Stub result: marker line, one blank line, then the original text verbatim.
    marker = "[Improved {} section with keywords: {}]".format(
        section_name, ", ".join(job_keywords)
    )
    return "{}\n\n{}".format(marker, section_text)


# === Sample Data (Replace with your parsed_sections object) ===
parsed_sections = {
    "summary": """
Federal Data Analyst with 4+ years in human capital analytics, data-driven decision-making, and workforce planning. 
Skilled in SQL, Python, Tableau, and Power BI, with a track record of optimizing HR processes through data analysis, 
reporting automation, and visualization.
""".strip(),

    "skills": """
● Data Analytics & Visualization: SQL · Python · Tableau · Power BI · Advanced Excel · Advanced MS Office Suite  
● Human Capital & Workforce Analytics: HR Metrics · Hiring Pipeline Analytics · Employee Retention Analysis
""".strip()
}

# === Sample Keywords ===
job_keywords = [
    "SQL", "Python", "workforce analytics", "reporting automation", 
    "budget forecasting", "Power BI", "stakeholder communication"
]

# === Pull the two sections to enhance (empty string when a section is absent) ===
summary_text, skills_text = (
    parsed_sections.get(section_key, "") for section_key in ("summary", "skills")
)

# === Run the mock enhancer: summary first, then skills ===
enhanced_summary, enhanced_skills = (
    mock_gpt_enhancement(label, body, job_keywords)
    for label, body in (("summary", summary_text), ("skills", skills_text))
)

# === Show each result under its own banner ===
print("\n=== ENHANCED SUMMARY ===\n", enhanced_summary, sep="\n")

print("\n=== ENHANCED SKILLS ===\n", enhanced_skills, sep="\n")



--- Enhancing: SUMMARY ---
Relevant Keywords: SQL, Python, workforce analytics, reporting automation, budget forecasting, Power BI, stakeholder communication

--- Enhancing: SKILLS ---
Relevant Keywords: SQL, Python, workforce analytics, reporting automation, budget forecasting, Power BI, stakeholder communication

=== ENHANCED SUMMARY ===

[Improved summary section with keywords: SQL, Python, workforce analytics, reporting automation, budget forecasting, Power BI, stakeholder communication]

Federal Data Analyst with 4+ years in human capital analytics, data-driven decision-making, and workforce planning. 
Skilled in SQL, Python, Tableau, and Power BI, with a track record of optimizing HR processes through data analysis, 
reporting automation, and visualization.

=== ENHANCED SKILLS ===

[Improved skills section with keywords: SQL, Python, workforce analytics, reporting automation, budget forecasting, Power BI, stakeholder communication]

● Data Analytics & Visualization: SQL · Pytho

In [4]:
import os
import re

import pdfplumber
from dotenv import load_dotenv
from openai import OpenAI

# Read the environment file (if any) and build the OpenAI client from OPENAI_API_KEY.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Step 0: Raw resume text directly from PDF
pdf_path = "docs/sample_resume.pdf"
with pdfplumber.open(pdf_path) as pdf:
    # Extract every page exactly once; the text of a page is reused for both
    # the emptiness check and the join below instead of being extracted twice.
    page_texts = [page.extract_text() for page in pdf.pages]

# Pages without extractable text (None / empty string) are skipped.
full_resume_text = "\n".join(text for text in page_texts if text)

# Step 1: Naive keyword search for experience block
# NOTE(review): the pattern is not anchored to line starts, so it begins at the
# first occurrence of any start keyword anywhere in the text and ends at the
# first later occurrence of any stop keyword, even inside a sentence.
experience_match = re.search(
    r"(experience|work history|employment|professional experience)(.*?)"
    r"((education|projects|certifications|skills|summary))",
    full_resume_text,
    re.IGNORECASE | re.DOTALL,
)

if experience_match:
    # Group 2 is the text between the start keyword and the stop keyword.
    experience_text = experience_match.group(2).strip()
    print("[✔] Extracted experience section from raw text.\n")
else:
    print("[❌] Could not locate experience section in raw text.")
    experience_text = ""

# Show at most the first 1000 characters as a sanity check.
print("=== Experience Preview ===")
print(experience_text[:1000])

# === Step 1: Define GPT-enhancement function ===
def enhance_section_with_gpt(section_name, section_text, job_keywords, client=None, model="gpt-4"):
    """
    Uses GPT to rewrite a resume section (summary or skills).

    Args:
        section_name: Label of the section; it is upper-cased inside the prompt.
        section_text: Current text of the section, sent inside a fenced block.
        job_keywords: Iterable of keyword strings, joined with ", " in the prompt.
        client: Object exposing ``chat.completions.create`` (the OpenAI v1
            client). Required even though the default is ``None``.
        model: Model name forwarded to the API call.

    Returns:
        The model's reply with surrounding whitespace removed. When the reply
        carries no text (``message.content`` is ``None``), ``section_text`` is
        returned unchanged so the caller never loses the original section.

    Raises:
        ValueError: If ``client`` is ``None``.
    """
    if client is None:
        raise ValueError("OpenAI client must be provided.")

    prompt = (
        "You are an expert resume writer.\n\n"
        "Improve the following resume section for clarity, tone, and professionalism.\n"
        "Preserve any bullet formatting if it exists.\n\n"
        "Integrate the following job-relevant keywords naturally and only where appropriate:\n"
        f"{', '.join(job_keywords)}\n\n"
        f"Section to improve: {section_name.upper()}\n\n"
        f"```\n{section_text}\n```\n\n"
        "Respond only with the improved section text."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )

    content = response.choices[0].message.content
    if content is None:
        # The response schema allows a null content field; calling .strip()
        # on it would raise AttributeError, so keep the original text instead.
        return section_text

    return content.strip()


# === Step 2: Sample data ===
summary_text = """
Federal Data Analyst with 4+ years in human capital analytics, data-driven decision-making, and workforce planning. 
Skilled in SQL, Python, Tableau, and Power BI, with a track record of optimizing HR processes through data analysis, 
reporting automation, and visualization.
""".strip()

skills_text = """
● Data Analytics & Visualization: SQL · Python · Tableau · Power BI · Advanced Excel · Advanced MS Office Suite  
● Human Capital & Workforce Analytics: HR Metrics · Hiring Pipeline Analytics · Employee Retention Analysis
""".strip()

job_keywords = [
    "SQL", "Python", "workforce analytics", "reporting automation", 
    "budget forecasting", "Power BI", "stakeholder communication"
]


# === Step 3: Send both sections through GPT (summary first, then skills) ===
real_summary, real_skills = (
    enhance_section_with_gpt(label, body, job_keywords, client=client)
    for label, body in (("summary", summary_text), ("skills", skills_text))
)

# === Step 4: Print each rewritten section under its banner ===
print("\n=== REAL ENHANCED SUMMARY ===\n", real_summary, sep="\n")

print("\n=== REAL ENHANCED SKILLS ===\n", real_skills, sep="\n")


[✔] Extracted experience section from raw text.

=== Experience Preview ===
Substance Abuse and Mental Health Administration, Department of HHS
Budget Analyst Dec '23 - Mar '25
· Conducted data mining of critical human capital information across divisions to support strategic decision-making and
hiring status evaluations, assessing effectiveness through comprehensive data analysis procedures to enhance workforce
planning.
· Developed an Excel VBA-based incentive tracking system, integrating engagement analysis and survey design to assess
incentive award effectiveness. Implemented automated validation checks and dynamic data processing workflows,
increasing data accuracy by 85% and enhancing communication-engagement data insights for decision-making.
· Engineered a real-time Python-based payroll tracking system, integrating 10+ fund sources and 25 Lines of Accounting,
leveraging SQL for data extraction and pandas for analysis, reducing reconciliation time by ~2 hours/month and
improving

In [5]:
# === Import parsing functions from experience_splitter.py ===
from experience_splitter import split_experience_section, parse_job_entry

# === Get the experience section text (from resume parsing pipeline) ===
# NOTE(review): `parsed_sections` is whatever an earlier cell left behind. The
# sample dict built in the mock-enhancement cell only has "summary" and
# "skills", in which case this lookup yields "" -- verify the cell run order.
experience_text = parsed_sections.get("experience", "")

# === Parse experience section into job chunks ===
# Each chunk returned by the splitter is parsed into one job entry; the entries
# are later indexed with 'company', 'title', 'date_range' and 'bullets'.
chunks = split_experience_section(experience_text)
parsed_jobs = [parse_job_entry(chunk) for chunk in chunks]

# === GPT Enhancement Function (Still Inline for Now) ===
def enhance_job_entry_with_gpt(job, job_keywords, client=None, model="gpt-4"):
    """
    Rewrite the bullet points of one parsed job entry with a chat model.

    Args:
        job: Mapping with 'company', 'title', 'date_range' and 'bullets'
            (an iterable of strings, joined one per line in the prompt).
        job_keywords: Iterable of keyword strings, joined with ", " in the prompt.
        client: Object exposing ``chat.completions.create`` (the OpenAI v1
            client). Required even though the default is ``None``.
        model: Model name forwarded to the API call.

    Returns:
        The model's reply with surrounding whitespace removed. When the reply
        carries no text (``message.content`` is ``None``), the original
        bullets are returned, joined one per line.

    Raises:
        ValueError: If ``client`` is ``None``.
    """
    if client is None:
        raise ValueError("OpenAI client must be provided.")

    # Kept separately so it can double as the fallback result below.
    original_bullets = "\n".join(job['bullets'])

    context = (
        f"Company: {job['company']}\n"
        f"Title: {job['title']}\n"
        f"Dates: {job['date_range']}\n\n"
        f"Responsibilities:\n" + original_bullets
    )

    prompt = (
        "You are a professional resume writer.\n\n"
        "Improve the job description below by:\n"
        "- Enhancing clarity, conciseness, and tone\n"
        "- Preserving bullet formatting\n"
        "- Integrating the following job keywords naturally (only where relevant):\n"
        f"{', '.join(job_keywords)}\n\n"
        "Rewrite this job experience:\n\n"
        f"```\n{context}\n```\n\n"
        "Respond with just the improved bullet points, in bullet format."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )

    content = response.choices[0].message.content
    if content is None:
        # The response schema allows a null content field; calling .strip()
        # on it would raise AttributeError, so keep the original bullets.
        return original_bullets

    return content.strip()


# === Enhance First Job for Testing ===
# Guard against an empty parse result: indexing parsed_jobs[0] on an empty list
# would raise IndexError (e.g. when the experience text was blank).
if parsed_jobs:
    enhanced_job_1 = enhance_job_entry_with_gpt(parsed_jobs[0], job_keywords, client=client)

    print("\n=== ENHANCED JOB 1 ===\n")
    print(enhanced_job_1)
else:
    enhanced_job_1 = ""
    print("[❌] No job entries were parsed from the experience section; nothing to enhance.")


[Fallback] No 'education' section found. Trying raw text extraction for education section...
[Fallback] No 'skills' section found. Trying raw text extraction for skills section...
[Fallback] No 'experience' section found. Trying raw text extraction for experience section...
[Fallback] No 'certifications' section found. Trying raw text extraction for certifications section...
[Fallback] No 'projects' section found. Trying raw text extraction for projects section...

=== Top-Level Resume Sections Extracted ===

--- SUMMARY ---
Federal Data Analyst with 4+ years in human capital analytics, data-driven decision-making, and workforce planning. Skilled
in SQL, Python, Tableau, and Power BI, with a track record of optimizing HR processes through data analysis, reporting
automation, and visualization.
Professional Skills
● Data Analytics & Visualization: SQL · Python · Tableau · Power BI · Advanced Excel · Advanced MS Office Suite
● Human Capital & Workforce Analytics: HR Metrics · Hiring Pipe

In [6]:
import os
import re
import pdfplumber
from experience_splitter import split_experience_section, parse_job_entry

# === Load PDF ===
pdf_path = "docs/sample_resume.pdf"
if not os.path.exists(pdf_path):
    # Fail early with a readable message instead of a library-level error.
    raise FileNotFoundError(f"❌ Resume file not found at: {pdf_path}")

with pdfplumber.open(pdf_path) as pdf:
    # Extract every page exactly once; the text of a page is reused for both
    # the emptiness check and the join below instead of being extracted twice.
    page_texts = [page.extract_text() for page in pdf.pages]

# Pages without extractable text (None / empty string) are skipped.
full_resume_text = "\n".join(text for text in page_texts if text)

# === Extract experience section using fallback ===
# NOTE(review): the pattern is not anchored to line starts, so it begins at the
# first occurrence of any start keyword anywhere in the text and ends at the
# first later occurrence of any stop keyword, even inside a sentence.
experience_match = re.search(
    r"(experience|work history|employment|professional experience)(.*?)"
    r"((education|projects|certifications|skills|summary))",
    full_resume_text,
    re.IGNORECASE | re.DOTALL,
)

if experience_match:
    # Group 2 is the text between the start keyword and the stop keyword.
    experience_text = experience_match.group(2).strip()
    print("[✔] Extracted experience section from raw text.\n")
else:
    print("[❌] Could not locate experience section in raw text.")
    experience_text = ""

# === Parse into job chunks ===
chunks = split_experience_section(experience_text)
parsed_jobs = [parse_job_entry(chunk) for chunk in chunks]

# ✅ Remove any placeholder/empty job blocks
# (entries whose company or title is falsy are dropped)
parsed_jobs = [j for j in parsed_jobs if j["company"] and j["title"]]

# === GPT Enhancement per job ===
def enhance_job_entry_with_gpt(job, job_keywords, client=None, model="gpt-4"):
    """
    Rewrite the bullet points of one parsed job entry with a chat model.

    The prompt asks for 3-5 impact-focused bullets, varied phrasing, natural
    keyword integration and no invented accomplishments.

    Args:
        job: Mapping with 'company', 'title', 'date_range' and 'bullets'
            (an iterable of strings, joined one per line in the prompt).
        job_keywords: Iterable of keyword strings, joined with ", " in the prompt.
        client: Object exposing ``chat.completions.create`` (the OpenAI v1
            client). Required even though the default is ``None``.
        model: Model name forwarded to the API call.

    Returns:
        The model's reply with surrounding whitespace removed. When the reply
        carries no text (``message.content`` is ``None``), the original
        bullets are returned, joined one per line.

    Raises:
        ValueError: If ``client`` is ``None``.
    """
    if client is None:
        raise ValueError("OpenAI client must be provided.")

    # Kept separately so it can double as the fallback result below.
    original_bullets = "\n".join(job['bullets'])

    context = (
        f"Company: {job['company']}\n"
        f"Title: {job['title']}\n"
        f"Dates: {job['date_range']}\n\n"
        f"Responsibilities:\n" + original_bullets
    )

    prompt = (
        "You are a professional resume writer.\n\n"
        "Improve the job description below by:\n"
        "- Enhancing clarity, conciseness, and tone\n"
        "- Limiting to 3–5 strong, high-impact bullets per job\n"
        "- Avoiding repetitive phrasing (e.g., do not overuse 'leveraged', 'developed', etc.)\n"
        "- Preserving bullet formatting\n"
        "- Integrating the following job keywords naturally (only where relevant):\n"
        f"{', '.join(job_keywords)}\n"
        "- Ensuring each bullet clearly answers 'So what?' by showing impact, outcomes, or business value\n"
        "- Quantifying results where possible (e.g., time saved, accuracy improved, % growth)\n"
        "- Do not invent or exaggerate accomplishments. Only reword what is already in the resume.\n\n"
        "Rewrite this job experience:\n\n"
        f"```\n{context}\n```\n\n"
        "Respond with just the improved bullet points, in bullet format."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )

    content = response.choices[0].message.content
    if content is None:
        # The response schema allows a null content field; calling .strip()
        # on it would raise AttributeError, so keep the original bullets.
        return original_bullets

    return content.strip()

# === Enhance All Jobs ===
enhanced_jobs = []
for job in parsed_jobs:
    enhanced_bullets = enhance_job_entry_with_gpt(job, job_keywords, client=client)
    enhanced_jobs.append({
        "company": job["company"],
        "title": job["title"],
        "date_range": job["date_range"],
        "bullets": enhanced_bullets
    })

# === Format the final output ===
def format_enhanced_experience(enhanced_jobs):
    """
    Render enhanced job entries as one printable experience section.

    Each entry becomes a Company / Title / Dates header, a blank line and its
    'bullets' value; surrounding whitespace of every rendered entry is
    stripped and the entries are separated by a single blank line.
    """
    def _render(entry):
        # Same layout as a header block followed by the bullets and a trailing
        # newline; the final strip() removes that trailing whitespace again.
        lines = (
            f"Company: {entry['company']}",
            f"Title: {entry['title']}",
            f"Dates: {entry['date_range']}",
            "",
            f"{entry['bullets']}",
            "",
        )
        return "\n".join(lines).strip()

    return "\n\n".join(_render(entry) for entry in enhanced_jobs)

# === Display Results ===
# Build the combined section once, then print it beneath its banner.
full_enhanced_experience = format_enhanced_experience(enhanced_jobs)

print("\n=== FINAL ENHANCED EXPERIENCE SECTION ===\n", full_enhanced_experience, sep="\n")


[✔] Extracted experience section from raw text.


=== FINAL ENHANCED EXPERIENCE SECTION ===

Company: Substance Abuse and Mental Health Administration, Department of HHS
Title: Budget Analyst
Dates: Dec '23 - Mar '25

- Leveraged SQL and Python to mine human capital data across divisions, enhancing strategic decision-making and hiring evaluations, while improving workforce analytics.
- Created an Excel VBA-based incentive tracking system with automated validation checks, bolstering data accuracy by 85% and enriching communication-engagement insights.
- Developed a real-time payroll tracking system using Python and SQL, integrating multiple fund sources and lines of accounting, reducing reconciliation time by ~2 hours/month and enhancing payroll forecasting accuracy.
- Optimized SharePoint-based data management processes for financial reporting, improving workflow automation and document version control.
- Led financial forecasting and variance analysis for a $165M payroll budget, resul

In [7]:
from parsing_module import extract_text_pdfminer, split_resume_into_sections

# Same sample resume as in the earlier cells
pdf_path = "docs/sample_resume.pdf"

# Full resume text via the pdfminer-based extractor
resume_text = extract_text_pdfminer(pdf_path)

# Split the text into named sections with the updated splitter
parsed_sections = split_resume_into_sections(resume_text, pdf_path=pdf_path)

# List every section name that was found
print("=== Extracted Sections ===")
for section in parsed_sections:
    print("-", section)

# Look the experience section up once and reuse it for all checks below
has_experience = "experience" in parsed_sections
experience_body = parsed_sections.get("experience", "")

# Experience diagnostics: 1000-char preview (or a placeholder when the section
# is missing), presence flag, raw length, and a shorter 500-char preview
print("\n=== Experience Section ===")
print((experience_body if has_experience else "[Not Found]")[:1000])
print("✅ Experience section exists:", has_experience)
print("📝 Raw experience text length:", len(experience_body))
print("\n--- Preview ---\n", experience_body[:500])



[Fallback] No 'education' section found. Trying raw text extraction for education section...
[Fallback] No 'skills' section found. Trying raw text extraction for skills section...
[Fallback] No 'experience' section found. Trying raw text extraction for experience section...
[Fallback] No 'certifications' section found. Trying raw text extraction for certifications section...
[Fallback] No 'projects' section found. Trying raw text extraction for projects section...
=== Extracted Sections ===
- summary
- education
- skills
- experience
- certifications
- projects
- other

=== Experience Section ===
Substance Abuse and Mental Health Administration, Department of HHS
Budget Analyst
Dec '23 - Mar '25
·  Conducted data mining of critical human capital information across divisions to support strategic decision-making and
hiring status evaluations, assessing effectiveness through comprehensive data analysis procedures to enhance workforce
planning.
·  Developed an Excel VBA-based incentive tra

In [8]:
from llm_enhancer import enhance_resume_experience

# End-to-end run through the packaged helper: it receives the resume path and
# a keyword list, and its return value is printed as-is.
pdf_path = "docs/sample_resume.pdf"
# Shorter hand-picked keyword list; this rebinds the notebook-level `job_keywords`.
job_keywords = ["SQL", "Python", "Power BI", "budget forecasting"]
output = enhance_resume_experience(pdf_path, job_keywords)
print(output)


[Fallback] No 'education' section found. Trying raw text extraction for education section...
[Fallback] No 'skills' section found. Trying raw text extraction for skills section...
[Fallback] No 'experience' section found. Trying raw text extraction for experience section...
[Fallback] No 'certifications' section found. Trying raw text extraction for certifications section...
[Fallback] No 'projects' section found. Trying raw text extraction for projects section...
Company: Substance Abuse and Mental Health Administration, Department of HHS
Title: Budget Analyst
Dates: Dec '23 - Mar '25

- Leveraged SQL and Python to engineer a real-time payroll tracking system, integrating multiple fund sources and lines of accounting, resulting in a 2-hour/month reduction in reconciliation time and improved payroll forecasting accuracy.
- Implemented an Excel VBA-based incentive tracking system with automated validation checks, enhancing data accuracy by 85% and providing valuable insights for strateg