In [7]:
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
import fitz  # PyMuPDF
from dotenv import load_dotenv
import os
from openai import OpenAI

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

def extract_text_from_pdf(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

class ResumeItem(BaseModel):
    title: str
    subtitle: Optional[str]
    start_date: Optional[str]
    end_date: Optional[str]
    details: Optional[List[str]]
    extra: Optional[Dict[str, Any]]

class ResumeSection(BaseModel):
    section_name: str
    items: List[ResumeItem]

class Resume(BaseModel):
    sections: List[ResumeSection]

def parse_resume_with_openai(resume_text: str) -> Optional[Resume]:
    openai = OpenAI(api_key=api_key)
    # openai.api_key = openai_api_key
    prompt = f"""
You are an expert resume parser. Given this resume text, extract all information into the following JSON schema: {Resume.schema_json(indent=2)}
Resume Text:
\"\"\"
{resume_text}
\"\"\"
Return only the JSON.
"""
    response = openai.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=4096,
        temperature=0,
        response_format=Resume,
    )
    output = response.choices[0].message.parsed
    # json_str = response['choices'][0]['message']['content']
    # import json
    # try:
    #     data = json.loads(json_str)
    #     return Resume.parse_obj(data)  # This will create ResumeSection and ResumeItem objects
    # except (json.JSONDecodeError, ValidationError) as e:
    #     print("Parsing error:", e)
    #     return None
    return output

def resume_to_markdown(resume):
    md = []
    for section in resume.sections:
        md.append(f"## {section.section_name}\n")
        for item in section.items:
            # Title and subtitle
            line = f"**{item.title}**"
            if item.subtitle:
                line += f", *{item.subtitle}*"
            # Dates
            if item.start_date or item.end_date:
                dates = []
                if item.start_date:
                    dates.append(item.start_date)
                if item.end_date:
                    dates.append(item.end_date)
                line += f" ({' - '.join(dates)})"
            md.append(line)
            # Details
            if item.details:
                for detail in item.details:
                    md.append(f"- {detail}")
            # Extra fields
            if item.extra:
                for k, v in item.extra.items():
                    md.append(f"  - **{k.capitalize()}**: {v}")
            md.append("")  # Blank line for spacing
    return "\n".join(md)


resume = extract_text_from_pdf('/home/mory/jobProject/resumeBuilder2/uploads/Mory_Gharasuie_resume.pdf')
output = parse_resume_with_openai(resume)
# print(resume_to_markdown(output))
print(output)



/tmp/ipykernel_263929/1793766883.py:34: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  You are an expert resume parser. Given this resume text, extract all information into the following JSON schema: {Resume.schema_json(indent=2)}


sections=[ResumeSection(section_name='Contact Information', items=[ResumeItem(title='Mory Gharasuie', subtitle=None, start_date=None, end_date=None, details=['Norfolk, VA, USA', 'mmoha014@odu.edu', '+1 757 287 1602', 'https://www.linkedin.com/in/mory-gharasui-53415258/', 'https://github.com/mortezamg63'], extra=None)]), ResumeSection(section_name='Education', items=[ResumeItem(title='PhD candidate in computer science', subtitle='Old Dominion University, Norfolk, USA', start_date='Aug 2019', end_date='present', details=['GPA: 3.84/4.0', 'Research Interests: Self-Supervised Learning and Semi-supervised Learning in Imbalanced datasets (Image, Text and Tabular Domains)'], extra=None), ResumeItem(title='Master of Science in Computer Engineering', subtitle='University of NabiAkram, Tabriz, Iran', start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='Bachelor of Science in Computer Engineering', subtitle='University of Shamsipoor, Tehran, Iran', start_date=None, end_dat

In [9]:
a=Resume(sections=[ResumeSection(section_name='Education', items=[ResumeItem(title='PhD candidate in computer science', subtitle='Old Dominion University', start_date='Aug 2019', end_date='present', details=['GPA: 3.84/4.0', 'Research Interests: Self-Supervised Learning and Semi-supervised Learning in Imbalanced datasets (Image, Text and Tabular Domains)'], extra=None), ResumeItem(title='Master of Science in Computer Engineering', subtitle='University of NabiAkram', start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='Bachelor of Science in Computer Engineering', subtitle='University of Shamsipoor', start_date=None, end_date=None, details=None, extra=None)]), ResumeSection(section_name='Technical Skills', items=[ResumeItem(title='Languages & databases', subtitle=None, start_date=None, end_date=None, details=['python', 'Java', 'C++', 'ASP Webform', 'C#', 'SQL', 'MySQL', 'HTML'], extra=None), ResumeItem(title='Libraries', subtitle=None, start_date=None, end_date=None, details=['Tensorflow', 'Keras', 'PyTorch', 'OpenCV', 'Scikit-learn', 'NLP toolkit', 'HuggingFace', 'Pandas', 'Matplotlib', 'Seaborn', 'LangChain', 'Dask', 'BeautifulSoup', 'Flask'], extra=None), ResumeItem(title='Development tools', subtitle=None, start_date=None, end_date=None, details=['Anaconda', 'Jupyter Notebook', 'Google Colab', 'Visual Studio', 'Git', 'Docker', 'AWS'], extra=None), ResumeItem(title='Operating Systems', subtitle=None, start_date=None, end_date=None, details=['Windows', 'Linux', 'Mac OS X'], extra=None)]), ResumeSection(section_name='Certifications', items=[ResumeItem(title='LanGraph', subtitle=None, start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='LLM Engineering', subtitle=None, start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='AWS SageMaker', subtitle=None, start_date=None, end_date=None, details=None, extra=None)]), ResumeSection(section_name='Awards and Honors', items=[ResumeItem(title='Best Teaching Assistant', subtitle='Spring 2025', start_date=None, end_date=None, details=None, extra=None)]), ResumeSection(section_name='Experience', items=[ResumeItem(title='Software Developer', subtitle='Royan Communication Company', start_date='2013', end_date='2019', details=['Developing websites for small and medium-sized enterprises.', 'Customizing web-based administration interfaces for applications on Linux server machine (such as chat server, FreeRadius server, and Elastix).', 'Enhancing the panels with support for multiple languages and designing them to be more intuitive and user-friendly, tailored to meet the specific needs and preferences of the customers.'], extra=None), ResumeItem(title='Research Assistantship', subtitle='Old Dominion University', start_date='Aug 2019', end_date='Present', details=['Developing applications for mobile and serverless domains by leveraging ML, DL and CV.', 'Doing research on Improving the performance of ML and DL models on classification problems for tabular data in SSL setting.', 'Research on mitigating the impact of bias in imbalanced data in training ML and DL models in Image and tabular domains.'], extra=None), ResumeItem(title='Teaching Assistantship', subtitle='Old Dominion University', start_date='Aug 2019', end_date='Present', details=['Programming with C/C++ and Java (CS150, CS250, CS251)', 'Teaching Labs and recitations', 'Assignment Development', 'Grading'], extra=None), ResumeItem(title='Collaboration in Developing a ChatBot', subtitle='Medical Aid', start_date='Summer 2024', end_date=None, details=['Utilizing Large Language Models (LLMs) and Retrieval-Augmented Generation.', 'Medical data extraction with reference to papers or resource.', 'Presentation of results based on standard medical format.', 'Providing relevant questions or considerations from recent papers for better diagnosis.'], extra=None)]), ResumeSection(section_name='Projects', items=[ResumeItem(title='Data Science and Machine Learning Projects', subtitle=None, start_date=None, end_date=None, details=['Created and managed a repository featuring data science and machine learning projects.', 'These projects involve working with various datasets, and machine learning algorithms from traditional to STOA.', 'The repository includes: Data processing and preprocessing, Exploring data analysis (EDA), Feature engineering and selection, Machine learning development and evaluation (DNNs, Decision-Tree based models, Regression, Recommendation models, etc), Data visualization and interpretation.'], extra={}), ResumeItem(title='Large Language Models', subtitle=None, start_date=None, end_date=None, details=['Investigate the realm of retrieval-augmented generation (RAG) systems, embedding models, prompt engineering, and fine-tuning large language models.', 'Learning about Generative AI and leveraging some embedding models for practical applications.', 'The repository showcases ongoing exploration in this exciting field and will be continually updated with new insights and findings.'], extra=None), ResumeItem(title='Pricer (Agentic LLM)', subtitle=None, start_date=None, end_date=None, details=['An autonomous price estimation framework using LLMs (GPT-4o, Claude, Llama 3.1-8B) and traditional ML models (Random Forest, SVM, Word2Vec).', "GPT-4o-mini's performance (average price difference) with RAG, improves from 80.9 to 55.57.", 'Fine-tuned Llama3.1-8B and achieved 46.67 average error.', 'Developed an agent that creates an ensemble model combining RAG+GPT-4o-mini, fine-tuned Llama, and Random Forest, achieving 54.62 error.', 'Used a Gradio-based UI and integrated Pushover API for real-time deal alerts from DealNews.Com.'], extra=None), ResumeItem(title='Tabular Data and Semi-Supervised Learning (SSL)', subtitle=None, start_date=None, end_date=None, details=['This research addresses two major challenges in machine learning with large tabular datasets: class imbalance and the transformation of heterogeneous features, particularly non-numerical ones.', 'Developed domain-specific architectures for tabular data, including Transformer-based models designed for self-supervised and semi-supervised learning scenarios.', 'Introduced target-encoding transformations within semi-supervised frameworks, incorporating the imbalance characteristics of the data.', 'The effectiveness of these methods, which demonstrate improvements over the state-of-the-art, is documented in publications.'], extra=None), ResumeItem(title='Computer Vision Projects', subtitle=None, start_date=None, end_date=None, details=['Exercise Performance Monitoring: Developed a smartphone-based system that uses pose estimation to track movements during weight training.', 'Hand Gesture Recognition: Designed a system to recognize numbers written in mid-air using Hidden Markov Models.', 'Video Analytics System: Built an object detection and tracking pipeline for mobile edge cloud computing (MECC).'], extra=None)]), ResumeSection(section_name='Publications', items=[ResumeItem(title='LTBoost: Boosting Recall Uniformity for Long-Tailed Image Classification', subtitle='CAIP2025', start_date='Under Reviewer', end_date=None, details=None, extra=None), ResumeItem(title='SAWTab: Smoothed Adaptive Weighting for Tabular Data in Semi-Supervised Learning', subtitle='PAKDD 2024', start_date=None, end_date=None, details=None, extra={}), ResumeItem(title='Progressive Feature Upgrade in Semi-supervised Learning on Tabular Domain', subtitle='ICKG 2022', start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='Performance Monitoring for Exercise Movements using Mobile cameras', subtitle='BodySys 2021', start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='An efficient run-based method for connected component labeling', subtitle='MVIP 2015', start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='Real-time dynamic hand gesture recognition using hidden Markov models', subtitle='MVIP 2013', start_date=None, end_date=None, details=None, extra=None)])])
a

TypeError: 'Resume' object is not subscriptable

In [3]:
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
import fitz  # PyMuPDF
from dotenv import load_dotenv
import os
from openai import OpenAI

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

def extract_text_from_pdf(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

class ResumeItem(BaseModel):
    title: str
    subtitle: Optional[str]
    start_date: Optional[str]
    end_date: Optional[str]
    details: Optional[List[str]]
    extra: Optional[Dict[str, Any]]

class ResumeSection(BaseModel):
    section_name: str
    items: List[ResumeItem]

class Resume(BaseModel):
    sections: List[ResumeSection]

def parse_resume_with_openai(resume_text: str) -> Optional[Resume]:
    openai = OpenAI(api_key=api_key)
    # openai.api_key = openai_api_key
    prompt = f"""
You are an expert resume parser. Given this resume text, extract all information into the following JSON schema: {Resume.schema_json(indent=2)}
Resume Text:
\"\"\"
{resume_text}
\"\"\"
Return only the JSON.
"""
    response = openai.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=4096,
        temperature=0,
        response_format=Resume,
    )
    output = response.choices[0].message.parsed
    # json_str = response['choices'][0]['message']['content']
    # import json
    # try:
    #     data = json.loads(json_str)
    #     return Resume.parse_obj(data)  # This will create ResumeSection and ResumeItem objects
    # except (json.JSONDecodeError, ValidationError) as e:
    #     print("Parsing error:", e)
    #     return None
    return output

def resume_to_markdown(resume):
    md = []
    for section in resume.sections:
        md.append(f"## {section.section_name}\n")
        for item in section.items:
            # Title and subtitle
            line = f"**{item.title}**"
            if item.subtitle:
                line += f", *{item.subtitle}*"
            # Dates
            if item.start_date or item.end_date:
                dates = []
                if item.start_date:
                    dates.append(item.start_date)
                if item.end_date:
                    dates.append(item.end_date)
                line += f" ({' - '.join(dates)})"
            md.append(line)
            # Details
            if item.details:
                for detail in item.details:
                    md.append(f"- {detail}")
            # Extra fields
            if item.extra:
                for k, v in item.extra.items():
                    md.append(f"  - **{k.capitalize()}**: {v}")
            md.append("")  # Blank line for spacing
    return "\n".join(md)


resume = extract_text_from_pdf('/home/mory/jobProject/resumeBuilder2/uploads/Mory_Gharasuie_resume.pdf')
output = parse_resume_with_openai(resume)
print(resume_to_markdown(output))



/tmp/ipykernel_263929/1885356036.py:34: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  You are an expert resume parser. Given this resume text, extract all information into the following JSON schema: {Resume.schema_json(indent=2)}


## Education

**PhD candidate in computer science**, *Old Dominion University* (Aug 2019 - present)
- GPA: 3.84/4.0
- Research Interests: Self-Supervised Learning and Semi-supervised Learning in Imbalanced datasets (Image, Text and Tabular Domains)

**Master of Science in Computer Engineering**, *University of NabiAkram*

**Bachelor of Science in Computer Engineering**, *University of Shamsipoor*

## Technical Skills

**Languages & databases**
- python
- Java
- C++
- ASP Webform
- C#
- SQL
- MySQL
- HTML

**Libraries**
- Tensorflow
- Keras
- PyTorch
- OpenCV
- Scikit-learn
- NLP toolkit
- HuggingFace
- Pandas
- Matplotlib
- Seaborn
- LangChain
- Dask
- BeautifulSoup
- Flask

**Development tools**
- Anaconda
- Jupyter Notebook
- Google Colab
- Visual Studio
- Git
- Docker
- AWS

**Operating Systems**
- Windows
- Linux
- Mac OS X

## Certifications

**LanGraph**

**LLM Engineering**

**AWS SageMaker**

## Awards and Honors

**Best Teaching Assistant**, *Spring 2025*

## Experience

**Sof

In [6]:
output

Resume(sections=[ResumeSection(section_name='Education', items=[ResumeItem(title='PhD candidate in computer science', subtitle='Old Dominion University', start_date='Aug 2019', end_date='present', details=['GPA: 3.84/4.0', 'Research Interests: Self-Supervised Learning and Semi-supervised Learning in Imbalanced datasets (Image, Text and Tabular Domains)'], extra=None), ResumeItem(title='Master of Science in Computer Engineering', subtitle='University of NabiAkram', start_date=None, end_date=None, details=None, extra=None), ResumeItem(title='Bachelor of Science in Computer Engineering', subtitle='University of Shamsipoor', start_date=None, end_date=None, details=None, extra=None)]), ResumeSection(section_name='Technical Skills', items=[ResumeItem(title='Languages & databases', subtitle=None, start_date=None, end_date=None, details=['python', 'Java', 'C++', 'ASP Webform', 'C#', 'SQL', 'MySQL', 'HTML'], extra=None), ResumeItem(title='Libraries', subtitle=None, start_date=None, end_date=Non