In [8]:
import os
from pypdf import PdfReader

def read_resumes_from_directory(directory="resume-pdfs"):
    """
    Read all PDFs from a directory and return a list of text strings
    """
    resumes = []

    # Check if directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return resumes

    # Get all PDF files from the directory
    pdf_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]

    if not pdf_files:
        print(f"No PDF files found in '{directory}'.")
        return resumes

    # Process each PDF file
    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory, pdf_file)

        try:
            # Extract text from PDF
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text()

            resumes.append(text)
            print(f"Processed: {pdf_file} ({len(text)} characters)")
        except Exception as e:
            print(f"Error with {pdf_file}: {str(e)}")

    print(f"Total resumes extracted: {len(resumes)}")
    return resumes


# Call the function to get the list of resume texts
resumes = read_resumes_from_directory()
resumes[:1]

Processed: resume_9_amanda_foster.pdf (1878 characters)
Processed: resume_1_sarah_chen.pdf (1485 characters)
Processed: resume_8_monica_garcia.pdf (1702 characters)
Processed: resume_7_robert_johnson.pdf (1682 characters)
Processed: resume_5_david_kim.pdf (1548 characters)
Processed: resume_3_jennifer_park.pdf (1575 characters)
Processed: resume_4_alex_thompson.pdf (1439 characters)
Processed: resume_10_james_mitchell.pdf (2065 characters)
Processed: resume_2_marcus_rodriguez.pdf (1342 characters)
Processed: resume_6_lisa_wang.pdf (1509 characters)
Total resumes extracted: 10


['Dr. Amanda Foster\nResearch Scientist\nEmail: amanda.foster@email.com\nLocation: Cambridge, MA\nExperience: 8 years\nProfessional Summary\nAI research scientist with 8 years experience in computer vision and natural language processing. PhD\nin Computer Science with focus on deep learning applications.\nProfessional Experience\nPrincipal Research Scientist | AI Research Institute | 2021 - Present\n- Published 15 peer-reviewed papers on computer vision and NLP, cited 500+ times in academic\nliterature\n- Built state-of-the-art image recognition model achieving top-3 accuracy on ImageNet benchmark\n- Led research team of 6 PhD students and postdocs investigating multimodal AI systems\nResearch Scientist | Tech Giant Research Lab | 2018 - 2021\n- Developed novel neural network architecture for language understanding, open-sourced for research\ncommunity\n- Shipped research prototype to production, serving millions of users with real-time image analysis\n- Collaborated with engineering t

In [1]:
from getpass import getpass
from dotenv import load_dotenv
import os

env_file = '.env'
load_dotenv('.env', override=True)

if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI AI API key: ")

In [32]:
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel
import json
from typing import List
from tqdm.asyncio import tqdm as tqdm_async
import asyncio

def chunks(xs, n=10):
    n = max(1, n)
    return [xs[i:i + n] for i in range(0, len(xs), n)]



class TextExtractor:
    def __init__(self,
                 llm_with_struct_output,
                 prompt_template: PromptTemplate):
        self.llm = llm_with_struct_output
        self.prompt_template = prompt_template

    async def extract(self, texts: List[str], semaphore) -> BaseModel:
        async with semaphore:
            prompt = self.prompt_template.invoke({'texts': '\n\n'.join(texts)})
            # Use structured LLM for extraction
            entity: BaseModel = await self.llm.ainvoke(prompt)
        return entity


    async def extract_all(self, texts: List[str], chunk_size=1, max_workers=10) -> List[BaseModel]:
        # Create a semaphore with the desired number of workers
        semaphore = asyncio.Semaphore(max_workers)

        # Create tasks with the semaphore
        text_chunks = chunks(texts, chunk_size)
        tasks = [self.extract(text_chunk, semaphore) for text_chunk in text_chunks]

        # Explicitly update progress using `tqdm` as tasks complete
        entities: List[BaseModel] = []
        with tqdm_async(total=len(tasks), desc="extracting texts") as pbar:
            for future in asyncio.as_completed(tasks):
                result = await future
                entities.append(result)
                pbar.update(1)  # Increment progress bar for each completed task
        return entities

In [33]:
people_prompt_template = PromptTemplate.from_template("""
You are extracting information from resumes according to the people schema. Below is the resume
# Resumes
{texts}
""")

In [34]:
from person import Person
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4.1", temperature=0).with_structured_output(Person)

In [35]:
text_extractor = TextExtractor(llm, people_prompt_template)

In [36]:
people = await text_extractor.extract_all(resumes)

extracting texts: 100%|██████████| 10/10 [00:18<00:00,  1.81s/it]


In [37]:
type(people)

list

In [38]:
len(people)

10

In [41]:
people_list = [person.model_dump() for person in people]

with open('extracted-people-data.json', 'w') as json_file:
    json.dump(people_list, json_file, indent=4)

print("JSON file created: people_data.json")


JSON file created: people_data.json
