# AI Builders Week 3 Homework
## Resume matcher

In [1]:
import pymupdf
import requests

# Download the resume PDF and open it straight from memory (no temp file).
# A timeout is passed because requests waits indefinitely by default, which
# would otherwise leave the kernel hanging on an unresponsive server.
response = requests.get('https://miguel-vila.github.io/resume_EN.pdf', timeout=30)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page as a PDF
pdf_bytes = response.content
resume = pymupdf.open(stream=pdf_bytes, filetype='pdf')

In [2]:
# Report how many pages were parsed, then flatten the document into one
# string with a single newline between the text of consecutive pages.
print("Number of pages:", len(resume))
resume_str = '\n'.join(page.get_text() for page in resume)
resume_str

Number of pages: 3


'MIGUEL VIL´A GONZ´ALEZ\nmiguel-vila @ github\nPROFESSIONAL EXPERIENCE\nSenior Software Engineer\nRemote - US East Coast Timezone\nSiriusXM\nNovember 2022 - Present\n· Member of the API tooling team, part of the platform services enablement organization.\n· Develop tooling using Smithy, an AWS DDL language, empowering teams to describe, implement, and consume\ntheir services.\n· Key projects include:\n· Developed a system for describing which Smithy services are implemented and consumed by applications,\nenabling package management and dependency tracing across the platform.\n· Implemented compatibility checks to verify service changes against potential data processing breakages.\n· Contributed to a team implementing semantic search using vector embeddings stored in OpenSearch.\n· Set up infrastructure to ingest catalog data and generate embeddings using different models.\n· Set up a client to query the different indexes using the embeddings.\nLead Software Engineer ←Senior Software En

In [3]:
import os
from openai import OpenAI

# No credentials are passed explicitly, so the SDK has to locate them itself
# (presumably via the OPENAI_API_KEY environment variable - confirm for your setup).
openAI = OpenAI()

# Markdown skeleton every job posting is expected to follow. This is a plain
# (non-f) string, so the doubled braces are kept literally as placeholders.
job_posting_format="""
```
# Title
{{job title}}
## Description
{{general job description}}
## Responsibilities
{{responsibilities}}
## Qualifications
{{qualifications}}
## Bonus qualifications
{{bonus qualifications}}
```"""

# Instructions, followed by the target format and the full resume text.
fake_job_posting_prompt = f"""The following is a resume for a specific person.
        From that resume create a job listing that would fit that person.
        Don't "overfit": use the described experience but don't reproduce the CV.
        You don't have to mention specific companies.
        Only mention the most commonly mentioned technologies or the ones that appear first in lists.
        Output in the following format:
        {job_posting_format}:
        {resume_str}"""

# Ask the model for a synthetic posting that the resume would match.
fake_job_posting_response = openAI.responses.create(
    model="gpt-4o",
    input=fake_job_posting_prompt,
)

In [5]:
from IPython.display import display, Markdown

# Use the SDK's aggregated `output_text` instead of indexing
# `output[0].content[0]`: the first output item is not guaranteed to be a
# message whose first content part is text, and positional indexing would
# raise or silently drop text in that case.
fake_job_posting = fake_job_posting_response.output_text
display(Markdown(fake_job_posting))

```
# Title
Senior Software Engineer - API Tooling & Integration

## Description
Join our dynamic team as a Senior Software Engineer focused on developing innovative API tooling. This role involves enhancing platform services with scalable solutions using cutting-edge technologies. You will collaborate across teams to implement and integrate services effectively, ensuring seamless functionality and optimized performance.

## Responsibilities
- Develop and maintain API tooling using Smithy and AWS services.
- Implement compatibility checks and systems to enhance dependency management.
- Contribute to semantic search features using OpenSearch and vector embeddings.
- Lead technical projects, from crafting proposals to guiding cross-team discussions and decisions.
- Enhance and oversee subscription management systems, including development of user authentication and account management services.
- Mentor new team members and improve internal documentation.
- Conduct performance tests and define scaling policies.
- Collaborate on open-source projects to fix and enhance features.

## Qualifications
- Extensive experience in software engineering with a strong focus on API development.
- Proficiency in Scala, Python, TypeScript, AWS, and cloud infrastructure.
- Strong project management skills, with experience leading technical projects.
- Ability to decompose complex projects into deliverable units.
- Excellent communication skills for collaborating with cross-functional teams.

## Bonus qualifications
- Experience in developing and maintaining subscription management systems.
- Contributions to open-source projects, especially in the areas of functional programming and static analysis.
- Familiarity with semantic search technologies and vector embeddings.
```

In [6]:
import pandas as pd

# Load the job postings table from the working directory.
jobs = pd.read_csv('jobs.csv')

# Fail fast, with a readable message, if the columns the rest of the notebook
# reads ('description' for prompts/embeddings, 'title' and 'job_url' for the
# final report) are not present.
required_columns = {'title', 'job_url', 'description'}
missing_columns = required_columns - set(jobs.columns)
assert not missing_columns, f"jobs.csv is missing expected columns: {sorted(missing_columns)}"

jobs.head()

Unnamed: 0,id,site,job_url,job_url_direct,title,company,location,date_posted,job_type,salary_source,...,company_addresses,company_num_employees,company_revenue,company_description,skills,experience_range,company_rating,company_reviews_count,vacancy_count,work_from_home_type
0,in-764d02c34833d113,indeed,https://www.indeed.com/viewjob?jk=764d02c34833...,https://jobs.ashbyhq.com/yumaai/cdc8c768-e82b-...,AI Product Focused - Senior Fullstack / Rails ...,Yuma AI,"Boston, MA, US",2025-09-02,fulltime,direct_data,...,,,,,,,,,,
1,in-82e200b3b54e6886,indeed,https://www.indeed.com/viewjob?jk=82e200b3b54e...,https://jobs.ashbyhq.com/gallatin/b7ed9bbb-496...,DevOps Engineer,Gallatin,"El Segundo, CA, US",2025-09-02,fulltime,direct_data,...,,,,,,,,,,
2,in-c9747635cab5e167,indeed,https://www.indeed.com/viewjob?jk=c9747635cab5...,https://grnh.se/mwr1jefn2us,"Staff Software Engineer, Growth Products",Lyft,"San Francisco, CA, US",2025-09-02,,direct_data,...,,,,Multiply your earnings when you drive with Lyf...,,,,,,
3,in-07678b0399d77ed6,indeed,https://www.indeed.com/viewjob?jk=07678b0399d7...,https://grnh.se/e8e4gmqg2us,"Staff Software Engineer, Growth Products",Lyft,"New York, NY, US",2025-09-02,,direct_data,...,,,,Multiply your earnings when you drive with Lyf...,,,,,,
4,in-f89ce04ae801a7c3,indeed,https://www.indeed.com/viewjob?jk=f89ce04ae801...,https://grnh.se/nxoofj1z2us,"Staff Software Engineer, Growth Products",Lyft,"Seattle, WA, US",2025-09-02,,direct_data,...,,,,Multiply your earnings when you drive with Lyf...,,,,,,


In [112]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import json
from math import ceil
import tiktoken

# Sentence-embedding checkpoint used for the resume and the job descriptions.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SentenceTransformer(MODEL_NAME, device=device)

def normalize_description_prompt(description: str) -> str:
    """Build the prompt that asks the model to reformat one job description.

    Args:
        description: Raw job description text to be rewritten.

    Returns:
        The instruction text with the module-level ``job_posting_format``
        template and ``description`` interpolated into it. The leading
        newline and the indentation of the template lines are part of the
        returned string.
    """
    prompt = f"""
        Take the following job posting description and format it in the following way:
        {job_posting_format}
        Apply it to the following job description:
        {description}
    """
    return prompt
# Model the batch requests target; its tokenizer is only used to estimate
# how many input tokens each request file carries.
gpt_model = "gpt-4o-mini"
enc = tiktoken.encoding_for_model(gpt_model)

# Split the jobs into `batches` contiguous slices whose sizes differ by at
# most one row: the first `r` slices take the extra row.
batches = 25
n = len(jobs)
q, r = divmod(n, batches)
sizes = [q + 1 if position < r else q for position in range(batches)]

start = 0
for batch_num, size in enumerate(sizes):
    if size == 0:
        continue  # happens when there are more batches than rows
    end = start + size
    batch_jobs = jobs.iloc[start:end]
    jsonl_path = f"requests-{batch_num}.jsonl"

    # Running token statistics for the summary line printed below.
    batch_tokens = 0
    max_row_tokens, max_row_idx = 0, None

    with open(jsonl_path, "w") as f:
        for i, row in batch_jobs.iterrows():
            prompt = normalize_description_prompt(row['description'])

            n_tokens = len(enc.encode(prompt))
            batch_tokens += n_tokens
            if n_tokens > max_row_tokens:
                max_row_tokens, max_row_idx = n_tokens, i

            # One JSON object per line; `custom_id` carries the dataframe
            # index label so results can be matched back to their row.
            request = {
                "custom_id": f"row-{i}",
                "method": "POST",
                "url": "/v1/responses",
                "body": {
                    "model": gpt_model,
                    "input": prompt
                }
            }
            f.write(json.dumps(request) + "\n")

    print(
        f"Batch {batch_num}: {size} rows, ~{batch_tokens} input tokens "
        f"(max/row ~{max_row_tokens} at df index {max_row_idx})"
    )
    start = end


Batch 0: 8 rows, ~8138 input tokens (max/row ~1386 at df index 6)
Batch 1: 8 rows, ~8564 input tokens (max/row ~1513 at df index 15)
Batch 2: 8 rows, ~6811 input tokens (max/row ~1283 at df index 20)
Batch 3: 8 rows, ~8845 input tokens (max/row ~1422 at df index 24)
Batch 4: 8 rows, ~9321 input tokens (max/row ~2030 at df index 37)
Batch 5: 8 rows, ~10974 input tokens (max/row ~1826 at df index 40)
Batch 6: 8 rows, ~8627 input tokens (max/row ~1611 at df index 48)
Batch 7: 8 rows, ~8537 input tokens (max/row ~2060 at df index 61)
Batch 8: 8 rows, ~8365 input tokens (max/row ~1970 at df index 64)
Batch 9: 8 rows, ~8172 input tokens (max/row ~1830 at df index 78)
Batch 10: 8 rows, ~10857 input tokens (max/row ~1626 at df index 82)
Batch 11: 8 rows, ~8139 input tokens (max/row ~1493 at df index 88)
Batch 12: 8 rows, ~7280 input tokens (max/row ~958 at df index 101)
Batch 13: 8 rows, ~5941 input tokens (max/row ~943 at df index 105)
Batch 14: 8 rows, ~6513 input tokens (max/row ~1064 at df

In [113]:

def list_active_batches(statuses=("validating","in_progress","queued","running","finalizing")):
    """Print every batch on the account whose status is in ``statuses``.

    Args:
        statuses: Status strings that count as "active". A batch is printed
            when its ``status`` is one of them.

    Returns:
        None. One line per matching batch is printed, or ``(none)`` when
        nothing matches.
    """
    print("Active batches:")
    found = False
    # Iterate the list result itself rather than its `.data` attribute:
    # `.data` only holds the first page, while iterating the page object lets
    # the SDK fetch the following pages, so older active batches are not missed.
    for b in openAI.batches.list():
        if b.status in statuses:
            found = True
            print(f"- {b.id}  status={b.status}  created_at={b.created_at}  endpoint={b.endpoint}")
    if not found:
        print("(none)")

def cancel_batch(batch_id: str):
    """Announce and request cancellation of a single batch.

    Args:
        batch_id: Identifier of the batch to cancel.

    Returns:
        None. The object returned by the cancel call is discarded.
    """
    announcement = f"Cancelling {batch_id} ..."
    print(announcement)
    openAI.batches.cancel(batch_id)

# Print the batches whose status is in the default `statuses` tuple.
list_active_batches()

Active batches:
(none)


In [None]:
import time

# Maps each request's custom_id (e.g. "row-17") to the response body returned for it.
unified_results = {}

# Statuses that mean "not finished yet". "finalizing" (results being written)
# and "cancelling" are included so the loop keeps waiting through them instead
# of mistaking them for a terminal failure.
pending_statuses = ("validating", "in_progress", "finalizing", "cancelling", "running", "queued")

for batch_num in range(batches):
    # Close the request file as soon as the upload call returns.
    with open(f"requests-{batch_num}.jsonl", "rb") as request_file:
        upload = openAI.files.create(file=request_file, purpose="batch")
    batch = openAI.batches.create(
        input_file_id=upload.id,
        endpoint="/v1/responses",
        completion_window="24h"
    )

    # Poll until the batch leaves every in-flight state, reporting the
    # progress of the status that was just fetched.
    status = openAI.batches.retrieve(batch.id)
    while status.status in pending_statuses:
        perc_completed = 0 if status.request_counts.total == 0 else status.request_counts.completed / status.request_counts.total
        print(f'waiting for batch: {batch_num}. Last status: {status.status}. progress={perc_completed:.1%}')
        time.sleep(10)
        status = openAI.batches.retrieve(batch.id)
    if status.status != "completed":
        raise RuntimeError(f"Batch ended as {status.status}: {status}")
    if not status.output_file_id:
        # A completed batch without an output file has nothing to download
        # (e.g. every request failed); report it explicitly.
        raise RuntimeError(f"Batch {batch_num} completed without an output file: {status}")

    out_content = openAI.files.content(status.output_file_id).read().decode("utf-8")
    for line in out_content.strip().splitlines():
        item = json.loads(line)
        cid = item["custom_id"]             # e.g., "row-17"
        response = item.get("response")
        if item.get("error") or response is None:
            # Failed requests carry no response payload; skip them loudly
            # rather than crashing on a missing body.
            print(f"Skipping {cid} in batch {batch_num}: request failed ({item.get('error')})")
            continue
        # The whole response payload is stored; the generated text typically
        # sits at body["output"][0]["content"][0]["text"].
        unified_results[cid] = response["body"]
unified_results

waiting for batch: 0. Last status: validating. progress=0.0%
waiting for batch: 0. Last status: in_progress. progress=0.0%
waiting for batch: 0. Last status: in_progress. progress=0.0%


In [63]:

# Show the status of the batch that was polled last (`status` is assigned by the polling loop).
status.status

'completed'

In [None]:
# Hand the encoder a plain list of strings rather than the pandas Series:
# positional access inside the encoder would otherwise go through the Series
# index labels, and missing descriptions (NaN) are not valid text input.
# NOTE(review): long descriptions are presumably truncated to the model's
# maximum sequence length - confirm this is acceptable for matching.
descriptions = jobs['description'].fillna('').astype(str).tolist()

embeddings = model.encode(
    descriptions,
    batch_size=32,
    convert_to_tensor=True,
    normalize_embeddings=True,
    show_progress_bar=False
)

# Store one NumPy vector per row, copying to host memory once instead of
# once per row.
jobs['embedding'] = list(embeddings.detach().cpu().numpy())

In [8]:
import torch.nn.functional as F

# Both reference texts are embedded with exactly the same encoder settings,
# so the options are declared once and reused.
single_text_encode_options = dict(
    batch_size=32,
    convert_to_tensor=True,
    normalize_embeddings=True,
    show_progress_bar=False,
)

# Embedding of the raw resume text.
resume_embedding = model.encode(resume_str, **single_text_encode_options)

# Embedding of the job posting generated from the resume.
fake_job_posting_embedding = model.encode(fake_job_posting, **single_text_encode_options)

def get_cos_sims(target_embedding):
    """Cosine similarity between ``target_embedding`` and every job embedding.

    Args:
        target_embedding: 1-D embedding to compare against (a tensor, or
            anything ``torch.as_tensor`` accepts). It may live on any device
            and have any floating dtype.

    Returns:
        A list with one float per row of ``jobs``, in row order. Empty when
        ``jobs`` has no rows.
    """
    job_embeddings = jobs['embedding']
    if len(job_embeddings) == 0:
        return []

    # The stored job embeddings are rebuilt on the CPU as float32, so the
    # target is brought to the same device and dtype first; comparing a CPU
    # tensor with a CUDA tensor would raise.
    target = torch.as_tensor(target_embedding).detach().to(device="cpu", dtype=torch.float32)

    # One (n_jobs, dim) matrix and a single similarity call instead of one
    # tiny tensor op per row.
    matrix = torch.stack([
        torch.as_tensor(embedding, dtype=torch.float32) for embedding in job_embeddings
    ])
    sims = F.cosine_similarity(matrix, target.unsqueeze(0).expand_as(matrix), dim=1)
    return sims.tolist()

# Add one similarity column per reference text: first the raw resume, then
# the job posting generated from it.
for similarity_column, reference_embedding in (
    ('cos_sim_resume', resume_embedding),
    ('cos_sim_fake_job_posting', fake_job_posting_embedding),
):
    jobs[similarity_column] = get_cos_sims(reference_embedding)

def sort_by_similarity_and_log(column_name):
    """Print the best- and worst-ranked jobs according to one column.

    ``jobs`` is sorted by ``column_name`` in descending order. The title and
    URL of the first five rows are printed, then a line of 15 underscores,
    then the title and URL of the last five rows.

    Args:
        column_name: Name of the ``jobs`` column to rank by.

    Returns:
        None; everything is written to standard output.
    """
    def _log_rows(rows):
        # Two lines per posting: its title, then its URL.
        for job in rows.itertuples(index=False):
            print(f'Title: {job.title}')
            print(f'Url: {job.job_url}')

    ranked = jobs.sort_values(column_name, ascending=False)
    _log_rows(ranked.head())
    print('_'*15)
    _log_rows(ranked.tail())

# Best and worst matches when ranking jobs against the raw resume text.
sort_by_similarity_and_log('cos_sim_resume')

# Visual separator between the two rankings.
print('*'*35)

# Same report, ranked against the job posting generated from the resume.
sort_by_similarity_and_log('cos_sim_fake_job_posting')


Title: Senior Software Engineer, Applied AI
Url: https://www.indeed.com/viewjob?jk=a53c64fadb77ee9b
Title: Software Engineer - Analytics Platforms & Experiences (APX)
Url: https://www.indeed.com/viewjob?jk=5693ba7970271f3f
Title: Sr. Software Engineer II - StreamingTV
Url: https://www.indeed.com/viewjob?jk=4d149be11348a3ef
Title: Software Engineer - Fullstack
Url: https://www.indeed.com/viewjob?jk=f75d0e7489b9c668
Title: Developer Advocate, Developer Productivity, DevEx
Url: https://www.indeed.com/viewjob?jk=da15865a44121cbd
_______________
Title: Sr Software Development Engineer
Url: https://www.indeed.com/viewjob?jk=50ebc522b8d09ed9
Title: Software Engineer (7300U) - Berkeley Seismological Lab
Url: https://www.indeed.com/viewjob?jk=535907521ac63569
Title: Senior Backend Engineer (Blockchain)
Url: https://www.indeed.com/viewjob?jk=aabd8b03b8b760c7
Title: Internship, Software Engineer, Factory Software (Winter/Spring 2026)
Url: https://www.indeed.com/viewjob?jk=691aab2a0abc6c5c
Title: 