In [11]:
import os
import json
import time
import requests
import numpy as np
import pandas as pd
import chromadb
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from huggingface_hub import InferenceClient, login
from sentence_transformers import SentenceTransformer

In [12]:
# Load variables from a .env file into the process environment, then read the
# Hugging Face token from it (None when HF_TOKEN is not set).
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

### Extract PDF data 

In [8]:
# Open the CV and pull out its text and document metadata.
pdf_path = "CV_2025.pdf"
reader = PdfReader(pdf_path)

# Extract every page exactly once; calling extract_text() again inside the
# filter would repeat the extraction work for each page.
page_texts = [page.extract_text() for page in reader.pages]

# Keep only pages that produced text and join them with newlines.
text = "\n".join(page_text for page_text in page_texts if page_text)
metadata = reader.metadata

### Get the embeddings

In [5]:
# Authenticate with the Hugging Face Hub using the token read from HF_TOKEN.
login(hf_token) # Your API Key

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Explore other SentenceTransformer models:
https://huggingface.co/models

In [16]:
# Load the BAAI/bge-m3 sentence-embedding model used by get_embedding().
embedding_model = SentenceTransformer("BAAI/bge-m3")

In [17]:
def get_embedding(text: str) -> list[float]:
    """Encode ``text`` with the module-level ``embedding_model``.

    Args:
        text: The string to embed.

    Returns:
        The embedding converted to a plain Python list. When ``text`` is
        empty or whitespace-only, a notice is printed and an empty list is
        returned instead.
    """
    if text.strip():
        return embedding_model.encode(text).tolist()

    print("Attempted to get embedding for empty text.")
    return []

### Chunk the data

In [25]:
# Splitter for the CV text: small, overlapping chunks. Separators are tried in
# the order listed, so the order matters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=50,
    separators=['\n', '.', '– ', '• ', ' ', ''],
)

# One record per chunk: a sequential id, the chunk text, and its embedding.
chunks = []
last_key = 0
for document in tqdm([text]):
    for piece in text_splitter.split_text(document):
        chunks.append(
            {
                'id': f"id{last_key}",
                'content': piece,
                'embedding': get_embedding(piece),
            }
        )
        last_key += 1
 


100%|██████████| 1/1 [00:03<00:00,  3.35s/it]


#### Create a vector database

In [27]:
# Persistent Chroma client stored under "chroma_tmp"; allow_reset must be
# enabled for the reset() call below to be permitted.
chroma_settings = chromadb.Settings(allow_reset=True)
client = chromadb.PersistentClient(path="chroma_tmp", settings=chroma_settings)

# WARNING: reset() clears the whole database, so every run starts from scratch.
client.reset()

True

In [83]:
# Collection holding the CV chunks; "hnsw:space" selects cosine as the
# distance metric for the index.
cv_collection_metadata = {"hnsw:space": "cosine"}
cv_collection = client.create_collection(name="CV_2025", metadata=cv_collection_metadata)

In [85]:
# Insert all CV chunks with a single batched add() instead of one call per
# chunk, matching how the job postings are inserted into their collection.
# The guard keeps an empty chunk list a no-op, as it was with the per-chunk loop.
if chunks:
    cv_collection.add(
        ids=[chunk["id"] for chunk in chunks],
        documents=[chunk["content"] for chunk in chunks],
        embeddings=[chunk["embedding"] for chunk in chunks],
    )

### Get data about job postings

In [13]:
# Fetch job postings from the Arbeitnow job-board API.
url = "https://www.arbeitnow.com/api/job-board-api"

payload = {}
headers = {}

# The timeout stops the notebook from hanging forever on a stalled connection;
# raise_for_status() turns an HTTP error into an exception here rather than a
# confusing JSON/KeyError failure in a later cell.
response = requests.get(url, headers=headers, data=payload, timeout=30)
response.raise_for_status()

In [14]:
# Parse the JSON body. `response` is rebound from the HTTP response object to
# the parsed dict, so skip the parse when this cell is re-run and `response`
# is already the dict (it would otherwise fail on the missing `.text`).
if not isinstance(response, dict):
    response = json.loads(response.text)

In [15]:
# Display the parsed payload.
# NOTE(review): this echoes the complete API response into the notebook output;
# consider previewing only a slice of response['data'] to keep the file small.
response

{'data': [{'slug': 'scooters-mechanic-frankfurt-270495',
   'company_name': 'Bolt Technology',
   'title': 'Scooters Mechanic',
   'description': '<p>&lt;gh-intro&gt;<br>&lt;text&gt;</p>\n<p>We\'re looking for a motivated and hands-on Scooters Mechanic who will ensure that our fleet of e-scooters is always maintained at the highest quality.</p>\n<p>&lt;/text&gt;<br>&lt;/gh-intro&gt;</p>\n<p>&nbsp;</p>\n<p>&lt;gh-about-us&gt;<br>&lt;title&gt;<strong>About us</strong>&lt;/title&gt;</p>\n<p>&lt;text&gt;</p>\n<p>With over 150 million users in 45+ countries, Bolt is one of the fastest-growing tech companies in Europe and Africa. And it\'s all thanks to our people.</p>\n<p>&nbsp;</p>\n<p>We believe in creating an inclusive environment where everyone is welcome, regardless of race, colour, religion, gender identity, sexual orientation, national origin, age, or ability.</p>\n<p>&nbsp;</p>\n<p>Our ultimate goal is to make cities for people, not cars. And we need your help on this mission!</p>\n

### Step 1: find most suitable recommendations

In [55]:
# Splitter for job descriptions: larger chunks than for the CV. Separators are
# tried in the order listed, so the order matters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n', '.', '– ', '• ', ' ', ''],
    chunk_size=1000,
    chunk_overlap=50
)

# One record per description chunk: sequential id, chunk text, embedding, and
# the metadata of the job posting the chunk came from.
chunks_jobs = []
last_key = 0
for doc in tqdm(response['data']):
    # Named `description` rather than `content` so that re-running this cell
    # cannot overwrite the prompt template the notebook also keeps in `content`.
    description = doc['description']

    # Built once per posting; list-valued fields are stringified before storage.
    job_metadata = {
        'location': doc['location'],
        'remote': doc['remote'],
        'job_types': str(doc['job_types']),
        'title': doc['title'],
        'company_name': doc['company_name'],
        'url': doc['url'],
        'tags': str(doc['tags']),
    }

    for piece in text_splitter.split_text(description):
        chunks_jobs.append({
            'id': f"id{last_key}",
            'content': piece,
            'embedding': get_embedding(piece),
            # Copy so that every chunk owns an independent metadata dict.
            'metadata': dict(job_metadata),
        })
        last_key += 1
 

100%|██████████| 100/100 [00:49<00:00,  2.01it/s]


In [92]:
# Unsplit variant: one record per job posting, embedding the whole description.
chunks_jobs_unsplit = []
last_key = 0
# Iterate the list directly: wrapping enumerate() in tqdm hides the length, so
# the progress bar cannot show a total, and the index was never used.
for doc in tqdm(response['data']):
    # Named `description` rather than `content` so that re-running this cell
    # cannot overwrite the prompt template the notebook also keeps in `content`.
    description = doc['description']
    chunks_jobs_unsplit.append({
        'id': f"id{last_key}",
        'content': description,
        'embedding': get_embedding(description),
        # List-valued fields are stringified before storage.
        'metadata': {
            'location': doc['location'],
            'remote': doc['remote'],
            'job_types': str(doc['job_types']),
            'title': doc['title'],
            'company_name': doc['company_name'],
            'url': doc['url'],
            'tags': str(doc['tags']),
        },
    })
    last_key += 1
 

100it [01:32,  1.08it/s]


In [56]:
# Collection holding the job-description chunks; "hnsw:space" selects cosine
# as the distance metric for the index.
job_collection_metadata = {"hnsw:space": "cosine"}
job_collections = client.create_collection(name="jobs_collections", metadata=job_collection_metadata)

In [None]:
# Insert every job-description chunk in one batched call.
job_collections.add(
    ids=[chunk["id"] for chunk in chunks_jobs],
    documents=[chunk["content"] for chunk in chunks_jobs],
    metadatas=[chunk["metadata"] for chunk in chunks_jobs],
    embeddings=[chunk["embedding"] for chunk in chunks_jobs],
)

100%|██████████| 389/389 [00:00<00:00, 551.45it/s]


In [58]:
# Embed the complete CV text as a single vector (despite the plural name);
# it is the query for the job search below.
cv_embeddings = get_embedding(text)

In [None]:
# Find the 10 stored job-description chunks nearest to the CV embedding.
# NOTE(review): hits are chunks, so several may come from the same posting.
job_query_args = {"query_embeddings": cv_embeddings, "n_results": 10}
top_results = job_collections.query(**job_query_args)

**TODO:** Add metadata filters to the search (e.g. internship, WS, professional).

### Step 2: generate cover letter

Iterate over the top-k job matches and retrieve the most relevant CV chunks for each one.

In [63]:
# Alias for the CV chunk records built earlier (same list object, not a copy).
cv_chunks = chunks

In [97]:
# Ids of the hits for the first (and only) query embedding. These are the ids
# of stored description chunks, not of whole job postings.
top_job_ids = top_results['ids'][0]

In [None]:
# Look up the stored text of each matched chunk by id.
# NOTE(review): get() may not return records in the order the ids were passed
# (i.e. similarity rank may be lost) — confirm against the Chroma docs.
top_job_descriptions = job_collections.get(ids=top_job_ids)['documents']

In [None]:
# Fetch the full stored records (ids, documents, metadatas) of the matched chunks.
top_job_matches = job_collections.get(ids=top_job_ids)

In [110]:
# Build one plain-text job summary per matched record for use in the prompt.
job_to_apply_to = [
    f"Company name: {job_meta['company_name']}\n"
    f"Job title: {job_meta['title']}\n"
    f"Job type: {job_meta['job_types']}\n"
    f"Job description: {job_text}"
    for job_meta, job_text in zip(top_job_matches['metadatas'], top_job_matches['documents'])
]

In [112]:
# Embed each matched job text, then fetch the 10 nearest CV chunks for each one
# (one result list per query embedding).
job_query_embeddings = [get_embedding(description) for description in top_job_descriptions]
top_cv_matches = cv_collection.query(
    query_embeddings=job_query_embeddings,
    n_results=10,
)

In [100]:
# Show each matched job text next to the CV chunks retrieved for it.
for description, cv_matches in zip(top_job_descriptions, top_cv_matches['documents']):
    print('Job description:')
    print(description)
    print("Best CV matches:")
    print(cv_matches)

Job description:
<h2>Tasks</h2>
<ul>
<li><strong>Develop and maintain high-quality software solutions</strong> using modern technologies.</li>
<li>Work on <strong>innovative and large-scale projects</strong> that impact businesses globally.</li>
<li>Collaborate with <strong>cross-functional teams</strong> to design, build and deploy software applications.</li>
<li>Use programming languages such as <strong>Python, Java, C#, JavaScript, PHP or any other language</strong> to create <strong>scalable and high-performance applications</strong>.</li>
<li>Ensure <strong>code quality, security and efficiency</strong> through best practices.</li>
<li>Participate in <strong>Agile/Scrum methodologies</strong> to deliver projects efficiently.</li>
<li>Continuously <strong>learn and implement new technologies</strong> to stay ahead in the industry.</li>
</ul>
<h2>Requirements</h2>
<ul>
<li>You have experience in <strong>ANY programming language</strong> (Python, Java, C#, JavaScript, PHP, etc.).</li

### Step 3: call the model and generate the letter.

In [119]:
# Task instruction for the model; it fills the first slot of the prompt template.
instruction = '''You are a job application assistant. Your task is, given the job description and highlights from the candidate's CV, write a cover letter tailored to the job description to maximize the chances of the candidate to get an interview invitation.
'''

In [120]:
# Prompt template with four positional {} slots, filled in this order by
# str.format: instruction, job description, candidate profile, CV highlights.
content = '''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n

### Instruction:
  {}

  ### Job description:
  {}

  ### Candidate profile:
  {}
  
  ### Candidate CV highlights:
  {}
  
  ### Response:
  '''

In [121]:
# Maps each job summary string to the cover letter generated for it.
job_to_cover_letter = {}

In [None]:
# OpenAI-compatible client pointed at the Hugging Face inference endpoint.
# It gets its own name: rebinding `client` here would shadow the chromadb
# client, and the vector-store cells would fail when re-run afterwards.
llm_client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=hf_token,
)

# Make sure the output folder exists before the first checkpoint is written.
os.makedirs('model_responses', exist_ok=True)

for i in tqdm(range(len(top_job_descriptions))):
    # Fill the prompt template: instruction, job summary, candidate profile,
    # and the CV chunks retrieved for this job.
    # NOTE(review): `static_profile_info` is not defined in the visible part of
    # this notebook — confirm it is set before this cell runs.
    messages = [
        {
            "role": "user",
            "content": content.format(instruction, job_to_apply_to[i], static_profile_info, str(top_cv_matches['documents'][i]))
        }]

    completion = llm_client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=messages,
    )

    job_to_cover_letter[job_to_apply_to[i]] = completion.choices[0].message.content

    # Rewrite the file after every letter so that letters already generated
    # are on disk even if a later request fails.
    with open('model_responses/cover_letters.json', 'w', encoding='utf-8') as file:
        json.dump(job_to_cover_letter, file)

	

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:09<00:00,  6.96s/it]


In [130]:
# Show the text of the most recent completion, i.e. the letter generated in
# the final iteration of the loop above.
completion.choices[0].message.content

"Dear Hiring Manager,\n\nI am excited to apply for the [Job Title] position at CrossCommerce GmbH, as advertised in your company's careers page. With my strong background in Data Science and Analytics, I am confident that I can make a valuable contribution to your team.\n\nAs a recent graduate in Master's in Data Science, I developed skills in programming languages such as Python, R, and Java sind mijn strong pointen, together with a solid understanding of Machine Learning algorithms. In my current role as a Data Analyst at XYZ Corporation, I have been working with large datasets, building predictive models, and creating data visualizations to derive insights from the data.\n\nI am particularly drawn to this position at CrossCommerce GmbH because of the emphasis on using data-driven insights to drive business decisions. With my experience in data analysis, I believe I can bring a unique perspective to your company and help drive strategic decisions. My CV highlights several projects, i

In [None]:
# vector_collections.count()