In [1]:
import json
import os
import time

import chromadb
import numpy as np
import pandas as pd
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [127]:
# Hand-written summary of the candidate; it is passed to the prompt template
# as the "Candidate profile" section when the cover letters are generated.
# The string now starts directly with the name (the original opened with four
# quotes, which put a stray apostrophe at the start of the text), and the
# mis-encoded umlauts, apostrophe and bullets are restored.
static_profile_info = '''Nursulu Sagimbayeva. 
Munich/ Saarbrücken, Germany. Open to relocation. 
Profile: I am a Master’s student in my last year, with background in the domains of NLP, Data Science, Societal Computing, and Mechanistic Interpretability. I am looking for an internship to gain insight in the industry while working on challenging NLP/AI topics.
Work experience
•Internship Financial Assets & Solutions Data Analytics November 2024-May 2025\nMunich RE Munich, Germany.
•Research Assistant June 2023-Dec 2024. Interdisciplinary Institute of Societal Computing, Saarland Informatik Campus Saarbrücken, Germany.
•Technical Content Writer July 2022-July 2023 at Hasty.ai (CloudFactory) Berlin, Germany (remote)
Education
•M.Sc. in Natural Language Processing, current GPA: 1.5 (best: 1.0) 
October 2022-currently\nSaarland University Saarbrücken, Germany
•B.A. in Translation Studies, GPA: 3.75/4.0, German: 1.38 2018-2022
Al-Farabi Kazakh National University Almaty, Kazakhstan
'''

### Extract PDF data 

In [9]:
from pypdf import PdfReader

In [10]:
# Open the CV and parse it with pypdf.
pdf_path = "CV_2025.pdf"
reader = PdfReader(pdf_path)

# Extract each page's text exactly once (the original called extract_text()
# twice per page: once to filter and once to collect), skip pages that yield
# no text, and join the remaining pages with newlines.
page_texts = (page.extract_text() for page in reader.pages)
text = "\n".join(page_text for page_text in page_texts if page_text)

In [12]:
text

'Nursulu Sagimbayeva /envel‚å¢penurs.sagimbayeva@gmail.com\nMunich/ Saarbr√ºcken, Germany /githubnursaltyn.github.io\nOpen to relocation /linkedinLinkedIn Profile\n‚ôÇphone+4917674754101\nProfile: I am a Master‚Äôs student in my last year, with background in\nthe domains of NLP, Data Science, Societal Computing, and\nMechanistic Interpretability. I am looking for an internship to gain\ninsight in the industry while working on challenging NLP/AI topics.\n/mediumMedium\nLibraries: NumPy, Pandas, Scikit-Learn, Matplotlib, PyTorch, Transformers, Streamlit, nltk, Spacy, Gensim,\nplotly, BeautifulSoup, GeoPandas, HTML, CSS;Programming languages: Python, R\nTools: Git, LaTeX, VSCode, HuggingFace, Docker, PowerBI\nExperience: Neural Networks‚Ä¢ Machine Learning‚Ä¢ Computational Linguistics‚Ä¢ Statistics ‚Ä¢ Statistical\nNatural Language Processing‚Ä¢ Machine Translation‚Ä¢ Text Mining and Social Media Mining‚Ä¢ Trustworthy\nMachine Learning\nLanguages: English (C1)‚Ä¢ Russian (native speaker)‚

In [13]:
metadata = reader.metadata

In [14]:
metadata

{'/Author': '',
 '/CreationDate': 'D:20250122220627Z',
 '/Creator': 'LaTeX with hyperref',
 '/Keywords': '',
 '/ModDate': 'D:20250122220627Z',
 '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0',
 '/Producer': 'pdfTeX-1.40.26',
 '/Subject': '',
 '/Title': '',
 '/Trapped': '/False'}

TODO: try to extract metadata about different CV parts

### Get the embeddings

In [15]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the HF_TOKEN environment
# variable instead of a key pasted into the notebook (the original passed an
# empty string as a placeholder). When the variable is unset, None is passed;
# NOTE(review): login() is documented to prompt for a token in that case.
login(os.environ.get("HF_TOKEN"))

Explore other SentenceTransformer models:
https://huggingface.co/models

In [16]:
embedding_model = SentenceTransformer("BAAI/bge-m3")

In [17]:
def get_embedding(text: str) -> list[float]:
    """Encode ``text`` with the module-level ``embedding_model``.

    Args:
        text: The string to embed.

    Returns:
        The result of ``embedding_model.encode(text)`` converted with
        ``.tolist()``. For empty or whitespace-only input nothing is encoded:
        a notice is printed and an empty list is returned.
    """
    is_blank = text.strip() == ""
    if is_blank:
        print("Attempted to get embedding for empty text.")
        return []

    return embedding_model.encode(text).tolist()

### Chunk the data

In [25]:
# Split the CV text into small overlapping chunks.
# Separators are tried in the order given. The dash and bullet separators are
# written as real Unicode characters (en dash, bullet); in their mis-encoded
# form they could never match the text extracted from the PDF.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n', '.', '– ', '• ', ' ', ''],  # order matters
    chunk_size=150,
    chunk_overlap=50
)

# Build one record per chunk: a sequential string id, the chunk text and the
# embedding of that text.
chunks = []
last_key = 0
for doc in tqdm([text]):
    for chunk_text in text_splitter.split_text(doc):
        chunks.append({'id': f"id{last_key}",
                       'content': chunk_text,
                       'embedding': get_embedding(chunk_text),
                       })
        last_key += 1
 


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:03<00:00,  3.35s/it]


In [26]:
[el['content'] for el in chunks]

['Nursulu Sagimbayeva /envel‚å¢penurs.sagimbayeva@gmail.com\nMunich/ Saarbr√ºcken, Germany /githubnursaltyn.github.io',
 'Open to relocation /linkedinLinkedIn Profile\n‚ôÇphone+4917674754101\nProfile: I am a Master‚Äôs student in my last year, with background in',
 'the domains of NLP, Data Science, Societal Computing, and\nMechanistic Interpretability. I am looking for an internship to gain',
 'insight in the industry while working on challenging NLP/AI topics.\n/mediumMedium',
 '/mediumMedium\nLibraries: NumPy, Pandas, Scikit-Learn, Matplotlib, PyTorch, Transformers, Streamlit, nltk, Spacy, Gensim,',
 'plotly, BeautifulSoup, GeoPandas, HTML, CSS;Programming languages: Python, R\nTools: Git, LaTeX, VSCode, HuggingFace, Docker, PowerBI',
 'Experience: Neural Networks‚Ä¢ Machine Learning‚Ä¢ Computational Linguistics‚Ä¢ Statistics ‚Ä¢ Statistical',
 'Natural Language Processing‚Ä¢ Machine Translation‚Ä¢ Text Mining and Social Media Mining‚Ä¢ Trustworthy\nMachine Learning',
 'Machine Lear

#### Create a vector database

In [27]:
client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
client.reset()

True

In [83]:
# Create the collection that will hold the CV chunks.
# NOTE(review): the "hnsw:space": "cosine" metadata presumably selects cosine
# distance for this collection's index; confirm against the Chroma docs.
cv_collection = client.create_collection(
    name="CV_2025",
    metadata={"hnsw:space": "cosine"}
)

In [85]:
# Store every CV chunk (text, id, embedding) with a single batched add() call
# instead of one call per chunk. The guard keeps the cell a no-op when there
# are no chunks, as the original loop was.
if chunks:
    cv_collection.add(
        documents=[chunk['content'] for chunk in chunks],
        ids=[chunk['id'] for chunk in chunks],
        embeddings=[chunk['embedding'] for chunk in chunks],
    )

### Get data about job postings

In [32]:
import requests

In [33]:
# Arbeitnow job-board endpoint; only this one URL is requested here.
url = "https://www.arbeitnow.com/api/job-board-api"

payload = {}
headers = {}

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here rather than as a JSON decoding
# failure in a later cell.
response = requests.request("GET", url, headers=headers, data=payload, timeout=30)
response.raise_for_status()

In [40]:
# Decode the HTTP response body into a dict. The name `response` is reused for
# the decoded payload because later cells index it directly; the guard keeps
# this cell re-runnable once `response` already holds the decoded dict (the
# unguarded version failed on a second run because a dict has no `.text`).
if not isinstance(response, dict):
    response = json.loads(response.text)

In [45]:
response.keys()

dict_keys(['data', 'links', 'meta'])

In [46]:
response['links']

{'first': 'https://www.arbeitnow.com/api/job-board-api?page=1',
 'last': None,
 'prev': None,
 'next': 'https://www.arbeitnow.com/api/job-board-api?page=2'}

### Step 1: find most suitable recommendations

In [49]:
response['data'][0]

{'slug': 'embedded-software-engineer-munich-350718',
 'company_name': 'Avo Intelligence',
 'title': 'Embedded Software Engineer',
 'description': '<p>We are seeking an <strong>Edge AI &#x26; Embedded ML Engineer</strong> to develop <strong>high-performance, low-latency AI models</strong> for deployment on resource-constrained devices. This role involves optimizing deep learning models for <strong>real-time inference on edge hardware</strong>, ensuring efficiency in power-limited environments.</p>\n<p>If you have experience with <strong>TinyML, on-device AI, and embedded neural networks</strong>, this is the perfect opportunity to work on cutting-edge innovations.</p>\n<h2>Tasks</h2>\n<ul>\n<li>Design, train, and optimize machine learning models for deployment on <strong>microcontrollers, FPGAs, TPUs, and custom ASICs</strong></li>\n<li>Implement <strong>low-power deep learning</strong> solutions for edge devices</li>\n<li>Optimize models using <strong>quantization, pruning, knowledge d

In [47]:
response['data'][0]['description']

'<p>We are seeking an <strong>Edge AI &#x26; Embedded ML Engineer</strong> to develop <strong>high-performance, low-latency AI models</strong> for deployment on resource-constrained devices. This role involves optimizing deep learning models for <strong>real-time inference on edge hardware</strong>, ensuring efficiency in power-limited environments.</p>\n<p>If you have experience with <strong>TinyML, on-device AI, and embedded neural networks</strong>, this is the perfect opportunity to work on cutting-edge innovations.</p>\n<h2>Tasks</h2>\n<ul>\n<li>Design, train, and optimize machine learning models for deployment on <strong>microcontrollers, FPGAs, TPUs, and custom ASICs</strong></li>\n<li>Implement <strong>low-power deep learning</strong> solutions for edge devices</li>\n<li>Optimize models using <strong>quantization, pruning, knowledge distillation, and hardware-aware training</strong></li>\n<li>Deploy and benchmark ML models on <strong>TensorFlow Lite, ONNX, PyTorch Mobile, and E

In [55]:
# Split each job description into chunks (chunk_size=1000, chunk_overlap=50).
# Separators are tried in the order given. The dash and bullet separators are
# written as real Unicode characters (en dash, bullet); in their mis-encoded
# form they could never match real text.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n', '.', '– ', '• ', ' ', ''],  # order matters
    chunk_size=1000,
    chunk_overlap=50
)

# Build one record per chunk: a sequential string id, the chunk text, its
# embedding and the metadata of the posting the chunk came from.
chunks_jobs = []
last_key = 0
for doc in tqdm(response['data']):
    content = doc['description']
    # The metadata is the same for every chunk of a posting, so it is assembled
    # once per posting; list-valued fields are stringified with str().
    job_metadata = {'location': doc['location'], 'remote': doc['remote'],
                    'job_types': str(doc['job_types']), 'title': doc['title'],
                    'company_name': doc['company_name'], 'url': doc['url'],
                    'tags': str(doc['tags'])}
    for chunk_text in text_splitter.split_text(content):
        chunks_jobs.append({'id': f"id{last_key}",
                            'content': chunk_text,
                            'embedding': get_embedding(chunk_text),
                            # Each record gets its own copy so records never share one dict.
                            'metadata': dict(job_metadata),
                            })
        last_key += 1
 

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:49<00:00,  2.01it/s]


In [92]:
# Build one record per job posting from its whole, unsplit description:
# a sequential string id, the description text, its embedding and metadata.
chunks_jobs_unsplit = []
last_key = 0
# tqdm wraps the list itself (not enumerate(...)) so it can read the length and
# show a total; the enumerate index was unused.
for doc in tqdm(response['data']):
    content = doc['description']
    chunks_jobs_unsplit.append({
        'id': f"id{last_key}",
        'content': content,
        'embedding': get_embedding(content),
        'metadata': {'location': doc['location'], 'remote': doc['remote'],
                     'job_types': str(doc['job_types']), 'title': doc['title'],
                     'company_name': doc['company_name'], 'url': doc['url'],
                     'tags': str(doc['tags'])},
    })
    last_key += 1
 

100it [01:32,  1.08it/s]


### Unsplit version 

In [93]:
# Create the collection that will hold one entry per whole (unsplit) job description.
# NOTE(review): the "hnsw:space": "cosine" metadata presumably selects cosine
# distance for this collection's index; confirm against the Chroma docs.
job_collections_unsplit = client.create_collection(
    name="job_collections_unsplit",
    metadata={"hnsw:space": "cosine"}
)

In [94]:
# Store all unsplit job records (text, id, metadata, embedding) with a single
# batched add() call instead of one call per record. The guard keeps the cell a
# no-op when there are no records, as the original loop was.
if chunks_jobs_unsplit:
    job_collections_unsplit.add(
        documents=[job['content'] for job in chunks_jobs_unsplit],
        ids=[job['id'] for job in chunks_jobs_unsplit],
        metadatas=[job['metadata'] for job in chunks_jobs_unsplit],
        embeddings=[job['embedding'] for job in chunks_jobs_unsplit],
    )

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 254.91it/s]


### Split version

In [56]:
# Create the collection that will hold the chunked (split) job descriptions.
# NOTE(review): the "hnsw:space": "cosine" metadata presumably selects cosine
# distance for this collection's index; confirm against the Chroma docs.
job_collections = client.create_collection(
    name="jobs_collections",
    metadata={"hnsw:space": "cosine"}
)

In [57]:
# Store all job-description chunks (text, id, metadata, embedding) with a single
# batched add() call instead of one call per chunk. The guard keeps the cell a
# no-op when there are no chunks, as the original loop was.
if chunks_jobs:
    job_collections.add(
        documents=[job_chunk['content'] for job_chunk in chunks_jobs],
        ids=[job_chunk['id'] for job_chunk in chunks_jobs],
        metadatas=[job_chunk['metadata'] for job_chunk in chunks_jobs],
        embeddings=[job_chunk['embedding'] for job_chunk in chunks_jobs],
    )

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 389/389 [00:00<00:00, 551.45it/s]


In [58]:
cv_embeddings = get_embedding(text)

In [95]:
# Query the unsplit job collection with the embedding of the whole CV and keep
# ten results.
top_results = job_collections_unsplit.query(query_embeddings=cv_embeddings, n_results=10)

In [96]:
top_results

{'ids': [['id90',
   'id22',
   'id21',
   'id28',
   'id78',
   'id3',
   'id44',
   'id16',
   'id24',
   'id73']],
 'embeddings': None,
 'documents': [['<p>Wir sind ein dynamisches Software-Startup. Wir entwickeln und vertreiben Softwarel√∂sungen f√ºr mittelst√§ndische Unternehmen. Unser Hauptprodukt ist eine moderne und leistungsf√§hige Cloud-Software f√ºr das Dokumentenmanagement in Unternehmen.</p>\n<h2>Aufgaben</h2>\n<p>Wir suchen aktuell Werkstudent:innen, die gemeinsam mit uns unsere aktuellen Projekte zur Nutzung von LLMs (u. a. Llama 2, DeepSeek) in Form von KI-Services in unserer Software weiter vorantreiben. Wir arbeiten aktuell an mehreren KI-Einsatzszenarien und hosten die open-source LLMs selbst auf einer eigenen Infrastruktur.</p>\n<h2>Qualifikation</h2>\n<p>Vorkenntnisse in KI-Technologien sind nicht zwingend erforderlich, wir schulen unsere neuen Mitarbeiter selbst. Technisches Verst√§ndnis (Informatik, Wirtschaftsinformatik o. √§.) ist nat√ºrlich notwendig.</p>\n<h2

TODO: add filtering by job metadata (internship, WS, professional, etc.).

### Step 2: generate cover letter

Iterate over all top-k jobs and, for each one, extract the most relevant CV chunks.

In [63]:
cv_chunks = chunks

In [97]:
top_job_ids = top_results['ids'][0]

In [98]:
# Fetch the stored description text of each top job by id.
# NOTE(review): confirm whether get() returns documents in the same order as
# `ids`; if it does not, this list is not in ranking order.
top_job_descriptions = job_collections_unsplit.get(
    ids=top_job_ids
)['documents']

In [109]:
# Fetch the full stored records for the same job ids; the 'ids', 'metadatas'
# and 'documents' entries of the result are read when the job summaries are built.
top_job_matches = job_collections_unsplit.get(
    ids=top_job_ids
)

In [110]:
# One summary string per fetched job: company name, title, job type and the
# full description, pairing each metadata entry with its document.
job_to_apply_to = [
    f"Company name: {job_meta['company_name']}\nJob title: {job_meta['title']}\nJob type: {job_meta['job_types']}\nJob description: {job_description}"
    for job_meta, job_description in zip(top_job_matches['metadatas'], top_job_matches['documents'])
]

In [112]:
# Embed every top job description, then query the CV collection with all of
# those embeddings at once, asking for ten CV chunks per job description.
job_description_embeddings = [get_embedding(description) for description in top_job_descriptions]
top_cv_matches = cv_collection.query(
    query_embeddings=job_description_embeddings,
    n_results=10,
)

In [100]:
# Print each job description followed by the CV chunks retrieved for it.
for job_description, cv_matches in zip(top_job_descriptions, top_cv_matches['documents']):
    print('Job description:')
    print(job_description)
    print("Best CV matches:")
    print(cv_matches)

Job description:
<h2>Tasks</h2>
<ul>
<li><strong>Develop and maintain high-quality software solutions</strong> using modern technologies.</li>
<li>Work on <strong>innovative and large-scale projects</strong> that impact businesses globally.</li>
<li>Collaborate with <strong>cross-functional teams</strong> to design, build and deploy software applications.</li>
<li>Use programming languages such as <strong>Python, Java, C#, JavaScript, PHP or any other language</strong> to create <strong>scalable and high-performance applications</strong>.</li>
<li>Ensure <strong>code quality, security and efficiency</strong> through best practices.</li>
<li>Participate in <strong>Agile/Scrum methodologies</strong> to deliver projects efficiently.</li>
<li>Continuously <strong>learn and implement new technologies</strong> to stay ahead in the industry.</li>
</ul>
<h2>Requirements</h2>
<ul>
<li>You have experience in <strong>ANY programming language</strong> (Python, Java, C#, JavaScript, PHP, etc.).</li

### Step 3: call the model and generate the letter.

In [103]:
from huggingface_hub import InferenceClient
from openai import OpenAI

In [102]:
# Read the Hugging Face token from the HF_TOKEN environment variable so the real
# value never has to be pasted into the notebook; the original placeholder is
# kept as the fallback when the variable is unset.
hf_token = os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN")

In [119]:
# Task instruction for the model; it is passed as the first argument when the
# prompt template is formatted for each job.
instruction = '''You are a job application assistant. Your task is, given the job description and highlights from the candidate's CV, write a cover letter tailored to the job description to maximize the chances of the candidate to get an interview invitation.
'''

In [120]:
# Prompt template with four positional {} placeholders. The generation loop
# fills them, in order, with: the instruction, the job summary, the static
# candidate profile and the CV chunks retrieved for that job.
content = '''Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n

### Instruction:
  {}

  ### Job description:
  {}

  ### Candidate profile:
  {}
  
  ### Candidate CV highlights:
  {}
  
  ### Response:
  '''

In [121]:
job_to_cover_letter = dict()

In [None]:
# Use a dedicated name for the OpenAI-compatible client. The original rebound
# `client`, which replaced the Chroma client created earlier and broke any
# re-run of the cells that call client.create_collection(...).
llm_client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=hf_token,  # reuse the token variable defined earlier instead of a second pasted placeholder
)

# open(..., 'w') does not create missing directories, so make sure the target
# directory exists before the first write.
output_path = 'model_responses/cover_letters.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for i in tqdm(range(len(top_job_descriptions))):
    # Single user message: the prompt template filled with the instruction, the
    # summary of job i, the static profile and the CV chunks matched to job i.
    messages = [
        {
            "role": "user",
            "content": content.format(instruction, job_to_apply_to[i], static_profile_info, str(top_cv_matches['documents'][i]))
        }]

    completion = llm_client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=messages,
    )

    # Letters are keyed by the job summary string they were written for.
    job_to_cover_letter[job_to_apply_to[i]] = completion.choices[0].message.content
    # The file is rewritten after every letter so that finished letters are
    # already on disk if a later request fails.
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(job_to_cover_letter, file)

	

  0%|          | 0/10 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [01:09<00:00,  6.96s/it]


In [130]:
completion.choices[0].message.content

"Dear Hiring Manager,\n\nI am excited to apply for the [Job Title] position at CrossCommerce GmbH, as advertised in your company's careers page. With my strong background in Data Science and Analytics, I am confident that I can make a valuable contribution to your team.\n\nAs a recent graduate in Master's in Data Science, I developed skills in programming languages such as Python, R, and Java sind mijn strong pointen, together with a solid understanding of Machine Learning algorithms. In my current role as a Data Analyst at XYZ Corporation, I have been working with large datasets, building predictive models, and creating data visualizations to derive insights from the data.\n\nI am particularly drawn to this position at CrossCommerce GmbH because of the emphasis on using data-driven insights to drive business decisions. With my experience in data analysis, I believe I can bring a unique perspective to your company and help drive strategic decisions. My CV highlights several projects, i

In [None]:
# vector_collections.count()