In [1]:
# CONCLUSIONS
#
# the key point is to retrieve the seniority from the CV (overall and specific)
# we could use another tool to extract it and pass the already-extracted value to the LLM to improve its performance
# alternative: use the LLM to list every experience the candidate has, then calculate and store the total experience in code

In [2]:
import requests
from bs4 import BeautifulSoup
import ollama
from IPython.display import Markdown, display
import pdfplumber
from docx import Document
import pprint
import os

# Seniority label interpolated into the job description below.
# NOTE(review): presumably meant to be one of the keys of `seniority_dict`
# ("senior", "mid-senior", "junior") defined in a later cell — confirm.
seniority = "senior"

# Job description used as the ranking target in the LLM prompt. This is an
# f-string: `{seniority}` is substituted once, when this cell runs, so the cell
# must be re-run after changing `seniority`.
job_desc_text = f"""
Position Description:

We are seeking a {seniority} Data Engineer to design, build, and maintain scalable data pipelines. The ideal candidate will have expertise in SQL, ETL processes, and cloud technologies, collaborating closely with Data Scientists to ensure data integration and quality.

Responsibilities:

Manage cloud data storage systems.
Collaborate with data scientists to meet data requirements.
Ensure data quality and security.
Automate data processes.
Monitor and troubleshoot data systems.
Optimize Big Data solutions.

Requirements:

Education: Degree in Computer Science or related field.
Proficiency in programming languages such as Python, Go, or Rust.
Strong SQL skills.
Experience with Hadoop and Kafka.
Familiarity with cloud platforms (IBM Cloud, Oracle Cloud).
Knowledge of data orchestration tools like Prefect or Luigi.
Experience in CI/CD tools (GitLab CI, CircleCI).

Desirable:

Experience with Snowflake.
Knowledge of visualization tools (Tableau, PowerBI).
Familiarity with Docker or Kubernetes.
Understanding of agile methodologies.
Cloud or big data certifications.
Multicultural experience.

"""


In [3]:


def read_docx(cv_path):
    """Read the text of a .docx file.

    Args:
        cv_path: Path to the candidate's CV file.

    Returns:
        The text of every paragraph in ``doc.paragraphs`` joined with
        newlines, or ``None`` if the file could not be opened or parsed as a
        .docx document.

    NOTE(review): text placed inside tables is presumably not exposed through
    ``doc.paragraphs`` — confirm against python-docx if CVs use table layouts.
    """
    try:
        doc = Document(cv_path)
        return '\n'.join(parrafo.text for parrafo in doc.paragraphs)
    except Exception:
        # Best-effort reader: any failure means "not a readable .docx", and the
        # caller falls back to the next format. Catching Exception (not a bare
        # except) keeps Ctrl-C / kernel interrupts working during batch loads.
        return None

def read_doc(cv_path):
    """Read the text of a legacy .doc file by automating Microsoft Word.

    Args:
        cv_path: Path to the candidate's CV file.

    Returns:
        The document text (``doc.Content.Text``), or ``None`` when Word
        automation is unavailable or the file cannot be opened.
    """
    try:
        # Imported here because the notebook's import cell never brings
        # win32com into scope; without this the function always failed with a
        # swallowed NameError. pywin32 is only installable on Windows, so a
        # missing module simply means this reader is unavailable.
        import win32com.client
    except ImportError:
        return None

    word = None
    try:
        # Start a hidden Word instance.
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False

        # Word resolves relative paths against its own working directory, not
        # the notebook's, so hand it an absolute path.
        doc = word.Documents.Open(os.path.abspath(cv_path))
        try:
            return doc.Content.Text
        finally:
            # Close without saving, even if reading the text failed.
            doc.Close(False)
    except Exception:
        # Best-effort reader: any failure means "not a readable .doc".
        return None
    finally:
        # Always shut Word down so failed opens do not leave orphaned
        # WINWORD processes behind.
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass

def read_pdf(pdf_path):
    """Read the text of a PDF file with pdfplumber.

    Args:
        pdf_path: Path to the candidate's CV file.

    Returns:
        The extracted text of all pages, one page per newline-separated
        chunk, or ``None`` if the file could not be opened or parsed as a PDF.
        Pages without extractable text contribute an empty string.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # `or ""`: depending on the pdfplumber version, extract_text() can
            # return None for a page with no text layer; concatenating that
            # raised TypeError and threw away the whole document.
            # Joining with "\n" keeps the last word of one page from being
            # glued to the first word of the next.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Best-effort reader: any failure means "not a readable PDF", and the
        # caller falls back to the next format.
        return None

def extract_text_from_cv(cv_path):
    """Extract the text of a CV, trying each supported format in turn.

    The readers are tried in the order PDF, .docx, .doc, regardless of the
    file extension; the first one that returns something other than ``None``
    wins. Returns ``None`` when every reader fails.
    """
    for reader in (read_pdf, read_docx, read_doc):
        extracted = reader(cv_path)
        if extracted is not None:
            return extracted
    return None


def get_job_description(url, class_name='wiki-content', timeout=30, verify=False):
    """Download a page and return the text of its elements with a given CSS class.

    Args:
        url: Address of the page holding the job description.
        class_name: CSS class of the elements whose text is extracted.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.
        verify: Passed to ``requests.get``. The default ``False`` preserves the
            original behaviour of skipping TLS certificate verification.
            SECURITY: this allows man-in-the-middle tampering; pass ``True``
            (or a CA bundle path) whenever the host has a valid certificate.

    Returns:
        The text of all matching elements joined with newlines (an empty
        string when nothing matches), or an ``"Error: ..."`` message string
        when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection failures or when
            the timeout expires.
    """
    # Perform the HTTP request.
    response = requests.get(url, verify=verify, timeout=timeout)

    # Callers receive the failure as a string rather than an exception.
    if response.status_code != 200:
        return f"Error: Unable to fetch the page. Status code: {response.status_code}"

    # Parse the HTML and collect the text of every element with the class.
    soup = BeautifulSoup(response.content, 'html.parser')
    elements = soup.find_all(class_=class_name)
    return '\n'.join(element.get_text(separator='\n').strip() for element in elements)

In [4]:
# Role names the screening could target. Not referenced by any other cell
# visible in this notebook.
roles = ["data engineer","full stack developer","machine learning engineer","data scientist"]
# Maps a seniority label to a plain-English description of the expected total
# experience. Not referenced by any other cell visible in this notebook.
# NOTE(review): the ranges share a boundary — exactly 2 years matches both
# "junior" and "mid-senior" — confirm the intended cut-offs.
seniority_dict = {
    "senior" : "more than 4 years of experience in total",
    "mid-senior" : "between 2 and 4 years of experience in total",
    "junior" : "between 0 and 2 years of experience in total"
}

# Folder scanned for candidate CVs (relative to the notebook's working directory).
directorio = './resources/cvs_landing'

# Extractions with this many words or fewer are treated as failed/empty
# (e.g. a scanned PDF with no text layer) and are skipped.
MIN_CV_WORDS = 5

# Maps CV file name -> extracted plain text; read later by generate_prompt().
descriptions_dict = {}

# sorted(): os.listdir() returns entries in arbitrary order, which made the
# order of CVs in the prompt (and so the LLM answer) vary between runs.
for nombre_archivo in sorted(os.listdir(directorio)):
    ruta_archivo = os.path.join(directorio, nombre_archivo)
    if not os.path.isfile(ruta_archivo):
        continue  # skip sub-directories

    cv_text = extract_text_from_cv(ruta_archivo)
    if cv_text is not None and len(cv_text.split()) > MIN_CV_WORDS:
        descriptions_dict[nombre_archivo] = cv_text
        


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


In [5]:


def generate_prompt(descriptions=None, job_description=None):
    """Build the LLM prompt asking for a ranked list of candidates.

    Args:
        descriptions: Mapping of CV file name -> CV text. Defaults to the
            notebook-level ``descriptions_dict``.
        job_description: Job description text. Defaults to the notebook-level
            ``job_desc_text``.

    Returns:
        A single prompt string: an opening instruction, every CV text (in the
        mapping's iteration order, each followed by a blank line), then the
        job description and the ranking/formatting instructions.

    NOTE(review): only the CV texts are included — the file names are not, and
    nothing but a blank line separates one CV from the next — so the model has
    to infer where each candidate starts.
    """
    # Fall back to the notebook globals so existing zero-argument calls behave
    # exactly as before; explicit arguments make the function usable (and
    # testable) without that hidden state.
    if descriptions is None:
        descriptions = descriptions_dict
    if job_description is None:
        job_description = job_desc_text

    parts = ["answer me with a list of candidates based on this dictionary: \n\n"]
    parts.extend(cv_text + "\n\n" for cv_text in descriptions.values())
    # The prompt wording (including the indentation of its second line) is
    # kept byte-for-byte so the text sent to the model does not change.
    parts.append(
        f"from the first interesting candidate we should interview to the less for the job description: {job_description},\n"
        "            pay attention to the candidate's seniority level and the rquired for the job, also the technologies that the candidates manage. the answer has to be ready to be printed in markdown and a summarized description of all the candidates."
    )
    return "".join(parts)
    


In [6]:

# Send the ranking prompt to the local "llama3.2" model as a single user turn.
response = ollama.chat(
    model="llama3.2",
    messages=[{"role": "user", "content": generate_prompt()}],
)

# The reply text is markdown; render it as rich output instead of printing it.
llm_response = response["message"]["content"]
display(Markdown(llm_response))

### Candidate Evaluation Summary
#### Senior Data Engineer Position Requirements

| Requirement | Brandon Connor | Elara Quinn | Marlowe | John Smith | Thaddeus |
| --- | --- | --- | --- | --- | --- |
| Education | B.S. Computer Science | University of Texas - Bachelor of Science, Computer Science | Bachelor of Science in Computer Science | BS, Computer Science, Texas University, Austin | Bachelor of Science in Computer Science |
| Programming Language Proficiency | Python, SQL, Spark | MySQL, Apache NiFi, Amazon Redshift, Apache Hadoop, AWS, Tableau | PyTorch, Java, SQL, Hadoop | Python, SQL, Java, Hadoop, MongoDB | Python, SQL, Apache NiFi, AWS |
| Experience | Data Engineering Intern (2020-2022) | Data Engineer Intern (2024), Junior Data Engineer (2023-present) | Software Development Intern (2019-2020), Data Engineer (2023-present) | Data Engineer at FNB Nong Phai (2020-2021), ABSA (2015-2017) | Junior Data Engineer (2023-present) |
| Seniority Level | Entry-Level to Mid-Level | Mid-Level | Mid-Level to Senior | Senior | Senior |

#### Ranking from Most Suitable to Least Suitable

Based on the candidate's seniority level and experience with required technologies, the ranking is as follows:

1. John Smith
2. Marlowe
3. Elara Quinn
4. Brandon Connor
5. Thaddeus