In [1]:
# DISCLAIMER:
# 
# all CVs used here were found in: https://www.beamjobs.com/resumes/data-engineer-resume-examples

# CONCLUSIONS
#
# the key point when using LLAMA3 is retrieving the seniority from the CV (overall and specific)
# we could use another tool to extract the seniority and send it to the LLM already computed, to improve its performance
# alternative: use the LLM to list every experience the candidate has, then calculate and store the years of experience using code

In [2]:
import requests
from bs4 import BeautifulSoup
import ollama
from IPython.display import Markdown, display
import pdfplumber
from docx import Document
import pprint
import os
from openai import OpenAI
import httpx
from dotenv import load_dotenv

# Seniority level requested by the job posting; interpolated into job_desc_text below.
seniority = "senior"

# Full text of the job description. generate_prompt() embeds this string verbatim
# in the prompt sent to the LLMs, so any edit here changes what the models see.
job_desc_text = f"""
Position Description:

We are seeking a {seniority} Data Engineer to design, build, and maintain scalable data pipelines. The ideal candidate will have expertise in SQL, ETL processes, and cloud technologies, collaborating closely with Data Scientists to ensure data integration and quality.

Responsibilities:

Manage cloud data storage systems.
Collaborate with data scientists to meet data requirements.
Ensure data quality and security.
Automate data processes.
Monitor and troubleshoot data systems.
Optimize Big Data solutions.

Requirements:

Education: Degree in Computer Science or related field.
Proficiency in programming languages such as Python, Go, or Rust.
Strong SQL skills.
Experience with Hadoop and Kafka.
Familiarity with cloud platforms (IBM Cloud, Oracle Cloud).
Knowledge of data orchestration tools like Prefect or Luigi.
Experience in CI/CD tools (GitLab CI, CircleCI).

Desirable:

Experience with Snowflake.
Knowledge of visualization tools (Tableau, PowerBI).
Familiarity with Docker or Kubernetes.
Understanding of agile methodologies.
Cloud or big data certifications.
Multicultural experience.

"""


In [3]:


def read_docx(cv_path):
    """Read the text of a .docx file.

    Args:
        cv_path: Path to the document to open with ``Document``.

    Returns:
        The text of every paragraph joined with newlines, or ``None`` when the
        file cannot be opened or parsed as a .docx document.
    """
    try:
        # Load the document and collect the text of each paragraph.
        doc = Document(cv_path)
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
    except Exception:
        # Best effort: any parsing failure means "not a readable .docx".
        # Exception (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the notebook can still be interrupted.
        return None

def read_doc(cv_path):
    """Read the text of a legacy .doc file by automating Microsoft Word.

    Uses the ``Word.Application`` COM object through ``win32com.client``, so it
    only works where that module and Word are available.

    Args:
        cv_path: Path to the document. It is converted to an absolute path
            before being handed to Word.

    Returns:
        The document text, or ``None`` when ``win32com`` is not installed or
        Word fails to open/read the file.
    """
    try:
        # Imported here because the notebook's import cell does not import
        # win32com; without this the name lookup below always failed.
        import win32com.client
    except ImportError:
        return None

    try:
        # Start a hidden Word instance.
        word = win32com.client.Dispatch("Word.Application")
        try:
            word.Visible = False

            # Word resolves relative paths against its own working directory,
            # not the notebook's, so always pass an absolute path.
            doc = word.Documents.Open(os.path.abspath(cv_path))
            try:
                return doc.Content.Text
            finally:
                # Close without saving changes.
                doc.Close(False)
        finally:
            # Always shut Word down, even when opening or reading failed,
            # so no orphaned Word process is left behind.
            word.Quit()
    except Exception:
        # Best effort: any COM failure means "could not read as .doc".
        return None

def read_pdf(pdf_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The text of all pages joined with newlines (pages without extractable
        text contribute an empty string), or ``None`` when the file cannot be
        opened or parsed as a PDF.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` guards against extract_text() yielding None for a page
            # with no text (seen on some pdfplumber versions); previously that
            # raised inside the loop and discarded the whole document.
            # Joining with "\n" keeps the last word of one page from fusing
            # with the first word of the next.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Best effort: any failure means "not a readable PDF". Exception
        # (not a bare except) keeps KeyboardInterrupt/SystemExit propagating.
        return None

def extract_text_from_cv(cv_path):
    """Return the text of a CV, trying PDF, then .docx, then .doc parsing.

    Each reader signals failure by returning ``None``; the first reader that
    returns anything else wins. If all three fail the result is ``None``.
    """
    pdf_text = read_pdf(cv_path)
    if pdf_text is not None:
        return pdf_text

    docx_text = read_docx(cv_path)
    if docx_text is not None:
        return docx_text

    # Last resort: whatever the .doc reader yields, including None.
    return read_doc(cv_path)


def get_job_description(url, class_name='wiki-content', timeout=30, verify=False):
    """Download a page and return the text of the elements with a given CSS class.

    Args:
        url: Address of the page to fetch.
        class_name: CSS class of the elements whose text is extracted.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the request could block the notebook forever.
        verify: Passed to ``requests.get``. The default ``False`` preserves the
            previous behaviour.
            NOTE(review): ``False`` disables TLS certificate verification;
            pass ``True`` or a CA bundle path unless the host is trusted.

    Returns:
        The stripped text of every matching element joined with newlines, or
        an ``"Error: ..."`` string when the response status is not 200.
    """
    # Perform the HTTP request.
    response = requests.get(url, verify=verify, timeout=timeout)

    # Anything other than 200 is reported through the returned string.
    if response.status_code != 200:
        return f"Error: Unable to fetch the page. Status code: {response.status_code}"

    # Parse the HTML and collect every element carrying the requested class.
    soup = BeautifulSoup(response.content, 'html.parser')
    elements = soup.find_all(class_=class_name)

    # Extract the text of those elements.
    return '\n'.join(element.get_text(separator='\n').strip() for element in elements)

In [4]:
# Candidate roles and the experience range that defines each seniority level.
roles = ["data engineer","full stack developer","machine learning engineer","data scientist"]
seniority_dict = {
    "senior" : "more than 4 years of experience in total",
    "mid-senior" : "between 2 and 4 years of experience in total",
    "junior" : "between 0 and 2 years of experience in total"
}

# Folder holding the CV files to evaluate.
directorio = './resources/cvs_landing'

# An extraction with this many words or fewer is treated as empty/failed and skipped.
MIN_CV_WORDS = 5

# Maps each CV file name to the text extracted from it.
descriptions_dict = {}

# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# CVs in the prompt (and therefore the LLM input) reproducible between runs.
for nombre_archivo in sorted(os.listdir(directorio)):
    ruta_archivo = os.path.join(directorio, nombre_archivo)
    if not os.path.isfile(ruta_archivo):
        continue

    cv_text = extract_text_from_cv(ruta_archivo)
    if cv_text is None:
        # None of the readers could parse this file.
        continue

    num_of_words = len(cv_text.split())
    if num_of_words > MIN_CV_WORDS:
        descriptions_dict[nombre_archivo] = cv_text
        


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


In [5]:


def generate_prompt(descriptions=None, job_description=None):
    """Build the candidate-ranking prompt sent to the LLMs.

    Args:
        descriptions: Mapping of CV file name to CV text. Defaults to the
            notebook-level ``descriptions_dict``.
        job_description: Job description text. Defaults to the notebook-level
            ``job_desc_text``.

    Returns:
        A single prompt string: an opening instruction, one labelled section
        per CV, and the ranking instructions with the job description.
    """
    if descriptions is None:
        descriptions = descriptions_dict
    if job_description is None:
        job_description = job_desc_text

    opening = "answer me with a list of candidates based on this dictionary: \n\n"

    # Label every CV with its file name so the model can tell where one
    # candidate ends and the next begins; previously the CV texts were
    # concatenated with nothing separating them.
    cv_sections = "".join(
        f"=== CV: {filename} ===\n{cv_text}\n\n"
        for filename, cv_text in descriptions.items()
    )

    instructions = f"""from the first interesting candidate we should interview to the less for the job description: {job_description},
            pay attention to the candidate's seniority level and the required for the job, also the technologies that the candidates manage. the answer has to be ready to be printed in markdown and a summarized description of all the candidates."""

    return opening + cv_sections + instructions
    


In [6]:
# LLAMA3 TEST

# Send the ranking prompt as a single user message to the llama3.2 model via
# ollama, then render the reply as markdown.
response = ollama.chat(
    model="llama3.2",
    messages=[{"role": "user", "content": generate_prompt()}],
)
llm_response = response["message"]["content"]
display(Markdown(llm_response))

**Candidate Summary**

We've shortlisted 5 candidates for the Senior Data Engineer position based on their experience, skills, and education.

### 1. Marlowe (Facebook)
* Seniority Level: Experienced
* Technologies Managed: Airflow, Presto, Apache Hive, AWS, MySQL
* Education: Bachelor of Science in Computer Science from Carnegie Mellon University

### 2. Elara Quinn (HP Inc.)
* Seniority Level: Entry-Level
* Technologies Managed: Apache Hadoop, Tableau, MySQL, Apache NiFi
* Education: Bachelor of Science in Computer Science from the University of Texas

### 3. John Smith (LionHeart Algorithm LLC)
* Seniority Level: Experienced
* Technologies Managed: Natural Language Processing, SQL, Java, Apache Spark, MongoDB
* Education: Bachelor of Science in Computer Science from Texas University, Austin

### 4. Thaddeus (Illumina)
* Seniority Level: Entry-Level
* Technologies Managed: MySQL, Apache NiFi, AWS, Snowflake, Hadoop
* Education: Bachelor of Science in Computer Science from the University of California

### 5. Brandon Connor (Simplex)
* Seniority Level: Junior
* Technologies Managed: Python, SQL, Kafka, Spark
* Education: B.S. in Computer Science from the University of Texas

In [8]:
# GPT 4-o-mini TEST

model_name = "gpt-4o-mini"

# load_dotenv() loads variables from a .env file into the process environment,
# where the OpenAI client looks up OPENAI_API_KEY. Copying the value back into
# os.environ was redundant and raised an opaque TypeError when the key was
# missing, so fail early with an explicit message instead.
load_dotenv()
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError("OPENAI_API_KEY is not set; define it in the environment or in a .env file")

# NOTE(review): verify=False disables TLS certificate verification for the
# requests that carry the API key. Kept to preserve current behaviour
# (presumably needed behind an intercepting proxy, confirm); prefer pointing
# `verify` at the proxy's CA bundle.
openai = OpenAI(http_client=httpx.Client(verify=False))

response = openai.chat.completions.create(
    model=model_name,
    messages=[
        {"role": "system", "content": "you are a candidates classifier, I'm going to provide you some candidates description and you are going to order them from most interesting to interview to the low."},
        {"role": "user", "content": generate_prompt()}
    ]
)

# Render the first completion choice as markdown.
gpt_answer = response.choices[0].message.content
display(Markdown(gpt_answer))


# Candidates Ranked for Senior Data Engineer Position

## 1. **ALAN SUSA**
- **Experience:** 12+ years in data engineering with robust experience in scaling data pipelines.
- **Technologies:** SQL (Postgres, Redshift, MySQL), Python, Kafka, PySpark, AWS, Airflow.
- **Summary:** Led significant projects that resulted in substantial cost savings and performance increases. Has a strong history in designing and implementing scalable data pipelines and automating ETL processes. A proven track record of optimizing system performance.

## 2. **JOHN SMITH**
- **Experience:** 8+ years in data engineering, with solid experience in ETL processes and machine learning model adaptation.
- **Technologies:** SQL, Python, Apache Spark, NoSQL (MongoDB).
- **Summary:** Extensive experience in developing scalable databases and implementing data quality solutions. Has a hands-on approach with cloud services and has successfully optimized revenue through efficient data processes.

## 3. **IANTHE MARLOWE**
- **Experience:** 4+ years in data engineering, including advanced skills in data pipeline management and cloud integration.
- **Technologies:** SQL, Apache Airflow, Apache Hive, Amazon S3, Apache Kafka, Presto.
- **Summary:** Focused on automating workflows and optimizing data processes with a strong understanding of cloud services. Demonstrated ability to improve query performance and data storage management effectively.

## 4. **THADDEUS DRAKE**
- **Experience:** 4 years of combined experience as a Junior Data Engineer and intern roles.
- **Technologies:** MySQL, Apache NiFi, AWS, Apache Airflow.
- **Summary:** Proven track record of managing large datasets and automating data workflows. Successfully reduced processing times and increased data retrieval speed, aligning with the job's requirements.

## 5. **BRANDON CONNOR**
- **Experience:** 3+ years as an intern in data engineering, focusing mostly on developing data ingestion and pipeline processes.
- **Technologies:** Python, SQL, Spark, AWS, Google Analytics API.
- **Summary:** Practical experience in collaborative projects with notable success metrics. While still in a junior role, his project accomplishments indicate potential for growth in a senior position.

## 6. **ELARA QUINN**
- **Experience:** 2+ years as an intern with project work in data engineering.
- **Technologies:** MySQL, Apache NiFi, Amazon Redshift, Tableau.
- **Summary:** Strong foundational knowledge and practical skills through internships. Nevertheless, lacks extensive experience compared to seniority requirements.

---

Candidates were evaluated based on their experience, technologies managed, and alignment with job responsibilities and requirements. Alan Susa stands out with the most experience and relevant skills, followed closely by John Smith.