In [1]:
import requests
from bs4 import BeautifulSoup
import ollama
from IPython.display import Markdown, display
import pdfplumber
from docx import Document
import pprint
import os
from openai import OpenAI
import httpx
from dotenv import load_dotenv
import json

In [2]:

def call_llama3_ollama(prompt: str, model: str, system_prompt: str) -> str:
    """Send one system + user exchange to an Ollama model and return its reply.

    Args:
        prompt: Text sent as the ``user`` message.
        model: Model name handed unchanged to ``ollama.chat``.
        system_prompt: Text sent as the ``system`` message.

    Returns:
        The ``content`` field of the ``message`` entry in the chat response.
    """
    # Imported at call time, as in the original implementation.
    import ollama

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    reply = ollama.chat(model=model, messages=conversation)
    return reply["message"]["content"]

def call_chatgpt_openai(prompt: str, model_name: str, system_prompt: str, verify_tls: bool = False) -> str:
    """Send one system + user exchange to the OpenAI chat API and return the reply.

    Args:
        prompt: Text sent as the ``user`` message.
        model_name: Model identifier passed to ``chat.completions.create``.
        system_prompt: Text sent as the ``system`` message.
        verify_tls: Whether the HTTP client verifies TLS certificates.
            Defaults to ``False`` only to preserve this notebook's existing
            behaviour; pass ``True`` whenever the network allows it.

    Returns:
        The ``content`` of the first choice in the completion response.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not available after loading
            the ``.env`` file (previously this surfaced as an opaque
            ``TypeError`` from assigning ``None`` into ``os.environ``).
    """
    load_dotenv()
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
        )

    # NOTE(review): with verify_tls=False the API key travels over a connection
    # whose certificate is not checked. Presumably a workaround for an
    # intercepting proxy -- confirm, and prefer verify_tls=True.
    # The context manager closes the connection pool once the call is done.
    with httpx.Client(verify=verify_tls) as http_client:
        client = OpenAI(api_key=api_key, http_client=http_client)
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
        )

    return response.choices[0].message.content

In [3]:


def read_docx(cv_path):
    """Read the text content of a .docx file.

    Args:
        cv_path: Path handed to ``Document``.

    Returns:
        The text of every paragraph joined with newlines, or ``None`` when the
        file cannot be loaded or read as a .docx document. Callers use the
        ``None`` result to fall back to another reader.
    """
    try:
        doc = Document(cv_path)
        return '\n'.join(parrafo.text for parrafo in doc.paragraphs)
    except Exception:
        # Best-effort reader: any parsing failure means "not a readable .docx".
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so the notebook can still be interrupted.
        return None

def read_doc(cv_path):
    """Read the text of a legacy .doc file by automating Microsoft Word over COM.

    Args:
        cv_path: Path to the document; it is converted to an absolute path
            before being handed to Word.

    Returns:
        The document text (``Content.Text``), or ``None`` when ``win32com`` is
        not importable or Word fails to open or read the file.
    """
    try:
        # Imported here because the file never imports it at the top; the
        # original code hit a NameError that the bare ``except`` hid, so this
        # reader always returned None.
        import win32com.client
    except ImportError:
        return None

    word = None
    doc = None
    try:
        # Start Word without showing its window.
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False

        # NOTE(review): Word is expected to resolve relative paths against its
        # own working directory rather than the notebook's -- hence abspath.
        doc = word.Documents.Open(os.path.abspath(cv_path))
        return doc.Content.Text
    except Exception:
        # Best-effort reader: any COM failure means "could not read as .doc".
        return None
    finally:
        # Always release the document and the Word process, even on failure.
        # Cleanup errors are ignored on purpose: there is nothing useful to do
        # with them and they must not mask the result.
        if doc is not None:
            try:
                doc.Close(False)
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass

def read_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The page texts joined with newlines, or ``None`` when the file cannot
        be opened or parsed as a PDF. Callers use the ``None`` result to fall
        back to another reader.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # ``extract_text()`` may yield None for a page with no extractable
            # text (version-dependent); treat that as an empty page instead of
            # letting one such page discard the whole document. Joining with a
            # newline keeps words at page boundaries from being fused together.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Best-effort reader; KeyboardInterrupt/SystemExit still propagate.
        return None

def extract_text_from_cv(cv_path):
    """Return the text of a CV, trying each reader in turn.

    The file is tried as a PDF first, then as .docx, then as .doc. The first
    reader that returns something other than ``None`` wins.

    Args:
        cv_path: Path of the CV file, passed unchanged to each reader.

    Returns:
        The extracted text, or ``None`` if every reader returned ``None``.
    """
    extracted = read_pdf(cv_path)
    if extracted is not None:
        return extracted

    extracted = read_docx(cv_path)
    if extracted is not None:
        return extracted

    # Last resort: whatever the .doc reader produces, including None.
    return read_doc(cv_path)


def get_job_description(url, class_name='wiki-content', timeout=30, verify=False):
    """Download a page and return the text of the elements with a given CSS class.

    Args:
        url: Address of the page to fetch.
        class_name: CSS class of the elements whose text is extracted.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout the request could hang forever.
        verify: Whether TLS certificates are verified. Defaults to ``False``
            only to preserve the existing behaviour; pass ``True`` whenever
            the target host has a valid certificate.

    Returns:
        The stripped text of every matching element, joined with newlines
        (an empty string when nothing matches). For any status code other
        than 200 an ``"Error: ..."`` message is returned instead -- callers
        must check for it before treating the result as a job description.
    """
    # NOTE(review): verify=False disables certificate checking; presumably a
    # workaround for an internal host with a self-signed certificate -- confirm.
    response = requests.get(url, verify=verify, timeout=timeout)

    if response.status_code != 200:
        return f"Error: Unable to fetch the page. Status code: {response.status_code}"

    soup = BeautifulSoup(response.content, 'html.parser')
    elements = soup.find_all(class_=class_name)
    return '\n'.join(element.get_text(separator='\n').strip() for element in elements)

def _parse_llm_json(raw_response):
    """Decode the first JSON object found in a model reply, ignoring surrounding text."""
    text = raw_response.strip()
    start = text.find("{")
    if start == -1:
        # No object at all: let json.loads raise the usual JSONDecodeError.
        return json.loads(text)
    # raw_decode parses one value starting at `start` and ignores what follows,
    # so markdown fences (```json ... ```) or stray commentary around the
    # object no longer break parsing.
    parsed, _ = json.JSONDecoder().raw_decode(text, start)
    return parsed


def evaluate_candidate(model_source, candidate_desc, job_description):
    """Ask an LLM how well a CV matches a job description.

    Args:
        model_source: Model name. A name containing "llama" is sent through
            ``call_llama3_ollama``; one containing "gpt" through
            ``call_chatgpt_openai`` (case-insensitive, checked in that order).
        candidate_desc: Text of the candidate's CV.
        job_description: Text of the job description.

    Returns:
        The JSON value decoded from the model reply. The prompt asks for an
        object with ``name``, ``match_percentage`` and ``summary`` keys, but
        the reply is not validated against that shape.

    Raises:
        ValueError: If ``model_source`` contains neither "llama" nor "gpt".
        json.JSONDecodeError: If no JSON value can be decoded from the reply.
    """
    system_prompt = "you are a CV reviewer."

    prompt = f"""
    Review the following CV description:

    {candidate_desc}
    
    Evaluate how well this candidate matches the following job description:
    
    {job_description}
    
    You MUST pay attention mostly to the ROLE and the SENIORITY match between what we are looking for and the candidate's seniority, and answer ONLY with a JSON object in the following structure:
    
    {{
      "name": "Candidate Name",
      "match_percentage": number between 0 and 100,
      "summary": "A  summary explaining the match, including relevant skills, technologies, and gaps"
    }}
    
    ❗️Output STRICTLY as valid JSON. Do NOT include any explanations, extra text, markdown formatting, or comments.
    
    Example:
    
    {{
      "name": "John Marston",
      "match_percentage": 78,
      "summary": "ihe has strong knowledge on AWS, he has developed using Spark, Python and knows a few database systems."
    }}
    """

    source = model_source.lower()
    if "llama" in source:
        print(f"▶️ Using {model_source} via Ollama...")
        full_response = call_llama3_ollama(prompt, model_source, system_prompt)
    elif "gpt" in source:
        print(f"▶️ Using {model_source} via OpenAI API...")
        full_response = call_chatgpt_openai(prompt, model_source, system_prompt)
    else:
        raise ValueError("Invalid model_source. Use 'llama3' or 'chatgpt'.")

    # Models often wrap the object in a ```json fence or add a short preamble
    # despite the instructions; the helper tolerates both.
    return _parse_llm_json(full_response)


In [4]:
# Job description that every CV is evaluated against. It is pasted inline here;
# to load it from a web page instead, replace the literal with:
#   job_description = get_job_description("url")

job_description = """

Job Title: Junior Data Engineer
Location: Remote
Job Type: Full-time
Team: Data Engineering
Reports to: Senior Data Engineer

About the Role:

We are looking for a Junior Data Engineer to join our growing data team. You will support the design, development, and maintenance of data pipelines and infrastructure that power data analytics, reporting, and data-driven decision-making across the company.
This is a great opportunity for someone early in their career to learn and grow while working on real-world data challenges in a collaborative and agile environment.

Key Responsibilities
- Assist in building, maintaining, and optimizing ETL/ELT pipelines.
- Work closely with data analysts, scientists, and engineers to support data ingestion and transformation workflows.
- Help ensure the reliability and quality of data across systems.
- Monitor and troubleshoot data pipelines and processes.
- Contribute to documentation of data flows, models, and architecture.
- Learn and apply best practices in data engineering, including security and scalability.

Requirements:

- Basic knowledge of SQL and at least one programming language (Python preferred).
- Familiarity with data storage systems (e.g., relational databases, cloud storage).
- Understanding of data processing concepts and tools.
- Willingness to learn and grow in a fast-paced environment.
- Good communication and problem-solving skills.
- Bachelor’s degree in Computer Science, Engineering, Mathematics, or related field (or equivalent practical experience).

Nice to Have:

- Exposure to cloud platforms like AWS, GCP, or Azure.
- Experience with version control (e.g., Git).
- Basic knowledge of data modeling concepts.
- Familiarity with tools like Airflow, dbt, or Spark.

"""



In [5]:
# Folder containing the CV files to evaluate.
directorio = './landing'

# Model used for every evaluation: "llama3.2" or "gpt-4o-mini".
# Set once here instead of being re-assigned on every loop iteration.
model = "gpt-4o-mini"

# Extractions with this many words or fewer are treated as unusable and skipped.
MIN_CV_WORDS = 5

descriptions_dict = {}   # file name -> extracted CV text
# NOTE(review): evaluation_dict is declared but never filled in this cell;
# kept in case a later cell relies on the name -- confirm and remove if unused.
evaluation_dict = {}
matches = []             # one decoded LLM answer per evaluated CV

# Step 1: extract text from every regular file in the folder.
# sorted() makes the processing order deterministic; os.listdir order is arbitrary.
for nombre_archivo in sorted(os.listdir(directorio)):
    ruta_archivo = os.path.join(directorio, nombre_archivo)
    if not os.path.isfile(ruta_archivo):
        continue

    cv_text = extract_text_from_cv(ruta_archivo)
    if cv_text is not None and len(cv_text.split()) > MIN_CV_WORDS:
        descriptions_dict[nombre_archivo] = cv_text

# Step 2: ask the model to score each CV against the job description.
for filename, candidate_desc in descriptions_dict.items():
    try:
        llm_answer = evaluate_candidate(model, candidate_desc, job_description)
    except json.JSONDecodeError as exc:
        # One malformed model reply should not discard the rest of the batch.
        print(f"Skipping {filename}: model reply was not valid JSON ({exc})")
        continue
    matches.append(llm_answer)


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


▶️ Using gpt-4o-mini via OpenAI API...
▶️ Using gpt-4o-mini via OpenAI API...
▶️ Using gpt-4o-mini via OpenAI API...
▶️ Using gpt-4o-mini via OpenAI API...


In [6]:
# Rank the candidates from best to worst match; sorted() is stable, so ties
# keep the order in which they were evaluated.
sorted_matches = sorted(
    matches,
    key=lambda candidate: candidate['match_percentage'],
    reverse=True,
)

# Print one line per candidate, numbered from 1.
for n, match in enumerate(sorted_matches, start=1):
    print(f"candidate nº{n}: {match['name']} - {match['match_percentage']}% → {match['summary']} \n")


# Thaddeus - 85% → Matches most of the key responsibilities, skills, and technologies required by the job description
# Alan Susa - 80% → Matches well for requirements, skills, and technologies with strong emphasis on ETL/ELT pipelines and data processing concepts. Some junior-level experience compared to the job description
# Elara Quinn - 80% → Matches for most of the job requirements, including ETL/ELT pipelines, data ingestion, and cloud platforms like AWS. Some experience with Tableau is mentioned but not directly related to the key responsibilities
# John Smith - 60% → Matches for basic requirements and skills with good exposure to SQL, Python, Hadoop, and Apache Spark. Lacks experience in cloud platforms, version control, and some tools mentioned in the job description

candidate nº1: Alan Susa - 85% → Alan has substantial experience as a Data Engineer, demonstrating strong skills in Python, SQL, and various data storage systems, including AWS services. He has built and maintained ETL pipelines and has familiarity with tools like Spark and Kafka. While his experience exceeds the junior level, he shows high potential for growth in a collaborative environment. 

candidate nº2: Elara Quinn - 85% → Elara has relevant experience as a Data Engineer Intern, having built data pipelines and working with SQL, Apache Hadoop, and Tableau. She demonstrates familiarity with cloud platforms and data ingestion processes, aligning well with the requirements. However, her profile lacks explicit mention of a programming language like Python and basic knowledge of data modeling concepts, which are desired for this role. 

candidate nº3: Thaddeus - 85% → Thaddeus has solid experience as a Junior Data Engineer, with proven skills in MySQL, Python, and various data pipeline