In [1]:
import os
import pprint

import ollama
import pdfplumber
import requests
from bs4 import BeautifulSoup
from docx import Document
from IPython.display import Markdown, display

job_desc_url = 'https://fst-confluence.dot.app.corpintra.net/display/AGORA/Job+description'


In [2]:


def read_docx(cv_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        cv_path: Path handed to ``docx.Document``.

    Returns:
        The text of every paragraph joined with newlines, or ``None`` when
        the file cannot be opened or read as a .docx document.
    """
    try:
        doc = Document(cv_path)
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
    except Exception:
        # None means "not readable as .docx". Catching Exception rather than
        # using a bare except lets KeyboardInterrupt and SystemExit propagate,
        # so the cell can still be interrupted.
        return None

def read_doc(cv_path):
    """Return the text of a legacy .doc file by automating Microsoft Word.

    Args:
        cv_path: Path of the document to open.

    Returns:
        The document text as reported by Word (``Content.Text``), or ``None``
        when Word automation is unavailable or the document cannot be opened.
    """
    try:
        # Imported locally: the module was never imported at file level, so
        # every call used to fail with a NameError hidden by a bare except.
        # Keeping it here also lets the notebook load where pywin32 is absent.
        import win32com.client
    except ImportError:
        return None

    word = None
    doc = None
    try:
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False

        # Word runs as a separate process, so a relative path is not
        # guaranteed to resolve against this notebook's working directory.
        doc = word.Documents.Open(os.path.abspath(cv_path))

        return doc.Content.Text
    except Exception:
        return None
    finally:
        # Always release the document and the Word instance, even when
        # opening or reading failed; otherwise Word processes pile up.
        if doc is not None:
            try:
                doc.Close(False)
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass

def read_pdf(pdf_path):
    """Return the text of a PDF file, one page per line group.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The extracted text of all pages joined with newlines (an empty string
        for a PDF with no extractable text), or ``None`` when the file cannot
        be opened or parsed as a PDF.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # A page may yield None instead of a string when it has no text
            # (seen with some pdfplumber versions); treating that as "" keeps
            # one empty page from discarding the whole document. Joining with
            # a newline stops the last word of a page from fusing with the
            # first word of the next.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # None means "not readable as PDF".
        return None

def extract_text_from_cv(cv_path):
    """Extract the text of a CV stored as .pdf, .docx or .doc.

    The reader matching the file extension is tried first; the remaining
    readers are then tried in their usual order (PDF, .docx, .doc), so a file
    with a missing or misleading extension is still handled.

    Args:
        cv_path: Path of the CV file.

    Returns:
        The text returned by the first reader that does not return ``None``,
        or ``None`` when every reader fails.
    """
    readers = [read_pdf, read_docx, read_doc]
    by_extension = {".pdf": read_pdf, ".docx": read_docx, ".doc": read_doc}

    # Promote the reader for this extension so that, for example, a .docx is
    # not first pushed through the PDF parser and Word automation is only
    # launched up front for real .doc files.
    preferred = by_extension.get(os.path.splitext(cv_path)[1].lower())
    if preferred is not None:
        readers.remove(preferred)
        readers.insert(0, preferred)

    for reader in readers:
        cv_text = reader(cv_path)
        if cv_text is not None:
            return cv_text
    return None


def get_job_description(url, class_name='wiki-content', timeout=30, verify=False):
    """Download a page and return the text of the elements with a given CSS class.

    Args:
        url: Address of the page holding the job description.
        class_name: CSS class of the elements whose text is collected.
        timeout: Seconds to wait for the server before giving up; without it
            the request could block the notebook indefinitely.
        verify: Passed to ``requests.get``. SECURITY: the default ``False``
            keeps the original behaviour of skipping TLS certificate
            verification; pass ``True`` or a CA bundle path where possible.

    Returns:
        The stripped text of every matching element joined with newlines (an
        empty string when nothing matches). When the response status is not
        200, a string of the form
        ``"Error: Unable to fetch the page. Status code: <code>"`` is returned
        instead, so callers must check for it.

    Raises:
        requests.exceptions.RequestException: connection failures and
            timeouts are not caught here.
    """
    response = requests.get(url, verify=verify, timeout=timeout)

    if response.status_code != 200:
        return f"Error: Unable to fetch the page. Status code: {response.status_code}"

    soup = BeautifulSoup(response.content, 'html.parser')
    elements = soup.find_all(class_=class_name)
    return '\n'.join(element.get_text(separator='\n').strip() for element in elements)

In [3]:
# Roles the model may assign to a candidate.
roles = ["data engineer","full stack developer","machine learning engineer","data scientist"]
# Seniority labels and the experience range each one stands for; the whole
# dict is interpolated into the system prompt.
seniority_dict = {
    "senior" : "more than 4 years of experience in total",
    "mid-senior" : "between 2 and 4 years of experience in total",
    "junior" : "between 0 and 2 years of experience in total"
}

# Directory holding the CV files to classify.
directorio = './resources/cvs_landing'

job_desc_text = get_job_description(job_desc_url)
# get_job_description reports a failed download by returning an "Error: ..."
# string rather than raising. Stop here instead of silently classifying every
# CV against an error message or an empty description.
if job_desc_text.startswith("Error: Unable to fetch the page") or not job_desc_text.strip():
    raise RuntimeError(f"Job description not available: {job_desc_text!r}")

# The system prompt does not depend on the CV, so build it once.
system_prompt = (
    f"you are a curriculum vitae classifier, I want you to answer with ,maximum 40 words, focusing on role of the candidate, percentage of match, seniority, and briefly reasoning your answer. "
    f"the answer has to have the following format: role: (set it based on: {roles}), %match: (percentage match), seniority level: (value bewteen the following: {seniority_dict}) (put here years of experience), reason: [reasons]"
)

# Maps each CV file name to the model's answer, or to a fixed message when no
# text could be extracted.
responses_dict = {}

# os.listdir returns entries in arbitrary order; sorting makes the order of
# the results reproducible between runs.
for nombre_archivo in sorted(os.listdir(directorio)):
    ruta_archivo = os.path.join(directorio, nombre_archivo)
    if not os.path.isfile(ruta_archivo):
        continue

    cv_text = extract_text_from_cv(ruta_archivo)
    # An empty string (e.g. a PDF with no extractable text) is as useless as
    # None: sending it would make the model classify a blank CV.
    if not cv_text or not cv_text.strip():
        responses_dict[nombre_archivo] = "CV text not accessible"
        continue

    prompt = (
        f"I want you to classify the following curriculum content: {cv_text} into the role which matches the best among this roles: {roles}, and this job description: {job_desc_text}"
    )
    response = ollama.chat(
        model="llama3.2",
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
    )
    llm_response = response["message"]["content"]
    responses_dict[nombre_archivo] = llm_response

# Notes:
# - information placed at the beginning of the prompt is more relevant
# - add context: "you are an expert in reviewing CVs"
# - the CV cannot be downloaded automatically




CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


In [4]:


# Build one Markdown document: a title followed by a section per CV file,
# each headed by the file name and containing the stored answer.
markdown_str = "# Respuestas\n\n" + "".join(
    f"## {key}\n\n{value}\n\n" for key, value in responses_dict.items()
)

# Render the assembled Markdown in the notebook output.
display(Markdown(markdown_str))

# Respuestas

## alan susa.pdf

role: Data Engineer, % match: 78%, seniority level: mid-senior (2-4 years of experience), reason: 
Led migrations and implemented data pipelines; designed ETLs and worked with SQL, cloud technologies, and Spark; lacks senior-level experience but has mid-level skills and experience.

## david garcia.doc

CV text not accessible

## elizabeth smith.docx

role: [Data Engineer], % match: 80%, seniority level: junior/mid-senior (1-4 years), reason: The job description highlights expertise in SQL, ETL processes, cloud technologies, and collaboration with Data Scientists, matching the key responsibilities of a Data Engineer.

## john smith.pdf

role: Data Engineer, % match: 82%, seniority level: Senior, years of experience: more than 4 years, reason: High-level experience in scaling machine learning models, implementing ETL processes, and optimizing performance with Apache Spark and cloud technologies aligns closely with the job requirements.



In [7]:
# Position the candidates are ranked against.
job_position = "senior data engineer"

# Ask the model to reorder the per-CV answers by suitability for the position.
# The whole responses_dict is interpolated into the prompt as its Python repr.
ranking_prompt = f"ok now reorder me the following dictionary {responses_dict} from the most interesting candidate to the less for the job description: {job_position}, answer me just with the reordered dictionary ready to be printed with markdown"

response = ollama.chat(
    model="llama3.2",
    messages=[{"role": "user", "content": ranking_prompt}],
)

# Show the model's answer rendered as Markdown.
llm_response = response["message"]["content"]
display(Markdown(llm_response))

| File Name | Role and Description |
|-----------|-----------------------|
| john smith.pdf | role: Data Engineer, % match: 82%, seniority level: Senior, years of experience: more than 4 years, reason: High-level experience in scaling machine learning models, implementing ETL processes, and optimizing performance with Apache Spark and cloud technologies aligns closely with the job requirements. |
| alan susa.pdf | role: Data Engineer, % match: 78%, seniority level: mid-senior (2-4 years of experience), reason: Led migrations and implemented data pipelines; designed ETLs and worked with SQL, cloud technologies, and Spark; lacks senior-level experience but has mid-level skills and experience. |
| elizabeth smith.docx | role: [Data Engineer], % match: 80%, seniority level: junior/mid-senior (1-4 years), reason: The job description highlights expertise in SQL, ETL processes, cloud technologies, and collaboration with Data Scientists, matching the key responsibilities of a Data Engineer. |
| david garcia.doc | CV text not accessible |