In [None]:
# CONCLUSIONS
#
# the key point is to retrieve the senioprity from the CV (overall and specific)
# we could use another tool to extract it and send it to the LLM already gathered to improve its performance
# alternative: use the LLM to get every experience the candidate has and calculate and store the experience using code

In [1]:
import requests
from bs4 import BeautifulSoup
import ollama
from IPython.display import Markdown, display
import pdfplumber
from docx import Document
import pprint
import os

seniority = "senior"

job_desc_text = f"""
Position Description:

We are seeking a {seniority} Data Engineer to design, build, and maintain scalable data pipelines. The ideal candidate will have expertise in SQL, ETL processes, and cloud technologies, collaborating closely with Data Scientists to ensure data integration and quality.

Responsibilities:

Manage cloud data storage systems.
Collaborate with data scientists to meet data requirements.
Ensure data quality and security.
Automate data processes.
Monitor and troubleshoot data systems.
Optimize Big Data solutions.

Requirements:

Education: Degree in Computer Science or related field.
Proficiency in programming languages such as Python, Go, or Rust.
Strong SQL skills.
Experience with Hadoop and Kafka.
Familiarity with cloud platforms (IBM Cloud, Oracle Cloud).
Knowledge of data orchestration tools like Prefect or Luigi.
Experience in CI/CD tools (GitLab CI, CircleCI).

Desirable:

Experience with Snowflake.
Knowledge of visualization tools (Tableau, PowerBI).
Familiarity with Docker or Kubernetes.
Understanding of agile methodologies.
Cloud or big data certifications.
Multicultural experience.

"""


In [2]:


# Función para leer el contenido de un archivo .docx
def read_docx(cv_path):
    try:
        # Cargar el documento
        doc = Document(cv_path)
        
        # Leer el contenido del documento
        contenido = []
        for parrafo in doc.paragraphs:
            contenido.append(parrafo.text)
        
        return '\n'.join(contenido)
    except:
        return None

def read_doc(cv_path):
    try:
        # Inicializar la aplicación de Word
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False
        
        # Abrir el documento
        doc = word.Documents.Open(cv_path)
        
        # Leer el contenido del documento
        contenido = doc.Content.Text
        
        # Cerrar el documento y la aplicación de Word
        doc.Close(False)
        word.Quit()
        
        return contenido
    except:
        return None

def read_pdf(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text()
        return text
    except:
        return None

def extract_text_from_cv(cv_path):
    cv_text = read_pdf(cv_path)
    if cv_text is None:
        cv_text = read_docx(cv_path)
        if cv_text is None:
            cv_text =read_doc(cv_path)

    return cv_text


def get_job_description(url):
    class_name = 'wiki-content'
    # Hacer la solicitud HTTP
    response = requests.get(url, verify = False)
    
    # Verificar que la solicitud fue exitosa
    if response.status_code == 200:
        # Analizar el contenido HTML
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Encontrar todos los elementos con la clase especificada
        elements = soup.find_all(class_=class_name)
        
        # Extraer el texto de esos elementos
        text = '\n'.join([element.get_text(separator='\n').strip() for element in elements])
        return text
    else:
        return f"Error: Unable to fetch the page. Status code: {response.status_code}"

In [3]:
roles = ["data engineer","full stack developer","machine learning engineer","data scientist"]
seniority_dict = {
    "senior" : "more than 4 years of experience in total",
    "mid-senior" : "between 2 and 4 years of experience in total",
    "junior" : "between 0 and 2 years of experience in total"
}

# Especifica el directorio que quieres iterar
directorio = './resources/cvs_landing'

descriptions_dict = {}
# Itera sobre los archivos en el directorio
for nombre_archivo in os.listdir(directorio):
    ruta_archivo = os.path.join(directorio, nombre_archivo)
    if os.path.isfile(ruta_archivo):
        cv_text = extract_text_from_cv(ruta_archivo)
        if cv_text is not None:
            
            words = cv_text.split()    
            num_of_words = len(words)

            if num_of_words > 5:    
                descriptions_dict[nombre_archivo] = cv_text

for filename,desc in descriptions_dict.items():
    
        descriptions_dict[filename] = desc
        print(filename+"\n\n"+desc+"\n-------------------------------------------------------------------------------\n")
        


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


alan susa.pdf

ALAN SUSA
Data Engineer
alansusa@email.com (123) 456-7890 New York, NY
LinkedIn
WORK EXPERIENCE EDUCATION
Data Engineer B.A.
Computer Science
Consumer Reports
University of Pittsburgh
May 2018 - current New York, NY
Led the migration from Oracle to Redshift using Amazon Athena September 2010 - April 2014
and S3, resulting in an annual cost savings of $678,000 and an Pittsburgh, PA
increase in performance of 14%
Designed and implemented a real-time data pipeline to process
semi-structured data by integrating 150 million raw records SKILLS
from 30+ data sources using Kafka and PySpark
Designed the data pipeline architecture for a new product that Python
quickly scaled from 0 to 125,000 daily active users
ETLs
Studied and revamped data dictionaries to include a more
SQL (Postgres, Redshift, MySQL)
robust history for developing consistency across domain
NoSQL (MongoDB)
Spark, Kafka
Data Engineer
Airflow
Guardian Life Insurance Company AWS (Athena, Lambda, S3)
August 2016 - M

In [4]:

response = ollama.chat(
model="llama3.2",
messages=[
    {
        "role": "user",
        "content": f"""reorder me the following candidates dictionary {descriptions_dict} from the most interesting candidate to the less for the job description: {job_desc_text},
        pay attention to the seniority level rquired and the technologies that the candidates manage. the answer has to be ready to be printed in markdown and a summarized description of all the candidates.""",
    }
],
)
llm_response = response["message"]["content"]
display(Markdown(llm_response))

**Summary of Candidates**

### Senior-Level Candidates

1. **Elara Quinn**
	* Experience: 4+ years, Data Engineer Intern at HP Inc.
	* Technologies: Apache Hadoop, MySQL, Tableau, AWS
	* Relevant Projects: Built pipelines with Apache Hadoop, processed 4 terabytes of raw data for analytics projects
2. **Ianthe Marlow**
	* Experience: 5+ years, Data Engineer at Facebook
	* Technologies: Airflow, Presto, Apache Hive, Amazon S3
	* Relevant Projects: Automated ETL processes across billions of rows of data, reduced manual intervention by 6 hours per month
3. **John Smith**
	* Experience: 5+ years, Data Engineer at LionHeart Algorithm LLC
	* Technologies: SQL, Java, Hadoop, MongoDB, Python
	* Relevant Projects: Developed scalable databases capable of ETL processes using SQL and Spark

### Mid-Level Candidates

1. **Thaddeus Drake**
	* Experience: 3+ years, Junior Data Engineer at Illumina
	* Technologies: MySQL, Apache NiFi, AWS
	* Relevant Projects: Automated data ingestion workflows with Apache NiFi, processed over 2 TB of genomic data daily
2. **Brandon Connor**
	* Experience: 1+ year, Data Engineering Intern at Balyasny
	* Technologies: Python, SQL, Spark, Tableau
	* Relevant Projects: Built a cloud-first data ingestion pipeline that improved processing speed by 74%

### Emerging Candidates

1. **Alan Susa**
	* Experience: 2+ years, Data Engineer Intern at Consumer Reports
	* Technologies: PySpark, Kafka, Airflow, AWS
	* Relevant Projects: Designed and implemented a real-time data pipeline to process semi-structured data from 30+ sources
2. **Ianthe Marlow**
	* (Duplicate entry, already listed as Senior-Level Candidate)
3. **John Smith**
	* (Duplicate entry, already listed as Senior-Level Candidate)

Note: The seniority level and technologies required for the position are emphasized in the selection process. The candidates have varying levels of experience and expertise, but only a few meet the senior-level requirements.