<a href="https://colab.research.google.com/github/omarbecerrasierra/MLOpsChallenge/blob/main/Caso2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Instalación de dependencias necesarias
!pip install -U langchain-core
!pip install -U pymupdf
!pip install -U langchain
!pip install -U langchain-google-genai

Collecting langchain-core
  Downloading langchain_core-0.2.34-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.75 (from langchain-core)
  Downloading langsmith-0.1.101-py3-none-any.whl.metadata (13 kB)
Collecting packaging<25,>=23.2 (from langchain-core)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Downloading langchain_core-0.2.34-py3-none-any.whl (393 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.9/393.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading langsmith-0.1.101-py3-none-any.whl (148 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.9/148.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-24.1-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, langsmith, langchain-core
  Attempting uninstall: packag

In [None]:
import fitz  # PyMuPDF
from langchain import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI
from tqdm.notebook import tqdm
import os
import json
import re
from google.colab import userdata
# Copy the secret stored under the name 'API_KEY' in google.colab's userdata
# into the GOOGLE_API_KEY environment variable, so the key never appears in
# the notebook itself.
# NOTE(review): presumably langchain_google_genai reads GOOGLE_API_KEY from the
# environment - confirm against the library docs.
os.environ["GOOGLE_API_KEY"] = userdata.get('API_KEY')

In [None]:
# Normalize the raw text of a CV
def clean_text(text):
    """Return a lower-cased, single-line, whitespace-normalized copy of ``text``.

    The text is lower-cased first, then a fixed list of regex substitutions is
    applied in order: newlines become spaces, a detached accent or tilde mark
    followed by a space and a letter is replaced by the plain letter, the
    space before a percent sign is removed, and two stray characters are
    dropped. Finally every run of whitespace is collapsed to one space and the
    result is stripped.
    (These look like PDF-extraction artifacts - an assumption, not verified.)
    """
    # Ordered (pattern, replacement) pairs. Order matters: the newline rule
    # runs first, so an accent mark split from its letter by a line break is
    # still matched by the later rules.
    substitutions = (
        (r'\n', ' '),
        (r'´ o', 'o'),
        (r'´ ı', 'i'),
        (r'´ a', 'a'),
        (r'´ u', 'u'),
        (r'´ e', 'e'),
        (r'˜ n', 'n'),
        (r' %', '%'),
        (r'\x83', ''),
        ('´ i̇', 'i'),
        ('ï', ''),
    )

    cleaned = text.lower()
    for regex, substitute in substitutions:
        cleaned = re.sub(regex, substitute, cleaned)

    # Collapse any remaining whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', cleaned).strip()

# Extract the text of a PDF
def extract_text_from_pdf(file_path: str) -> str:
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text("text")`` output of each page joined with no separator;
        an empty string when the document has no pages.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises; the document is
        closed before the exception propagates.
    """
    # The context manager closes the document (and its file handle) even when
    # a page fails to parse; the original never closed it.
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text("text") for page in doc)

In [None]:
# Schema of the fields extracted from each CV. It is handed to
# JsonOutputParser(pydantic_object=Info), whose format instructions are
# injected into the prompt, so the `description` strings below are runtime
# text seen by the model.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - a docstring may end up in the generated schema and thereby change
# the prompt; confirm before converting.
class Info(BaseModel):
    # Candidate's full name; the description asks the model for lower case.
    full_name: str = Field(description="Full name of the candidate in lower case.")
    # A single contact string: either an email or a phone number.
    contact: str = Field(description="Contact information (email in format lower_case@email.com or phone number in format +(country code) phone number).")
    # Total professional experience, declared as an integer number of years.
    years_experience: int = Field(description="Total years of professional experience as a number")
    # Declared as a free-form str; the 'Yes'/'No' restriction is only requested
    # in the description text, not enforced by the type.
    ai_education: str = Field(description="Whether the candidate has education in artificial intelligence. (Options: 'Yes' or 'No')")

In [None]:
# Chat model used for the extraction.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# Prompt template definition. It has two placeholders: {cv_text} is supplied
# on every invocation, {format_instructions} is pre-filled below.
template = """
You will be provided with a block of text extracted from a CV.
Your task is to extract specific information and output it in a JSON format.
Only include the requested fields in the JSON object, and ensure there is no additional text.
Text from CV: {cv_text}
Output Format: {format_instructions}
"""
# JSON output parser built from the Info schema; it also produces the format
# instructions inserted into the prompt.
parser = JsonOutputParser(pydantic_object=Info)

prompt = PromptTemplate(
    template=template,
    input_variables=["cv_text"],
    # Filled once here, so callers only have to pass "cv_text".
    partial_variables={"format_instructions": parser.get_format_instructions()}
)
# Compose the pipeline: prompt -> chat model -> JSON parser.
chain = prompt | llm | parser

In [None]:
# Process a folder of PDF CVs
def process_cv_folder(folder_path: str) -> list:
    """Run the extraction chain on every PDF in ``folder_path``.

    Files are processed in sorted name order so repeated runs give the same
    result order (``os.listdir`` order is arbitrary). The ``.pdf`` extension
    is matched case-insensitively. Processing is best-effort: a file that
    fails at any step is reported on stdout with its name and skipped.

    Args:
        folder_path: Directory containing the CV files.

    Returns:
        A list of ``{"file_name": ..., "extracted_data": ...}`` dicts, one per
        successfully processed PDF, where ``extracted_data`` is whatever
        ``chain.invoke`` returned for that file.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    # Filter before wrapping in tqdm so the progress bar counts PDFs only.
    pdf_names = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
    )

    results = []
    for file_name in tqdm(pdf_names):
        try:
            file_path = os.path.join(folder_path, file_name)

            # Extract and clean the CV text.
            extracted_text = extract_text_from_pdf(file_path)
            cleaned_text = clean_text(extracted_text)

            # Run the chain to pull the structured fields out of the CV.
            result = chain.invoke({"cv_text": cleaned_text})

            # Keep the result together with the file it came from.
            results.append({"file_name": file_name, "extracted_data": result})
        except Exception as e:
            # Deliberately best-effort: one bad CV must not abort the batch,
            # but the message has to say which file failed.
            print(f"Error processing {file_name}: {e}")
    return results

# Path of the folder that contains the CVs (relative to the working directory).
folder_path = "dataset/cvs"

# Process every CV in the folder; this is the step that calls the LLM chain.
cv_results = process_cv_folder(folder_path)

# Show the results as readable, indented JSON, one block per file.
for cv in cv_results:
    print(f"File Name = {cv['file_name']}:")
    print("Info:")
    print(json.dumps(cv['extracted_data'], indent=4))

  0%|          | 0/9 [00:00<?, ?it/s]

File Name = CV8.pdf:
Info:
{
    "full_name": "dr.santosh kakade",
    "contact": "drsantoshkakade@gmail.com",
    "years_experience": 20,
    "ai_education": "No"
}
File Name = CV4.pdf:
Info:
{
    "full_name": "dyah hediyati s.kom",
    "contact": "dyahhediyati@gmail.com",
    "years_experience": 4,
    "ai_education": "No"
}
File Name = CV7.pdf:
Info:
{
    "full_name": "ringgi cahyo dwiputra",
    "contact": "ringgicahyo@gmail.com",
    "years_experience": 3,
    "ai_education": "No"
}
File Name = CV9.pdf:
Info:
{
    "full_name": "loren shevitz",
    "contact": "loren@shevitz.org",
    "years_experience": 21,
    "ai_education": "No"
}
File Name = CV2.pdf:
Info:
{
    "full_name": "powell finwood",
    "contact": "hello@reallygreatsite.com",
    "years_experience": 4,
    "ai_education": "No"
}
File Name = CV5.pdf:
Info:
{
    "full_name": "immanuel abraham mahardhika yudantoro tobing",
    "contact": "dhikayudano@gmail.com",
    "years_experience": 6,
    "ai_education": "No"
}
F