In [1]:
pip install PyPDF2 python-docx pandas


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document

# Folder that contains the CV files to process.
# Built with os.path.join so the separator is correct on every operating system and
# the literal no longer contains the invalid escape sequence "\w".
folder_path = os.path.join("curriculum_vitae_data-master", "word")

# Extract the text of a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Page texts are joined with a newline so the last word of one page and the
    first word of the next are not fused into a single token. Pages whose
    ``extract_text()`` result is empty or ``None`` are skipped instead of
    aborting the whole document.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or a string starting with "Error al leer el PDF: "
        followed by the exception message when the file cannot be read.
    """
    try:
        reader = PdfReader(pdf_path)
        page_texts = (page.extract_text() for page in reader.pages)
        # Drop falsy results (None / "") so they neither raise nor add blank lines.
        return "\n".join(chunk for chunk in page_texts if chunk)
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop the batch,
        # so the failure is reported in-band as the returned text.
        return f"Error al leer el PDF: {e}"

# Extract the text of a Word (.docx) file
def extract_text_from_word(doc_path):
    """Return the paragraph texts of the Word document at ``doc_path``.

    The text of each entry in the document's ``paragraphs`` collection is
    joined with a newline, in document order.

    Args:
        doc_path: Path of the .docx file to read.

    Returns:
        The joined paragraph text, or a string starting with
        "Error al leer el Word: " followed by the exception message when the
        document cannot be opened or read.
    """
    try:
        paragraphs = Document(doc_path).paragraphs
        lines = [item.text for item in paragraphs]
    except Exception as error:
        return f"Error al leer el Word: {error}"
    return "\n".join(lines)

# Normalize a text for the dataset
def normalize_text(text):
    """Lower-case ``text``, strip symbols and collapse whitespace.

    Steps, in order:
      1. convert to lower case;
      2. delete every character that is not a-z, an accented Spanish vowel,
         ü, ñ, a digit or whitespace (characters are removed, not replaced);
      3. squeeze each run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    only_allowed = re.sub(r"[^a-záéíóúüñ0-9\s]", "", lowered)
    single_spaced = re.sub(r"\s+", " ", only_allowed)
    return single_spaced.strip()

# Process the files in folder_path and collect one record per supported file.
data = []
# sorted() makes the row order reproducible: os.listdir() returns entries in
# arbitrary order.
for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    # Compare the extension case-insensitively so files such as "CV.PDF" or
    # "cv.DOCX" are not silently skipped.
    lowered_name = file_name.lower()
    if lowered_name.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
        file_type = "PDF"
    elif lowered_name.endswith(".docx"):
        text = extract_text_from_word(file_path)
        file_type = "Word"
    else:
        continue  # Ignore unsupported files

    # Normalize the extracted text
    normalized_text = normalize_text(text)

    # Append the normalized record to the dataset
    data.append({"File Name": file_name, "File Type": file_type, "Content": normalized_text})

# Store the records in a DataFrame and export them to CSV
df = pd.DataFrame(data)
df.to_csv("dataset_texto_normalizado.csv", index=False)

print("Archivo 'dataset_texto_normalizado.csv' creado con éxito.")

Archivo 'dataset_texto_normalizado.csv' creado con éxito.


In [10]:
# Show the DataFrame built from the normalized records (the cell's last expression is rendered as output).
df

Unnamed: 0,File Name,File Type,Content
0,1.docx,Word,personal information curriculum vitae full nam...
1,10.docx,Word,resume abhishek magotra hno632 sector4 channi ...
2,100.docx,Word,curriculum vitae ranjeet singh address g33 2nd...
3,1000.docx,Word,nihmathakangmailcom to satisfy my technologydr...
4,1001.docx,Word,syed abuthaheer n 2 years experience in hvac d...
...,...,...,...
2291,995.docx,Word,curriculum vitae ssam george email sameee85yah...
2292,996.docx,Word,curriculum vitae profile a qualified professio...
2293,997.docx,Word,curriculum vitae work experience 7 stores bcc ...
2294,998.docx,Word,curriculum vitae name chariharasudhan year of ...


In [2]:
pip install transformers torch scikit-learn


^C
Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import pipeline

# Load the zero-shot classification pipeline from the facebook/bart-large-mnli checkpoint.
# `classifier` is the callable used by detect_skills_transformers and
# detect_english_level_transformers.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels for skill detection (passed to the zero-shot classifier as-is)
skills_list = [
    "Python",
    "Java",
    "Gestión de Proyectos",
    "SQL",
    "Machine Learning",
    "Excel",
    "Análisis de Datos",
    "Desarrollo Web",
    "Javascript",
]

# Detect which of the skills in skills_list a text mentions
def detect_skills_transformers(text, threshold=0.5):
    """Return the skills from ``skills_list`` whose score exceeds ``threshold``.

    The text is scored against every entry of ``skills_list`` with the
    module-level ``classifier`` (called with ``multi_label=True``); the labels
    whose score is strictly greater than ``threshold`` are returned in the
    order the classifier reports them.

    Args:
        text: The text to analyse.
        threshold: Minimum score (exclusive) a label needs to be reported.
            Defaults to 0.5.

    Returns:
        A list of skill labels. Empty when nothing passes the threshold or
        when ``text`` is not a string or contains only whitespace, in which
        case the classifier is not called.
    """
    # Nothing meaningful can be inferred from blank or non-text input, so skip
    # the model call instead of reporting arbitrary labels.
    if not isinstance(text, str) or not text.strip():
        return []

    result = classifier(text, skills_list, multi_label=True)
    return [
        skill
        for skill, score in zip(result["labels"], result["scores"])
        if score > threshold
    ]



ModuleNotFoundError: No module named 'transformers'

In [None]:
# Candidate labels for the English-level classification
english_levels = [
    "básico",
    "intermedio",
    "avanzado",
    "bilingüe",
]

# Classify the English level described by a text
def detect_english_level_transformers(text):
    """Return the first label the classifier reports for ``text``.

    The module-level ``classifier`` is called with ``english_levels`` as the
    candidate labels and ``multi_label=False``; the first entry of the
    returned "labels" list is the result.

    Args:
        text: The text to analyse.

    Returns:
        One of the entries of ``english_levels``.
    """
    ranked_labels = classifier(text, english_levels, multi_label=False)["labels"]
    return ranked_labels[0]


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the NER model: the tokenizer and the token-classification weights are both
# fetched by checkpoint name and then wrapped in a "ner" pipeline.
# NOTE(review): the checkpoint name indicates a *cased* model, while the "Content"
# text produced by normalize_text is lower-cased — confirm this mismatch is
# acceptable before feeding that text to this pipeline.
ner_model = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(ner_model)
model = AutoModelForTokenClassification.from_pretrained(ner_model)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Patterns for explicit age statements such as "age 27", "Edad: 31",
# "30 years old" or "30 años". Ages are limited to one or two digits so that
# years and phone numbers are not mistaken for an age.
_AGE_PATTERNS = (
    re.compile(r"\b(?:age|edad)\b\s*[:\-]?\s*(\d{1,2})\b", re.IGNORECASE),
    re.compile(r"\b(\d{1,2})\s*(?:years?\s*old|yrs?\s*old|años)\b", re.IGNORECASE),
)

# Pattern for explicit gender statements such as "gender male" or "Sexo: Femenino".
_GENDER_PATTERN = re.compile(
    r"\b(?:gender|sex|sexo|g[ée]nero)\b\s*[:\-]?\s*"
    r"(female|male|femenino|masculino|mujer|hombre)\b",
    re.IGNORECASE,
)

# Maps every recognised gender word to the value stored in the dataset.
_GENDER_LABELS = {
    "male": "Masculino",
    "masculino": "Masculino",
    "hombre": "Masculino",
    "female": "Femenino",
    "femenino": "Femenino",
    "mujer": "Femenino",
}


# Extract age and gender from a CV text
def extract_entities(text):
    """Return the ``(age, gender)`` explicitly stated in ``text``.

    The NER checkpoint configured in this notebook is a CoNLL-03 model, whose
    tag set covers persons, organisations, locations and miscellaneous
    entities; it has no "AGE", "MALE" or "FEMALE" tags, so matching on those
    tags could never find anything. Age and gender are therefore read from
    explicit statements in the text ("age 27", "30 years old",
    "gender male", "sexo femenino", ...). Matching is case-insensitive and
    tolerates an optional ":" or "-" after the field name, so both raw and
    normalized text work.

    Args:
        text: The CV text to inspect.

    Returns:
        A tuple ``(age, gender)``. ``age`` is the matched number as a string;
        ``gender`` is "Masculino" or "Femenino". Either value is
        "No especificado" when the text does not state it, or when ``text``
        is empty or not a string.
    """
    age = "No especificado"
    gender = "No especificado"

    if not isinstance(text, str) or not text:
        return age, gender

    # First pattern that matches wins; "age NN" is tried before "NN years old".
    for pattern in _AGE_PATTERNS:
        age_match = pattern.search(text)
        if age_match:
            age = age_match.group(1)
            break

    gender_match = _GENDER_PATTERN.search(text)
    if gender_match:
        gender = _GENDER_LABELS[gender_match.group(1).lower()]

    return age, gender


In [None]:
# Enrich every record of `data` with skills, English level, age and gender.
data_processed = []

for record in data:
    content = record["Content"]

    # Run the three detectors on the normalized content:
    # skills first, then English level, then age/gender.
    skills = detect_skills_transformers(content)
    english_level = detect_english_level_transformers(content)
    age, gender = extract_entities(content)

    # Key order defines the column order of the exported CSV.
    row = {
        "File Name": record["File Name"],
        "File Type": record["File Type"],
        "Skills": ", ".join(skills),
        "English Level": english_level,
        "Age": age,
        "Gender": gender,
        "Content": content,
    }
    data_processed.append(row)

# Build the DataFrame once from the collected rows and export it.
df_processed = pd.DataFrame(data_processed)
df_processed.to_csv("dataset_procesado_transformers.csv", index=False)

print("Archivo 'dataset_procesado_transformers.csv' creado con éxito.")
