# Notebook para guardar el CV en forma de embeddings en la base de datos de vectores

## Importación de paquetes

In [1]:
import hashlib
import os

import torch

In [2]:
# Pick the compute device for the embedding model: use the GPU when PyTorch can
# see a CUDA device, otherwise fall back to the CPU so the notebook still runs
# on machines without a GPU (a hard-coded 'cuda' makes model loading fail there).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Extracción de documentos desde el CV en formato PDF

In [3]:
from langchain.document_loaders import PyPDFDirectoryLoader


def read_documents(directory):
    """Load the PDF files found in ``directory`` with ``PyPDFDirectoryLoader``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns.
    """
    return PyPDFDirectoryLoader(directory).load()


# Load the CV PDF(s) from the local "./documentos/" folder.
documents = read_documents("./documentos/")
# Bare expression: the notebook renders the loaded documents as the cell output.
documents

[Document(metadata={'source': 'documentos\\CV Lucas Barrera.pdf', 'page': 0}, page_content='Lucas Matías Barrera\nPersonal Information\nLocation: Turín, Italy\ne-mail: lucasmatiasbarrera@gmail.com\nLinkedIn: https://www.linkedin.com/in/lucas-matias-barrera/\nAbout\nI am an Electronics Engineer with over 14 years of experience in the industry, including\nalmost 10 years in the aerospace sector. I’m a very analytical professional, capable of analyz-\ning highly complex scenarios with great attention to detail and a strong market orientation,\nan aptitude I learned during my last 5 years of work experience.\nOver the past four years, I have taken courses in leadership and communication to im-\nprove my soft skills, which has allowed me to communicate and lead more effectively and\nassertively.\nI am a very curious person, which is why I began studying philosophy. I am passionate\nabout contemplating the most complex and profound problems that humanity has faced\nand continues to face. I a

## Chunking de los documentos cargados

- Si usamos Llama, quizás convenga usar https://huggingface.co/jinaai/jina-embeddings-v3
- Otra opción con Pinecone: https://docs.pinecone.io/guides/inference/generate-embeddings

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_data(documents, chunk_size=800, chunk_overlap=50):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split (passed to ``split_documents``).
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The result of ``split_documents(documents)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)


# Split the loaded documents, overriding the default chunk_size (800) with 400
# and keeping a 50-character overlap between consecutive chunks.
documents_chunks = chunk_data(documents=documents, chunk_size=400, chunk_overlap=50)
# Bare expression so the notebook displays the resulting chunks.
documents_chunks

[Document(metadata={'source': 'documentos\\CV Lucas Barrera.pdf', 'page': 0}, page_content='Lucas Matías Barrera\nPersonal Information\nLocation: Turín, Italy\ne-mail: lucasmatiasbarrera@gmail.com\nLinkedIn: https://www.linkedin.com/in/lucas-matias-barrera/\nAbout\nI am an Electronics Engineer with over 14 years of experience in the industry, including\nalmost 10 years in the aerospace sector. I’m a very analytical professional, capable of analyz-'),
 Document(metadata={'source': 'documentos\\CV Lucas Barrera.pdf', 'page': 0}, page_content='ing highly complex scenarios with great attention to detail and a strong market orientation,\nan aptitude I learned during my last 5 years of work experience.\nOver the past four years, I have taken courses in leadership and communication to im-\nprove my soft skills, which has allowed me to communicate and lead more effectively and\nassertively.'),
 Document(metadata={'source': 'documentos\\CV Lucas Barrera.pdf', 'page': 0}, page_content='assertive

## Carga de documentos en la base de datos

In [9]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

# Credentials and deployment settings come from environment variables.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

# `or` (rather than a getenv default) so that an unset *or empty* variable
# falls back to the hard-coded value.
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

In [15]:
import time
import configuracion

# Names of the indexes currently reported by the Pinecone client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

if configuracion.PINECONE_INDEX_NAME in existing_indexes:
    print("El índice ya existe")
else:
    print("Creando índice")
    # dimension=384 has to agree with the vector size of the embedding model
    # used for the upload (all-MiniLM-L6-v2 in this notebook).
    pc.create_index(
        configuracion.PINECONE_INDEX_NAME,
        dimension=384,
        metric='cosine',
        spec=spec,
    )

    # Poll once per second until the new index reports itself as ready.
    while True:
        if pc.describe_index(configuracion.PINECONE_INDEX_NAME).status['ready']:
            break
        print("Esperando inicialización del índice creado")
        time.sleep(1)

# Handle used by the following cells to talk to the index.
index = pc.Index(configuracion.PINECONE_INDEX_NAME)
time.sleep(1)
index.describe_index_stats()

Creando índice


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [16]:
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed every chunk; it runs on `device`.
embed_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                    model_kwargs={'device': device},
                                    encode_kwargs={'normalize_embeddings': False})

# Deterministic ids derived from each chunk's origin and text. Without explicit
# ids the vector store assigns new ids on every call, so re-running this cell
# would insert a second copy of every chunk. With stable ids, re-running on the
# same input overwrites the existing vectors instead of duplicating them.
chunk_ids = [
    hashlib.sha256(
        "|".join([
            str(chunk.metadata.get('source')),
            str(chunk.metadata.get('page')),
            str(position),
            chunk.page_content,
        ]).encode('utf-8')
    ).hexdigest()
    for position, chunk in enumerate(documents_chunks)
]

# Embed the chunks and upsert them into the configured index/namespace.
docsearch = PineconeVectorStore.from_documents(
    documents=documents_chunks,
    index_name=configuracion.PINECONE_INDEX_NAME,
    embedding=embed_model,
    namespace=configuracion.PINECONE_NAMESPACE_NAME,
    ids=chunk_ids
)

print("Datos agregados al índice", configuracion.PINECONE_INDEX_NAME)

Datos agregados al índice cvs-index


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [18]:
# Query the index statistics again, now that the upload cell has run, to
# inspect the per-namespace vector counts.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 24}},
 'total_vector_count': 24}