# Ingestando embeddings a Pinecone

In [20]:
%%capture
!pip install pinecone
!pip install tiktoken

In [None]:
# import pinecone
from pinecone import Pinecone, ServerlessSpec
import os, getpass
import tiktoken

In [None]:
# Read the Pinecone API key interactively so it is never echoed or stored in the notebook.
api_key = getpass.getpass(prompt="Ingresa tu API Key de Pinecone : ")

## Creando un índice en Pinecone

In [None]:
# Target index settings. 1536 presumably matches the output size of the
# embedding model used later in the notebook — confirm before changing either.
index_name = "knowledge-base-eliminatorias"
dimension = 1536

# Pinecone client authenticated with the key read earlier in the notebook.
pc = Pinecone(api_key=api_key)

# Names of the indexes that already exist, so re-running this cell does not
# try to create the same index twice.
existing_index_names = {idx.name for idx in pc.list_indexes()}

if index_name in existing_index_names:
    print(f"ℹ️ El índice '{index_name}' ya existe.")
else:
    # The original author marks AWS / us-east-1 as the only permitted location
    # (presumably a plan restriction — confirm).
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=serverless_spec,
    )
    print(f"✅ Índice '{index_name}' creado correctamente.")


## Generando fragmentos de sitios web

![Imagen](https://imgmedia.larepublica.pe/640x371/larepublica/original/2023/10/17/652f24917388f967a923a805.webp)

In [None]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter

# Web pages used as the knowledge source.
source_urls = [
    "https://www.marca.com/co/2023/10/17/652e070f22601d73648b4585.html",
    "https://hiraoka.com.pe/blog/post/eliminatorias-sudamericanas-mundial-2026-calendario-partidos-y-fechas",
]

# Load the pages as documents.
loader = WebBaseLoader(source_urls)
data = loader.load()

# Split into fragments targeting 500 tokens each (measured with tiktoken),
# with a 20-token overlap between consecutive fragments.
# NOTE(review): CharacterTextSplitter only cuts at its separator, so a
# fragment may come out larger than chunk_size — confirm this is acceptable.
splitter_options = {"chunk_size": 500, "chunk_overlap": 20}
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

docs = text_splitter.split_documents(data)

In [None]:
# Show how many fragments were produced, followed by the fragments themselves.
len(docs), docs

In [None]:
# Read the OpenAI API key interactively and expose it through the
# OPENAI_API_KEY environment variable for the rest of the session.
os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt="Ingrese la API Key de OpenAI : ")

## Cargando datos a Pinecone

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from pinecone import Pinecone as PineconeClient
# uuid4 is no longer used in this cell; it stays imported in case another
# cell of the notebook relies on it.
from uuid import NAMESPACE_URL, uuid4, uuid5

# Maximum number of vectors sent per upsert request; keeps each request small
# regardless of how many fragments were produced.
UPSERT_BATCH_SIZE = 100

# Connect to the index. Reuse the Pinecone key already entered in this session
# and only prompt when this cell is run without the earlier key cell.
api_key = globals().get("api_key") or getpass.getpass("Ingresa tu API Key de Pinecone : ")
index_name = "knowledge-base-eliminatorias"
pc = PineconeClient(api_key=api_key)
index = pc.Index(index_name)

# Embedding client (reads OPENAI_API_KEY from the environment) and the
# fragment contents / metadata to upload.
embedder = OpenAIEmbeddings()
texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs]

# Embed all fragments with the batched document API instead of issuing one
# embed_query call per fragment.
embeddings = embedder.embed_documents(texts) if texts else []

# Build the records to upload.
vectors = []
for position, (text, metadata, embedding) in enumerate(zip(texts, metadatas, embeddings)):
    # Drop null values (Pinecone metadata does not accept them) and store the
    # fragment text itself so query results can return the content, not just
    # the source URL.
    record_metadata = {key: value for key, value in metadata.items() if value is not None}
    record_metadata["text"] = text

    # Deterministic id derived from the source and the fragment position, so
    # re-running this cell overwrites the same records instead of duplicating
    # every vector under a fresh random id.
    record_id = uuid5(NAMESPACE_URL, f"{metadata.get('source', '')}#{position}")

    vectors.append({
        "id": str(record_id),
        "values": embedding,
        "metadata": record_metadata,
    })

if vectors:
    # Upload in bounded batches rather than one unbounded request.
    for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
        index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])
    print("✅ Documentos insertados en Pinecone.")
else:
    # Nothing was loaded/split; skip the upsert call with an empty payload.
    print("⚠️ No hay documentos para insertar en Pinecone.")
