In [34]:
import hashlib
import requests
from bs4 import BeautifulSoup
from qdrant_client import QdrantClient
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [35]:
qdrant_client = QdrantClient(host="qdrant", port=6333)

In [36]:
collection_name = "knowledge_base"

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "size": 1536,
        "distance": "Cosine"
    }
)
# Visit http://localhost:6333/dashboard#/collections

UnexpectedResponse: Unexpected Response: 409 (Conflict)
Raw response content:
b'{"status":{"error":"Wrong input: Collection `knowledge_base` already exists!"},"time":0.001504709}'

In [37]:
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002"
)

In [38]:
def fetch_blog_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    article = soup.find('article')
    if article:
        return article.get_text(strip=True)
    else:
        return None

In [39]:
def generate_id_from_url(url):
    return int(hashlib.md5(url.encode('utf-8')).hexdigest(), 16) % (10**8)

In [40]:
url = 'https://dataloop.dev/'
exclude = {
    'https://dataloop.dev/author/admin/',
    'https://dataloop.dev/category/aws/',
    'https://dataloop.dev/category/azure/azure-analysis-services/',
    'https://dataloop.dev/category/azure/azure-data-factory/',
    'https://dataloop.dev/category/azure/azure-databricks/',
    'https://dataloop.dev/category/azure/azure-sql-server/azure-sql-database/',
    'https://dataloop.dev/category/power-bi-service/',
    'https://dataloop.dev/category/python/',
    'https://dataloop.dev/contactame/',
    url
}
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')

urls = list(
    set(
        link.get('href')
        for link in soup.find_all('a')
        if url in link.get('href') and link.get('href') not in exclude
    )
)

In [41]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=128,
    separators=["\n\n", ".", "?", "!", " "],
)

In [42]:
documents = []
for url in urls:
    content = fetch_blog_content(url)
    if content:
        chunks = text_splitter.split_text(content)
        
        for idx, chunk in enumerate(chunks):
            documents.append({
                "content": chunk,
                "metadata": {"url": url, "chunk_id": idx}
            })
    else:
        print(f"Error al extraer contenido de {url}")

print(f"Procesados {len(documents)} fragmentos.")

Procesados 136 fragmentos.


In [43]:
for doc in documents:
    doc_id = generate_id_from_url(doc["metadata"]["url"] + str(doc["metadata"]["chunk_id"]))
    
    vector = embeddings.embed_query(doc["content"])
    
    qdrant_client.upsert(
        collection_name=collection_name,
        points=[{
            "id": doc_id,
            "vector": vector,
            "payload": doc["metadata"],
        }]
    )

In [32]:
query = "¿Cómo actualizar el dataset de un power bi report?"
query_vector = embeddings.embed_query(query)

results = qdrant_client.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=5
)


In [33]:
for result in results:
    print(f"Artículo: {result.payload['url']}, Score: {result.score}")

Artículo: https://dataloop.dev/actualizacion-de-power-bi-dataset-desde-power-automate/, Score: 0.8919406
Artículo: https://dataloop.dev/actualizacion-de-power-bi-dataset-desde-power-automate/, Score: 0.87158126
Artículo: https://dataloop.dev/actualizacion-de-power-bi-dataset-desde-power-automate/, Score: 0.8638105
Artículo: https://dataloop.dev/ejecucion-de-adf-pipelines-desde-azure-logic-app/, Score: 0.8623122
Artículo: https://dataloop.dev/caracteristicas-y-tipos-de-licenciamiento-de-power-bi/, Score: 0.8338643
