# Basic RAG over an existing vector database

In [5]:
%pip install langchain langchain_ollama
%pip install chromadb sentence-transformers langchain_huggingface langchain_chroma

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Librerias

In [23]:
import os
import re
import requests
from langchain_huggingface import HuggingFaceEmbeddings
from bs4 import BeautifulSoup
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Obtener los documentos de una pagina web

In [32]:
import requests

url = 'https://en.wikipedia.org/wiki/National_Basketball_Association'
r = requests.get(url)

contenido = r.content
print(contenido)
texto = r.text
texto_dividido = texto.splitlines()
#texto_dividido

# Combinar el contenido en un solo string
html_string = "".join(texto_dividido)

# Usar BeautifulSoup para procesar el HTML
soup = BeautifulSoup(html_string, "html.parser")

# Extraer solo el texto visible (sin etiquetas)
cleaned_text = soup.get_text()

# Imprimir el texto limpio
cleaned_text = cleaned_text.splitlines()

print(cleaned_text)




b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>National Basketball Association - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-widt

# Seleccionamos un modelo para las embeddings

In [33]:

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


# Añadimos los datos

Creamos vector storage

In [40]:


if not os.path.exists("./Nba"):  # Comprobar se existe o directorio
    vector_store = Chroma.from_texts(
        texts=cleaned_text,  # Lista de textos limpos
        collection_name="datos_webs",  # Nome da colección
        embedding=embeddings,  # Embeddings configurados
        persist_directory="./Nba",  # Directorio para persistir os datos
    )
else:
    vector_store = Chroma(
        collection_name="datos_webs",
        embedding_function=embeddings,
        persist_directory="./Nba",
    )


Create retriever

In [41]:
# Paso 5: Configurar o retriever
retriever = vector_store.as_retriever(
    search_type="similarity",  # Tipo de búsqueda
    search_kwargs={"k": 3}  # Número de resultados relevantes a devolver
)

In [42]:
# Realizar unha consulta
query = "NBA Founded"
results = retriever.get_relevant_documents(query)  # Invoca os documentos relevantes

# Mostrar os resultados
for i, result in enumerate(results):
    print(f"Resultado {i+1}:")
    print(result.page_content)  # Usar 'page_content' en lugar de 'get'
    print()


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


Resultado 1:
National Basketball Association - WikipediaJump to content      Main menu      Main menu move to sidebar hide  		Navigation	   Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us    		Contribute	   HelpLearn to editCommunity portalRecent changesUpload file               Search            Search                   Appearance               DonateCreate accountLog in      Personal tools      Donate Create account Log in   		Pages for logged out editors learn more    ContributionsTalk                        Contents move to sidebar hide    (Top)      1 History     Toggle History subsection      1.1 Creation and BAA–NBL merger (1946–1956)         1.2 Celtics' dominance, league expansion and competition (1956–1979)         1.3 Surging popularity and Bulls' dynasty (1979–1998)         1.4 Lakers' and Spurs' dynasties (1998–2014)         1.5 Warriors' dynasty and recent years (2014–present)         1.6 International influence         1.7 Other developments       

RAG Chain

In [43]:
# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Local LLM
ollama_llm = "tinyllama"
model_local = ChatOllama(model=ollama_llm)

# Chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model_local
    | StrOutputParser()
)

In [44]:
# Question
chain.invoke("When was founded the NBA?")


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


'The NBA was established on December 1, 1940.'