# Basic RAG over an existing vector database

In [232]:
!pip install langchain langchain_ollama
!pip install chromadb sentence-transformers langchain_huggingface langchain_chroma

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# Librerias

In [233]:
import os
import re
import requests
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Obtener los documentos de una pagina web

In [234]:
import requests

url = 'https://en.wikipedia.org/wiki/Harmonica'
r = requests.get(url)

contenido = r.content
print(contenido)
texto = r.text
texto_dividido = texto.splitlines()
texto_dividido



b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>Harmonica - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-

['<!DOCTYPE html>',
 '<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">',
 '<head>',
 '<meta charset="UTF-8">',
 '<title>Harmonica - Wikipedia</title>',
 '<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clien

# Seleccionamos un modelo para las embeddings

In [235]:

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


# Añadimos los datos

Creamos vector storage

In [236]:


if not os.path.exists("./Webs"):  # Comprobar se existe o directorio
    vector_store = Chroma.from_texts(
        texts=texto_dividido,  # Lista de textos limpos
        collection_name="datos_webs",  # Nome da colección
        embedding=embeddings,  # Embeddings configurados
        persist_directory="./Webs",  # Directorio para persistir os datos
    )
else:
    vector_store = Chroma(
        collection_name="datos_webs",
        embedding_function=embeddings,
        persist_directory="./Webs",
    )


Create retriever

In [237]:
# Paso 5: Configurar o retriever
retriever = vector_store.as_retriever(
    search_type="similarity",  # Tipo de búsqueda
    search_kwargs={"k": 3}  # Número de resultados relevantes a devolver
)

In [238]:
# Realizar unha consulta
query = "Models of harmonicas"
results = retriever.get_relevant_documents(query)  # Invoca os documentos relevantes

# Mostrar os resultados
for i, result in enumerate(results):
    print(f"Resultado {i+1}:")
    print(result.page_content)  # Usar 'page_content' en lugar de 'get'
    print()


Resultado 1:
<p>The ChengGong harmonica<sup id="cite_ref-2" class="reference"><a href="#cite_note-2"><span class="cite-bracket">&#91;</span>2<span class="cite-bracket">&#93;</span></a></sup> has a main body, and a sliding mouthpiece. The body is a 24-hole diatonic harmonica that ranges from B<sub>2</sub> to D<sub>6</sub> (covering 3 octaves). Its 11-hole mouthpiece can slide along the front of the harmonica, which gives numerous chord choices and voicings (seven triads, three 6th chords, seven 7th chords, and seven 9th chords, for a total of 24 chords). As well, it is capable of playing single-note melodies and double stops over a range of three diatonic octaves. Unlike conventional harmonicas, blowing and drawing produce the same notes because its tuning is closer to the note layout of a typical East Asian tremolo harmonica or the Polyphonias.

Resultado 2:
<title>Harmonica - Wikipedia</title>

Resultado 3:
<p>There are eight kinds of orchestral melody harmonica; the most common are t

RAG Chain

In [239]:
# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Local LLM
ollama_llm = "llama3.2"
model_local = ChatOllama(model=ollama_llm)

# Chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model_local
    | StrOutputParser()
)

In [246]:
# Question
chain.invoke("Types of harmonicas")


'There are at least 8 types of orchestral melody harmonicas mentioned in the context. However, only two specific types are described:\n\n1. The horn harmonicas found in East Asia, which consist of a single large comb with blow-only reed-plates on the top and bottom.\n2. A version that mimics the layout of a piano or mallet instrument, with natural notes in one plate and sharps and flats in another.\n\nAdditionally, there is also a mention of the ChengGong harmonica, but it seems to be more of a specific model rather than a type of harmonica overall.'