In [None]:
# Install OpenAI and Langchain to instantiate and manage LLMs, also TikToken and Chroma DB

!pip install langchain
!pip install openai
!pip install chromadb
!pip install tiktoken

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the .txt file on the web server
url = "https://fegalaz.usc.es/~gamallo/aulas/lingcomputacional/corpus/quijote-es.txt"

# Send a GET request to fetch the content of the .txt file
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the text
data = soup.get_text()

# Print the cleaned text
print(data)

In [8]:
print(type(data))

<class 'str'>


In [7]:
# Install all the classes we need:

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores import Chroma


In [4]:
# Activate the OpenAI API

import os
os.environ["OPENAI_API_KEY"] = ""

Usually we will have several text files, but in this exercise we are going to split our document into several pieces or chunks and treat each of them as a separate document.

The model will have to figure out which part contains the answer to our question. We break this text into multiple parts by assigning each part a maximum length using the commands below.

In [23]:
# This would be an exact way to cut the text in exact pieces of certain size
# BUT I would like to have some kind of semantic criteria to get the chunks, that´s why we need text_splitter

chunk_size = 250
overlap_size = 50
chunks = [data[i:i+chunk_size] for i in range(0, len(data) - chunk_size + 1, chunk_size - overlap_size)]

len(chunks)


10313

In [18]:
# !!! this needs to be reviewed
# break the single text file into multiple parts (chunks) and treat each part as a different document.
# Notice how langchain will try to get as close as possible to the chunk size, but not always possible.

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_text(data)
len(texts)

Now we create an object that we need to save the embeddings of the various parts of the created text. BUT we want to save this texts ina permanent way so we don´t need to reconstruct them for every query with the obvious waste of resources.

In [24]:
embeddings = OpenAIEmbeddings()

In [25]:
persist_directory = 'db'
docsearch = Chroma.from_texts(
    chunks,
    embeddings,
    persist_directory = persist_directory,
    metadatas=[{"source": f"{i}-pl"} for i in range(len(chunks))]
    )

In [26]:
from langchain.chains import RetrievalQAWithSourcesChain

In [27]:
# Now we want to turn docsearch into a retrieval because that will be its purpose.
from langchain import OpenAI

#convert the vectorstore to a retriever
retriever=docsearch.as_retriever()

In [28]:
# We can also see the retriever what distance metric it is using; in this case default is similarity.
retriever.search_type

'similarity'

In [46]:
# Finally, we can ask the retriever to take the document that most answers one of our queries.
# The retriever could also take more than one document if necessary.

docs = retriever.get_relevant_documents("Escribe completa la cita de 'la razon de la sinrazón...' ")


In [47]:
# And these are the docs we are going to use to build the metaprompt to query OpenAI

len(docs)
docs

[Document(page_content='niendo habilidad, suficiencia y entendimiento para tratar del\nuniverso todo, pide no se desprecie su trabajo, y se le den alabanzas, no\npor lo que escribe, sino por lo que ha dejado de escribir.\nY luego prosigue la historia diciendo que, en acabando ', metadata={'source': '8133-pl'}),
 Document(page_content='e moche lo primero que le viene\nal magín.\n-Una de las tachas que ponen a la tal historia -dijo el bachiller- es que\nsu autor puso en ella una novela intitulada El curioso impertinente; no por\nmala ni por mal razonada, sino por no ser de aquel lugar, ', metadata={'source': '5255-pl'}),
 Document(page_content='rece apócrifa, yo no tengo la culpa; y así, sin afirmarla\npor falsa o verdadera, la escribo. Tú, letor, pues eres prudente, juzga lo\nque te pareciere, que yo no debo ni puedo más; puesto que se tiene por\ncierto que al tiempo de su fin y muerte dicen ', metadata={'source': '6765-pl'}),
 Document(page_content='altar un punto a la verdad del caso.

Now what we can do is to create an agent. An agent is able to perform a series of steps to solve the user’s task on its own. Our agent will have to go and look through the documents available to it where the answer to the question asked is and return that document.

In [40]:
#create the chain to answer questions
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm = OpenAI(temperature=0.7),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents = True
    )

If we want, we can also create a function to post-process the agent’s output so that it is more readable.

In [34]:
def process_result(result):
  print(result['answer'])
  print("\n\n Sources : ",result['sources'] )
  print(result['sources'])

Now everything is finally ready, we can use our agent and go and answer our queries!

In [55]:
question = "Quien era sanson carrasco? en qué capitulos de la novela aparece ?"
result = chain({"question": question})
process_result(result)

 Sansón Carrasco era un bachiller de 24 años, de color macilenta, de nariz chata y de boca grande. Aparece en los capítulos 5215, 5459 y 6066 de la novela.



 Sources :  5215-pl, 5459-pl, 6066-pl, 5216-pl
5215-pl, 5459-pl, 6066-pl, 5216-pl


In [53]:
chunks[5285]

'sada.\nCapítulo IV. Donde Sancho Panza satisface al bachiller Sansón Carrasco de\nsus dudas y preguntas, con otros sucesos dignos de saberse y de contarse\nVolvió Sancho a casa de don Quijote, y, volviendo al pasado razonamiento,\ndijo:\n-A lo que el seño'