In [None]:
# Import necessary libraries and models
!pip install langchain-community
!pip install faiss-gpu
!pip install transformers
!pip install transformers_stream_generator
!pip install sentence-transformers
!pip install tiktoken

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.6 (from langchain-community)
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain-community)
  Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Step 1: Read the source text and cut it into overlapping chunks for retrieval
loader = TextLoader('/content/gullivers_travels.txt')
documents = loader.load()

# Chunking parameters:
#   - chunk_size:    each chunk is 500 characters, small enough to retrieve precisely
#   - chunk_overlap: neighbouring chunks share 50 characters so context carries over
#   - separators:    split on blank lines first, then newlines, then single spaces
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_overlap=50,
    chunk_size=500,
)
docs = text_splitter.split_documents(documents)

In [None]:
# Step 2: Build the embedding model and index the chunks for similarity search
# Embeddings come from the sentence-transformers "all-mpnet-base-v2" checkpoint.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Embed every chunk in `docs` and keep the vectors in a FAISS store so the
# chunks closest to a query can be looked up later.
db = FAISS.from_documents(
    docs,
    embedding_model,
)


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Step 3: Set up the seq2seq model that writes the answers
# Checkpoint: FLAN-T5 (large), loaded through the transformers Auto* classes.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",  # automatic device placement instead of a hard-coded device
)

# Bundle model + tokenizer into a text2text-generation pipeline so a single
# call takes a prompt string and returns generated text.
question_answerer = pipeline(
    task="text2text-generation",
    tokenizer=tokenizer,
    model=model,
)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Step 4: Define the retrieval and answer generation function
def retrieve_and_answer(query, k=3, max_length=100):
    """Retrieve context for ``query``, generate an answer, print it and return it.

    Relies on two notebook-level objects created in earlier cells: ``db``
    (the vector store queried with ``similarity_search``) and
    ``question_answerer`` (the text2text-generation pipeline).

    Args:
        query: The question to answer, as a string.
        k: Number of most-similar chunks to retrieve and place in the prompt.
            Defaults to 3.
        max_length: ``max_length`` value forwarded to the generation pipeline
            to cap the length of the generated answer. Defaults to 100.

    Returns:
        The generated answer with surrounding whitespace stripped. The same
        question/answer pair is also printed, so existing callers that ignore
        the return value see the same output as before.
    """
    # Retrieve the k most relevant chunks based on similarity to the query
    contexts = db.similarity_search(query, k=k)
    combined_context = " ".join(context.page_content for context in contexts)

    # The retrieved context goes first, followed by the question, so the
    # model's continuation after "Answer:" is the answer itself.
    prompt = f"Answer concisely: {combined_context}\nQuestion: {query}\nAnswer:"

    # Generate the answer with a bounded output length.
    # NOTE(review): `early_stopping` is documented as a beam-search option; with
    # the pipeline's default decoding it is presumably a no-op — confirm before
    # removing it.
    response = question_answerer(prompt, max_length=max_length, early_stopping=True)
    answer = response[0]['generated_text'].strip()
    print(f"Question: {query}\nAnswer: {answer}\n")
    return answer


In [None]:
# Step 5: Define the questions to ask about the text.
# This cell only builds the list; the loop in the following cell passes each
# entry to retrieve_and_answer.
questions = [
    "What is the name of the land where Gulliver first lands and meets tiny people?",
    "How does Gulliver end up in Brobdingnag, the land of giants?",
    "Who are the Houyhnhnms in Gulliver’s Travels?",
    "What punishment does Gulliver face in Lilliput for refusing to destroy the enemy fleet?",
    "What are some of the laws that Gulliver finds strange in Lilliput?"
]

In [None]:
# Run the pipeline: for each question in order, retrieve the matching chunks
# and print the generated answer (retrieve_and_answer does the printing).
for question in questions:
    retrieve_and_answer(question)

Question: What is the name of the land where Gulliver first lands and meets tiny people?
Answer: Lilliput

Question: How does Gulliver end up in Brobdingnag, the land of giants?
Answer: He is left on shore, is seized by one of the natives, and carried to a farmer's house

Question: Who are the Houyhnhnms in Gulliver’s Travels?
Answer: degenerate and brutal nature

Question: What punishment does Gulliver face in Lilliput for refusing to destroy the enemy fleet?
Answer: death

Question: What are some of the laws that Gulliver finds strange in Lilliput?
Answer: reward and punishment

