Combine a retriever and generator to answer factual questions using RAG. Design a semantic search engine using a vector database. Use LlamaIndex to index unstructured data and enable question answering.

In [None]:
# Step 1: Install Required Libraries
!pip install llama-index langchain openai chromadb llama-index-embeddings-huggingface llama-index-llms-huggingface bitsandbytes

# Step 2: Load Unstructured Data
from llama_index.core import SimpleDirectoryReader



In [None]:
# Point a reader at the single source PDF, then parse it into the
# `documents` list consumed by the indexing step.
pdf_reader = SimpleDirectoryReader(
    input_files=["/content/GAN_exam_answer.pdf"],
)
documents = pdf_reader.load_data()

In [None]:
# Step 3: Create Vector Index Using LlamaIndex
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model loaded through HuggingFace for local usage.
bge_embeddings = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Build the vector index from the loaded documents, using the embedding
# model configured above.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=bge_embeddings,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Step 4: Create Query Engine
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
import torch
from transformers import BitsAndBytesConfig

# Define the LLM
# Single identifier shared by the weights and the tokenizer so the two cannot drift apart.
LLM_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."
# NOTE(review): the <|USER|>/<|ASSISTANT|> tags may not match the chat template
# this TinyLlama checkpoint was tuned with — confirm against the model card.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# Define BitsAndBytesConfig for 8-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

llm = HuggingFaceLLM(
    context_window=2048,  # Corrected context_window to match model's max input size
    max_new_tokens=256,
    # Greedy decoding. `temperature` only takes effect when sampling, so passing
    # it together with do_sample=False was ignored and only produced the
    # "generation flags are not valid" warning.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=LLM_MODEL_NAME,
    model_name=LLM_MODEL_NAME,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16, "quantization_config": quantization_config},
)

# Query engine over the vector index, answering with the LLM defined above.
query_engine = index.as_query_engine(llm=llm)

In [None]:
# Step 5: Ask a Question (RAG in Action)
# Send one question through the query engine and show the answer it returns.
question = "What is machine learning?"
response = query_engine.query(question)

print(response)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Machine learning is a field of computer science that involves the development and application of algorithms and techniques for automatically learning from data without being explicitly programmed. It is a subset of artificial intelligence and is used in various fields such as healthcare, finance, and marketing.


In [None]:
# Step 6: Explicit Semantic Search Example
# Query the index directly (no LLM involved), asking for the 3 most similar
# entries, and print the text of each result.
search_query = "Explain neural networks"
retriever = index.as_retriever(similarity_top_k=3)
nodes = retriever.retrieve(search_query)

for hit in nodes:
    print(hit.text)


In [None]:
# Step 4: Create Query Engine
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
import torch
from transformers import BitsAndBytesConfig

# Define the LLM
# Single identifier shared by the weights and the tokenizer so the two cannot drift apart.
LLM_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."
# NOTE(review): the <|USER|>/<|ASSISTANT|> tags may not match the chat template
# this TinyLlama checkpoint was tuned with — confirm against the model card.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# Define BitsAndBytesConfig for 8-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

llm = HuggingFaceLLM(
    # TinyLlama-1.1B-Chat-v1.0 accepts at most 2048 positions; a 4096 window
    # would let prompts be packed beyond what the model can attend to.
    context_window=2048,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only takes effect when sampling, so passing
    # it together with do_sample=False was ignored and only produced a warning.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=LLM_MODEL_NAME,
    model_name=LLM_MODEL_NAME,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16, "quantization_config": quantization_config},
)

# Query engine over the vector index, answering with the LLM defined above.
query_engine = index.as_query_engine(llm=llm)

In [None]:
# Step 5: Ask a Question (RAG in Action)
# Run a single question through the current query engine and print the reply.
user_question = "What is machine learning?"
response = query_engine.query(user_question)

print(response)