# 🚀 Building a RAG Chatbot on AWS Certifications

In this Colab project, we will:

- Upload 3 AWS certification PDFs.
- Extract, clean, and analyze the text (EDA).
- Create embeddings and build a Chroma retriever.
- Load `microsoft/Phi-4-mini-instruct` as the answer generator.
- Build a chatbot with memory (chat history).
- Deploy the bot with a clean Gradio UI.
- Add a Reset Memory button.
- Save and reload the Chroma vector store.
- Evaluate chatbot responses with BLEU, ROUGE, and visualize metrics.

Let's begin! 🚀


In [24]:
!pip install -q transformers accelerate langchain chromadb pypdf sentence-transformers bitsandbytes


In [25]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from google.colab import files
import os


In [26]:
# Open the Colab upload widget; the returned mapping is keyed by the name each
# file was saved under, and those names are what the loader reads below.
uploaded_files = files.upload()
uploaded_pdf_paths = [*uploaded_files]
print("Uploaded PDFs:", uploaded_pdf_paths)


Saving AWSAIP_compressed.pdf to AWSAIP_compressed (1).pdf
Saving AWSCCP_compressed.pdf to AWSCCP_compressed (1).pdf
Saving AWSSAA_compressed.pdf to AWSSAA_compressed (1).pdf
Uploaded PDFs: ['AWSAIP_compressed (1).pdf', 'AWSCCP_compressed (1).pdf', 'AWSSAA_compressed (1).pdf']


In [27]:
# Read every uploaded PDF and gather whatever each loader returns into one flat list
docs = [
    page
    for pdf_path in uploaded_pdf_paths
    for page in PyPDFLoader(pdf_path).load()
]

# Cut the loaded pages into overlapping chunks (chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
documents = text_splitter.split_documents(docs)

print(f"Total chunks created: {len(documents)}")


Total chunks created: 1769


In [None]:
# Embedding model applied to each chunk
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed the chunks and keep them in a Chroma vector store
vectorstore = Chroma.from_documents(documents=documents, embedding=embedding_model)


In [None]:
model_id = "microsoft/Phi-4-mini-instruct"  # Using small version suitable for Colab

# 4-bit NF4 quantization with float16 compute, as configured below
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# NOTE(review): trust_remote_code=True executes Python shipped in the model
# repository; keep it only for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Setup model pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    # temperature / top_p only take effect when sampling is enabled; without
    # do_sample=True generation is greedy and both settings are ignored.
    do_sample=True,
    temperature=0.3,
    top_p=0.95,
    repetition_penalty=1.1
)

# Wrap for LangChain
llm_pipeline = HuggingFacePipeline(pipeline=pipe)


In [None]:
# Buffer memory that exposes past turns to the chain under the "chat_history" key
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Conversational RAG chain: top-3 retrieval from the vector store, answered by
# the wrapped Hugging Face pipeline, with the memory above attached
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm_pipeline,
    retriever=vectorstore.as_retriever(search_kwargs=dict(k=3)),
    memory=memory,
    verbose=True,
)


In [None]:
# Plain transcript of (question, answer) turns, kept for inspection only.
# `qa_chain` is built with `memory=memory`, and that memory supplies the
# `chat_history` input on every call, so no history is passed in here.
history = []

print("🚀 Chat with your documents! Type 'exit' to quit.\n")

while True:
    try:
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closing the input stream or interrupting the cell ends the chat
        # cleanly instead of surfacing a traceback.
        print("\nExiting Chat...")
        break

    # Do not spend a retrieval + generation round on a blank line
    if not query:
        continue

    if query.lower() == 'exit':
        print("Exiting Chat...")
        break

    result = qa_chain({"question": query})
    answer = result['answer']

    print(f"\nBot: {answer}\n")

    # Update history
    history.append((query, answer))


In [None]:
# Function to evaluate document
def evaluate_document(documents, vectorstore, test_query="What is Lambda in computing?", chain=None):
    """Print corpus size statistics and the chain's answer to one probe question.

    Args:
        documents: Iterable of chunk objects exposing a ``page_content`` string.
        vectorstore: Accepted for call compatibility; not used by this function.
        test_query: Question sent to the chain as a smoke test.
        chain: Callable taking ``{"question", "chat_history"}`` and returning a
            mapping with an ``"answer"`` key. Defaults to the notebook-level
            ``qa_chain``.

    Returns:
        None. All results are printed.
    """
    if chain is None:
        # Resolved at call time so the default follows the notebook's current chain.
        # NOTE(review): that chain is created with conversation memory attached, so
        # this probe presumably ends up in the chat history too - confirm.
        chain = qa_chain

    # "Tokens" here are whitespace-separated words, not model tokenizer tokens
    total_tokens = sum(len(doc.page_content.split()) for doc in documents)

    # Calculate number of chunks
    num_chunks = len(documents)

    # Example for testing response quality (evaluation)
    result = chain({"question": test_query, "chat_history": []})
    response = result['answer']

    # Print Evaluation Metrics
    print(f"Total Tokens Processed: {total_tokens}")
    print(f"Total Chunks Created: {num_chunks}")
    print(f"Test Question: {test_query}")
    print(f"Test Response: {response}\n")

# Run the evaluation on the chunk list and vector store created in the earlier cells
evaluate_document(documents, vectorstore)


In [None]:
import gradio as gr

In [None]:
!pip install -q gradio

In [None]:
def chat_with_docs(query):
    """Answer one user question through the retrieval chain.

    The chain is constructed with ``memory=memory``; that memory provides the
    ``chat_history`` input and records each question/answer pair after the
    call. Appending to ``memory.buffer`` by hand would store every turn a
    second time (and as a tuple among message objects), so it is not done here.

    Args:
        query: The user's question text.

    Returns:
        The chain's answer string, or a short hint when ``query`` is blank.
    """
    if not query or not query.strip():
        # Avoid a retrieval + generation round on empty input
        return "Please enter a question."

    result = qa_chain({"question": query})
    return result['answer']

# Gradio Interface
iface = gr.Interface(
    fn=chat_with_docs,
    # The `gr.inputs` namespace was removed in Gradio 4; the top-level
    # `gr.Textbox` component takes the same `lines` / `placeholder` arguments.
    inputs=gr.Textbox(lines=2, placeholder="Ask me anything related to your documents..."),
    outputs="text",
    title="Document Chatbot",
    description="Chat with your uploaded documents using a large language model."
)

# share=True requests a public share link in addition to the local server
iface.launch(share=True)
