## Backend

### Dense Retriever (Contextual retriever)

In [None]:
# !pip3 install PyPDF2 faiss-cpu python-docx streamlit

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting streamlit
  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.

In [12]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
import docx
import openai
import streamlit as st

In [13]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class DenseRetriever:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.documents = []

    def build_index(self, documents):
        """Build a FAISS index for the given documents."""
        self.documents = documents
        embeddings = self.model.encode(documents, show_progress_bar=True)
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)

    def retrieve(self, query, k=5):
        """Retrieve the top-k most relevant documents for a query."""
        query_vector = self.model.encode([query])
        distances, indices = self.index.search(query_vector, k)
        results = [(self.documents[i], distances[0][idx]) for idx, i in enumerate(indices[0])]
        return results



In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer

class RAGSystem:
    def __init__(self, retriever):
        self.retriever = retriever
        self.tokenizer = AutoTokenizer.from_pretrained("distilgpt2")   # Using hugging face model
        self.model = AutoModelForCausalLM.from_pretrained("distilgpt2")

    def generate_response(self, query, k=5):
        retrieved_docs = self.retriever.retrieve(query, k)
        context = "\n".join([f"{i+1}. {doc}" for i, (doc, _) in enumerate(retrieved_docs)])
        prompt = f"Answer the query based on the following documents:\n{context}\n\nQuery: {query}\nResponse:"

        # Tokenize and generate a response
        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        outputs = self.model.generate(inputs["input_ids"], max_length=200, num_return_sequences=1, do_sample=True)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return response.strip(), retrieved_docs



In [None]:
if __name__ == "__main__":
    sample_documents = [
        "The Eiffel Tower is located in Paris, France.",
        "The Great Wall of China is one of the Seven Wonders of the World.",
        "Python is a popular programming language known for its simplicity.",
        "The capital of Germany is Berlin.",
        "Mount Everest is the tallest mountain in the world."
    ]

    sample_query = "What is the capital of Germany?"

    # Test DenseRetriever
    print("Testing DenseRetriever...")
    retriever = DenseRetriever()
    retriever.build_index(sample_documents)
    retrieved_docs = retriever.retrieve(sample_query, k=3)

    print("\nRetrieved Documents:")
    for i, (doc, score) in enumerate(retrieved_docs, 1):
        print(f"{i}. {doc} (Score: {score:.2f})")

    # Test RAGSystem
    print("\nTesting RAGSystem...")
    rag_system = RAGSystem(retriever)
    response, retrieved_docs = rag_system.generate_response(sample_query, k=3)

    print("\nGenerated Response:")
    print(response)

    print("\nRetrieved Documents Used for Response:")
    for i, (doc, score) in enumerate(retrieved_docs, 1):
        print(f"{i}. {doc} (Score: {score:.2f})")

Testing DenseRetriever...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Retrieved Documents:
1. The capital of Germany is Berlin. (Score: 0.30)
2. The Eiffel Tower is located in Paris, France. (Score: 1.62)
3. The Great Wall of China is one of the Seven Wonders of the World. (Score: 1.68)

Testing RAGSystem...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated Response:
Answer the query based on the following documents:
1. The capital of Germany is Berlin.
2. The Eiffel Tower is located in Paris, France.
3. The Great Wall of China is one of the Seven Wonders of the World.

Query: What is the capital of Germany?
Response: We did get a little info from a German government, but I'm not sure whether the government, or the government can get a reply. You just wrote this reply to Facebook.
Question: Can I reply ?
Answer: Yes, I can.
Response: If I reply the following query then you can use a different data structure;
Query: Who are the "The Wall or the Great Wall of China"?
Response: What is the capital of Germany?
Question: Because I'm not sure.
Answer: No.
Question: You don't know! You just write this reply to facebook. So instead of saying the following query I was

Retrieved Documents Used for Response:
1. The capital of Germany is Berlin. (Score: 0.30)
2. The Eiffel Tower is located in Paris, France. (Score: 1.62)
3. The Great Wall

In [18]:
if __name__ == "__main__":
    import os

    # Check if "test_file.txt" exists
    file_name = "test_file.txt"
    if not os.path.exists(file_name):
        print(f"Error: '{file_name}' does not exist.")
        exit(1)

    # Read content from "test_file.txt"
    print(f"Reading content from '{file_name}'...")
    with open(file_name, "r", encoding="utf-8") as file:
        sample_documents = file.read().splitlines()  # Each line as a document

    # Ensure documents are not empty
    sample_documents = [doc for doc in sample_documents if doc.strip()]
    if not sample_documents:
        print(f"Error: '{file_name}' does not contain any valid content.")
        exit(1)

    # Sample query
    sample_query = "What is the book's name?"

    # Test DenseRetriever
    print("\nTesting DenseRetriever...")
    retriever = DenseRetriever()
    retriever.build_index(sample_documents)
    retrieved_docs = retriever.retrieve(sample_query, k=3)

    print("\nRetrieved Documents:")
    for i, (doc, score) in enumerate(retrieved_docs, 1):
        print(f"{i}. {doc} (Score: {score:.2f})")

    # Test RAGSystem
    print("\nTesting RAGSystem...")
    rag_system = RAGSystem(retriever)
    response, retrieved_docs = rag_system.generate_response(sample_query, k=3)

    print("\nGenerated Response:")
    print(response)

    print("\nRetrieved Documents Used for Response:")
    for i, (doc, score) in enumerate(retrieved_docs, 1):
        print(f"{i}. {doc} (Score: {score:.2f})")

Reading content from 'test_file.txt'...

Testing DenseRetriever...


Batches:   0%|          | 0/134 [00:00<?, ?it/s]


Retrieved Documents:
1. There is one little book that I wish could be put into the hands of every\ (Score: 0.82)
2. *** END OF THE PROJECT GUTENBERG EBOOK THE WOMAN OF TO-MORROW ***\ (Score: 0.94)
3. *** START OF THE PROJECT GUTENBERG EBOOK THE WOMAN OF TO-MORROW ***\ (Score: 0.96)

Testing RAGSystem...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated Response:
Answer the query based on the following documents:
1. There is one little book that I wish could be put into the hands of every\
2. *** END OF THE PROJECT GUTENBERG EBOOK THE WOMAN OF TO-MORROW ***\
3. *** START OF THE PROJECT GUTENBERG EBOOK THE WOMAN OF TO-MORROW ***\

Query: What is the book's name?
Response: A. He is a pseudonym, and we will tell you what he is. He is a writer of books, and as a woman, I was very proud of him to be a member of "The Women's Project" (WEP) which is to create the book and the book.

Retrieved Documents Used for Response:
1. There is one little book that I wish could be put into the hands of every\ (Score: 0.82)
2. *** END OF THE PROJECT GUTENBERG EBOOK THE WOMAN OF TO-MORROW ***\ (Score: 0.94)
3. *** START OF THE PROJECT GUTENBERG EBOOK THE WOMAN OF TO-MORROW ***\ (Score: 0.96)


In [15]:
def process_uploaded_file(file):
    """Extract text from different document types."""
    if file.type == "text/plain":
        return file.read().decode("utf-8").split("\n")
    elif file.type == "application/pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"
        return text.split("\n")
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = docx.Document(file)
        text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
        return text.split("\n")
    else:
        raise ValueError("Unsupported file type!")



In [16]:
# Streamlit UI
st.title("Contextual Retrieval-Augmented Generation (RAG) System")
uploaded_file = st.file_uploader("Upload a document (TXT, PDF, DOCX)", type=["txt", "pdf", "docx"])

retriever = DenseRetriever()
rag_system = RAGSystem(retriever)

if uploaded_file:
    try:
        documents = process_uploaded_file(uploaded_file)
        retriever.build_index(documents)
        st.success("Document index built successfully!")
    except ValueError as e:
        st.error(str(e))

query = st.text_input("Enter your query:")
if query and retriever.index:
    response, retrieved_docs = rag_system.generate_response(query)
    st.subheader("Response")
    st.write(response)

    st.subheader("Retrieved Documents")
    for i, (doc, score) in enumerate(retrieved_docs):
        st.write(f"{i+1}. {doc} (Score: {score})")

2024-11-23 03:07:51.494 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-11-23 03:07:53.052 Session state does not function when running a script without `streamlit run`


In [17]:
!streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.230.68.222:8501[0m
[0m
[34m  Stopping...[0m
^C
