In [1]:
# Install Required Packages
# langchain: Framework for developing LLM applications; provides chains and agents
# chromadb: Vector database for storing and retrieving document embeddings
# sentence_transformers: Creates text embeddings using transformer models
# gradio: Creates web UI interfaces for ML models
# python-dotenv: Loads environment variables from .env files
# openai: OpenAI API wrapper for accessing GPT models
# tiktoken: OpenAI's tokenizer for text processing
# langchain-community: Community-maintained integrations for LangChain
# pypdf: PDF document parsing and manipulation
# sympy: Symbolic mathematics library (though not used in current code)

!pip install langchain chromadb sentence_transformers gradio python-dotenv openai tiktoken
!pip install langchain-community
!pip install pypdf
!pip install -U sympy

# os: Operating system interface for file/path operations
# gradio: Web UI framework for ML models
# PyPDFLoader: Loads and parses PDF documents
# RecursiveCharacterTextSplitter: Splits text into smaller chunks recursively
# HuggingFaceEmbeddings: Creates text embeddings using HuggingFace models
# Chroma: Vector store for document embeddings
# OpenAI: Interface for OpenAI's language models
# RetrievalQA: Chain for question-answering over documents
# tempfile: Creates temporary files/directories
# hashlib: Implements various hash algorithms (SHA-256 here)
# sqlite3: SQLite database interface
# datetime: Date and time handling
# uuid: Generates unique identifiers
# shutil: High-level file operations

import os
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
import tempfile
import hashlib
import sqlite3
from datetime import datetime
import uuid
import shutil

# Set your OpenAI API key
# The key is read from the Colab secret named 'OA_API' and exported as the
# OPENAI_API_KEY environment variable so client libraries created later in
# this notebook can pick it up.
from google.colab import userdata
import openai  # required: the assignment to `openai.api_key` below needs the module in scope

api_key = userdata.get('OA_API')
if not api_key:
    # Fail here with a clear message instead of a confusing error from
    # os.environ (None value) or from the first API call (empty key).
    raise ValueError("Colab secret 'OA_API' is missing or empty; set it before running this cell")
os.environ['OPENAI_API_KEY'] = api_key
openai.api_key = api_key

# Database manager class that creates and maintains a SQLite table for storing document metadata: file name, access role, upload timestamp, and uploader.

class DocumentDatabase:
    """SQLite-backed store for metadata about uploaded documents.

    Each row of the ``documents`` table records the document id, its file
    name, the access role required to read it ('ADMIN', 'REVIEWER' or
    'BASIC', enforced by a CHECK constraint), the upload timestamp and the
    uploading user.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'documents.db'`` in the current working directory. Pass
            ``':memory:'`` for a throwaway in-memory database.

    Attributes:
        conn: The open ``sqlite3.Connection``. It is created with
            ``check_same_thread=False`` so it can be used from threads other
            than the one that created it.
    """

    def __init__(self, db_path='documents.db'):
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.create_tables()

    def create_tables(self):
        """Create the ``documents`` table if it does not exist yet.

        Safe to call repeatedly; existing data is left untouched.
        """
        # Using the connection as a context manager commits on success and
        # rolls back if the statement raises.
        with self.conn:
            self.conn.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id TEXT PRIMARY KEY,
                filename TEXT NOT NULL,
                role TEXT CHECK(role IN ('ADMIN', 'REVIEWER', 'BASIC')) NOT NULL,
                upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                uploaded_by TEXT NOT NULL
            )
            ''')

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

# Database manager for user authentication: handles user accounts, roles, and credentials.

class UserDatabase:
    """SQLite-backed store for user accounts, roles and password digests.

    On construction the ``users`` table is created if needed and an initial
    ``admin`` account is inserted when no user of that name exists.

    NOTE(security): the seeded admin account uses the hard-coded password
    ``admin123``; change it before exposing the application.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'users.db'`` in the current working directory. Pass
            ``':memory:'`` for a throwaway in-memory database.

    Attributes:
        conn: The open ``sqlite3.Connection`` (created with
            ``check_same_thread=False``).
    """

    # Roles accepted by the CHECK constraint on the ``users.role`` column.
    VALID_ROLES = ('ADMIN', 'REVIEWER', 'BASIC')

    def __init__(self, db_path='users.db'):
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.create_tables()

    @staticmethod
    def _hash_password(password):
        """Return the hex SHA-256 digest stored in the ``password`` column."""
        # NOTE(security): unsalted SHA-256 is weak for password storage. The
        # scheme is kept unchanged because RoleBasedRAG.authenticate hashes
        # the submitted password the same way and compares digests in SQL;
        # both sides must be migrated together (e.g. to hashlib.scrypt).
        return hashlib.sha256(password.encode()).hexdigest()

    def create_tables(self):
        """Create the ``users`` table and seed the initial admin account.

        Safe to call repeatedly: the table is only created when missing and
        the admin row is inserted with ``INSERT OR IGNORE``.
        """
        # The connection context manager commits both statements together
        # and rolls back if either one raises.
        with self.conn:
            self.conn.execute('''
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE NOT NULL,
                password TEXT NOT NULL,
                role TEXT CHECK(role IN ('ADMIN', 'REVIEWER', 'BASIC')) NOT NULL DEFAULT 'BASIC',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                last_login TIMESTAMP
            )
            ''')

            # Create initial admin user if not exists
            self.conn.execute('''
            INSERT OR IGNORE INTO users (username, password, role)
            VALUES (?, ?, ?)
            ''', ('admin', self._hash_password('admin123'), 'ADMIN'))

    def add_user(self, username, password, role):
        """Insert a new user account.

        Args:
            username: Unique login name; must be non-empty.
            password: Plain-text password; must be non-empty. Only its
                digest is stored.
            role: One of ``VALID_ROLES``.

        Returns:
            A ``(success, message)`` tuple. ``success`` is ``False`` when the
            input is invalid, the username is taken, or the insert fails.
        """
        if not username or not password:
            return False, "Username and password are required"

        # Validate the role here: otherwise the table's CHECK constraint
        # raises IntegrityError, which would be misreported below as a
        # duplicate username.
        if role not in self.VALID_ROLES:
            allowed = ', '.join(self.VALID_ROLES)
            return False, f"Invalid role: {role!r}. Must be one of: {allowed}"

        try:
            # Commits on success, rolls back on any exception.
            with self.conn:
                self.conn.execute('''
                INSERT INTO users (username, password, role)
                VALUES (?, ?, ?)
                ''', (username, self._hash_password(password), role))
        except sqlite3.IntegrityError:
            return False, "Username already exists"
        except Exception as e:
            return False, f"Error creating user: {str(e)}"

        return True, "User created successfully"

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

# RoleBasedRAG manages document processing, user authentication, and question answering, with role-based access control over documents, using an embedding model and an LLM.

class RoleBasedRAG:
    """Role-aware retrieval-augmented question answering over uploaded PDFs.

    The class ties together:

    * a user database (login and user creation),
    * a document-metadata database (which role may read which document),
    * one Chroma vector store per document, persisted under
      ``<persist_directory>/<doc_id>``,
    * an LLM used through a ``RetrievalQA`` chain.

    Access rule applied by ``query_documents``: 'ADMIN' users see every
    document; any other role sees documents tagged with its own role plus
    documents tagged 'BASIC'.

    NOTE(review): the logged-in user and role are stored on the instance
    (``current_user`` / ``current_role``). If a single instance serves several
    browser sessions, they all share that login state. Confirm whether
    per-session state is required before exposing this publicly.
    """

    # Roles accepted by the CHECK constraints of the users/documents tables.
    VALID_ROLES = ('ADMIN', 'REVIEWER', 'BASIC')

    def __init__(self):
        self.embeddings = HuggingFaceEmbeddings()
        self.llm = OpenAI()
        self.user_db = UserDatabase()
        self.doc_db = DocumentDatabase()
        # Root directory; each document gets its own sub-directory named by id.
        self.persist_directory = "chroma_db"
        # Login state; both stay None until authenticate() succeeds.
        self.current_user = None
        self.current_role = None

    def authenticate(self, username, password):
        """Check credentials and, on success, record the login.

        Args:
            username: Login name.
            password: Plain-text password; its SHA-256 hex digest is compared
                with the stored digest.

        Returns:
            ``(True, message)`` on success, ``(False, "Invalid credentials")``
            otherwise. On success ``current_user`` and ``current_role`` are
            set and the user's ``last_login`` column is updated. A failed
            attempt leaves any existing login state unchanged.
        """
        # A missing value can never match; avoids AttributeError on None.encode().
        if username is None or password is None:
            return False, "Invalid credentials"

        cursor = self.user_db.conn.cursor()
        hashed_password = hashlib.sha256(password.encode()).hexdigest()

        cursor.execute('''
        SELECT role FROM users
        WHERE username = ? AND password = ?
        ''', (username, hashed_password))

        result = cursor.fetchone()
        if not result:
            return False, "Invalid credentials"

        # The users table has a last_login column; keep it current.
        cursor.execute('''
        UPDATE users SET last_login = CURRENT_TIMESTAMP
        WHERE username = ?
        ''', (username,))
        self.user_db.conn.commit()

        self.current_user = username
        self.current_role = result[0]
        return True, f"Login successful. Role: {result[0]}"

    def process_document(self, file_path, role):
        """Index a PDF and register it under the given access role.

        The PDF is loaded, tagged with ``role`` and a fresh document id,
        split into overlapping chunks, embedded into a new Chroma store, and
        finally recorded in the document database.

        Args:
            file_path: Path of the PDF file to load.
            role: Access role for the document; one of ``VALID_ROLES``.

        Returns:
            A ``(success, message)`` tuple. Failures never raise; the error
            text is returned in ``message``.
        """
        if not self.current_user:
            return False, "Please login first"

        # Reject a bad role before the expensive load/embed work. Otherwise
        # the failure would only surface at the SQLite insert, after the
        # vector store had already been written to disk.
        if role not in self.VALID_ROLES:
            allowed = ', '.join(self.VALID_ROLES)
            return False, f"Invalid role: {role!r}. Must be one of: {allowed}"

        # Generate unique ID for document
        doc_id = str(uuid.uuid4())
        persist_dir = os.path.join(self.persist_directory, doc_id)

        try:
            # Load and split document
            loader = PyPDFLoader(file_path)
            documents = loader.load()

            # Add metadata to each loaded document; the splitter copies it
            # onto every chunk.
            for doc in documents:
                doc.metadata['role'] = role
                doc.metadata['doc_id'] = doc_id

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200
            )
            texts = text_splitter.split_documents(documents)
            if not texts:
                return False, "Error processing document: no extractable text found"

            # Store in Chroma
            vectordb = Chroma.from_documents(
                documents=texts,
                embedding=self.embeddings,
                persist_directory=persist_dir
            )
            vectordb.persist()

            # Store document metadata in SQLite
            cursor = self.doc_db.conn.cursor()
            cursor.execute('''
            INSERT INTO documents (id, filename, role, uploaded_by)
            VALUES (?, ?, ?, ?)
            ''', (doc_id, os.path.basename(file_path), role, self.current_user))
            self.doc_db.conn.commit()

        except Exception as e:
            # Do not leave a vector store on disk that no database row
            # refers to, nor a half-finished transaction.
            shutil.rmtree(persist_dir, ignore_errors=True)
            self.doc_db.conn.rollback()
            return False, f"Error processing document: {str(e)}"

        return True, f"Document processed successfully with ID: {doc_id}"

    def query_documents(self, question):
        """Answer a question from every document the current role may read.

        One ``RetrievalQA`` chain is run per accessible document whose
        vector store exists on disk; the answers are joined with blank
        lines, each prefixed with its document id.

        Args:
            question: The user's question.

        Returns:
            The combined answer text, or a human-readable status/error
            message. This method does not raise.
        """
        if not self.current_user or not self.current_role:
            return "Please login first"

        # Avoid one LLM call per document for an empty prompt.
        if not question or not question.strip():
            return "Please enter a question"

        try:
            # Get all document IDs accessible to user's role
            cursor = self.doc_db.conn.cursor()
            if self.current_role == 'ADMIN':
                cursor.execute('SELECT id FROM documents')
            else:
                cursor.execute('''
                SELECT id FROM documents
                WHERE role = ? OR role = 'BASIC'
                ''', (self.current_role,))

            accessible_ids = [row[0] for row in cursor.fetchall()]
            if not accessible_ids:
                return "No accessible documents found"

            combined_results = []

            # Query each accessible document
            for doc_id in accessible_ids:
                persist_dir = os.path.join(self.persist_directory, doc_id)
                if not os.path.exists(persist_dir):
                    # Metadata row without a vector store on disk: skip it.
                    continue

                vectordb = Chroma(
                    persist_directory=persist_dir,
                    embedding_function=self.embeddings
                )

                qa = RetrievalQA.from_chain_type(
                    llm=self.llm,
                    chain_type="stuff",
                    retriever=vectordb.as_retriever()
                )

                result = qa.run(question)
                combined_results.append(f"Document {doc_id}: {result}")

            # Every accessible document lacked a store: report that instead
            # of returning an empty string.
            if not combined_results:
                return "No accessible documents found"

            return "\n\n".join(combined_results)

        except Exception as e:
            return f"Error querying documents: {str(e)}"

    def add_new_user(self, username, password, role):
        """Create a user account; only a logged-in 'ADMIN' may do so.

        Args:
            username: Login name for the new account.
            password: Plain-text password for the new account.
            role: Role to assign to the new account.

        Returns:
            The status message from the user database, or a refusal message
            when the current user is not an administrator.
        """
        if not self.current_user or self.current_role != 'ADMIN':
            return "Only administrators can add new users"

        _success, message = self.user_db.add_user(username, password, role)
        return message

# Interface creates and manages the Gradio web UI, with tabs for login, user management, document upload, and document querying, and connects UI actions to RoleBasedRAG functionality.

class Interface:
    """Gradio front end that forwards UI events to a RoleBasedRAG instance.

    The handler methods (`login`, `add_user`, `upload_document`,
    `ask_question`) each return a single string that is shown in the
    corresponding status/answer textbox.
    """

    def __init__(self):
        self.rag = RoleBasedRAG()

    def login(self, username, password):
        """Authenticate and return only the status message."""
        _ok, status = self.rag.authenticate(username, password)
        return status

    def add_user(self, username, password, role):
        """Ask the backend to create a user and return its message."""
        outcome = self.rag.add_new_user(username, password, role)
        return outcome

    def upload_document(self, file, role):
        """Index the uploaded file under `role`; return the status message.

        `file` is the value delivered by the upload component; its `name`
        attribute is passed on as the path to process.
        """
        if file:
            _ok, status = self.rag.process_document(file.name, role)
            return status
        return "Please upload a file"

    def ask_question(self, question):
        """Return the backend's answer text for `question`."""
        answer = self.rag.query_documents(question)
        return answer

    def create_interface(self):
        """Build the four-tab Blocks UI and return it (not yet launched)."""
        role_choices = ('ADMIN', 'REVIEWER', 'BASIC')

        with gr.Blocks() as demo:
            gr.Markdown("# Role-Based RAG System")

            with gr.Tab("Login"):
                user_box = gr.Textbox(label="Username")
                pass_box = gr.Textbox(label="Password", type="password")
                login_button = gr.Button("Login")
                login_status = gr.Textbox(label="Status")

            with gr.Tab("User Management"):
                gr.Markdown("## Add New User (Admin Only)")
                new_user_box = gr.Textbox(label="New Username")
                new_pass_box = gr.Textbox(label="New Password", type="password")
                new_role_menu = gr.Dropdown(
                    choices=list(role_choices),
                    label="User Role",
                    value='BASIC'
                )
                add_user_button = gr.Button("Add User")
                add_user_status = gr.Textbox(label="Status")

            with gr.Tab("Document Upload"):
                pdf_picker = gr.File(label="Upload PDF")
                doc_role_menu = gr.Dropdown(
                    choices=list(role_choices),
                    label="Document Access Role",
                    value='BASIC'
                )
                upload_button = gr.Button("Upload")
                upload_status = gr.Textbox(label="Upload Status")

            with gr.Tab("Query"):
                question_box = gr.Textbox(label="Question")
                ask_button = gr.Button("Ask")
                answer_box = gr.Textbox(label="Answer")

            # (button, handler, input components, output component),
            # registered in this order.
            wiring = (
                (login_button, self.login,
                 [user_box, pass_box], login_status),
                (add_user_button, self.add_user,
                 [new_user_box, new_pass_box, new_role_menu], add_user_status),
                (upload_button, self.upload_document,
                 [pdf_picker, doc_role_menu], upload_status),
                (ask_button, self.ask_question,
                 [question_box], answer_box),
            )
            for button, handler, sources, target in wiring:
                button.click(handler, inputs=sources, outputs=target)

        return demo

# Create and launch interface
if __name__ == "__main__":
    # Build the application object, then its Gradio UI, then start serving.
    # share=True requests a public share link in addition to the local server.
    interface = Interface()
    demo = interface.create_interface()
    demo.launch(share=True)

Collecting sympy (from onnxruntime>=1.14.1->chromadb)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.13.3:
      Successfully uninstalled sympy-1.13.3
Successfully installed sympy-1.13.1
Collecting sympy
  Using cached sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Using cached sympy-1.13.3-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.0+cu121 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.3 which is incompa

  self.embeddings = HuggingFaceEmbeddings()
  self.embeddings = HuggingFaceEmbeddings()
  self.llm = OpenAI()


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://31b989166798fd5d55.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
