In [1]:
# DXC Technologies 1B
# EU AI ACT Chatbot
# RAG Implementation

In [2]:
!pip install chromadb
!pip install pymupdf
!pip install cohere
!pip install hnswlib
from typing import List, Dict
import hnswlib
import fitz  # PyMuPDF

Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.28.2-py3

In [3]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import logging  # Setup logging

# basicConfig() does nothing when the root logger already has handlers
# configured, in which case the INFO progress messages emitted below would
# never be shown. force=True replaces any existing root handlers so the
# INFO level configured here actually takes effect.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

In [5]:
import os

import cohere

# Trial API key: taken from the CO_API_KEY environment variable when it is
# set, so the real key does not have to be saved inside the notebook.
# Falls back to the placeholder string, which must be replaced before running.
co = cohere.Client(os.environ.get("CO_API_KEY", 'YOUR-API-KEY'))

In [6]:
#TEXT DATA CLEANING
import spacy

# Load spacy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text: str) -> str:
    """
    Removes stop words, punctuation and whitespace-only tokens from a text.

    Parameters:
    text (str): The raw text to clean.

    Returns:
    str: The remaining tokens joined by single spaces; an empty string when
    `text` is empty.
    """
    if not text:
        return ""

    # Only the tokenizer is run: is_stop / is_punct / is_space are lexical
    # flags that do not need the tagger, parser or NER components. Skipping
    # those components is much faster and avoids the nlp.max_length check
    # that the full pipeline applies to very long inputs (e.g. a whole PDF).
    doc = nlp.tokenizer(text)

    # is_space drops tokens made purely of whitespace (such as raw newlines
    # coming out of PDF extraction), which would otherwise be kept verbatim.
    filtered_tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(filtered_tokens)

In [7]:
class Vectorstore:
    """
    A class representing a collection of documents stored in a vector-based index.

    Parameters:
    source_data (list): A list of dictionaries representing the original documents. Each dictionary must have 'title' and 'text' keys.

    Attributes:
    source_data (list): The original documents.
    chunks (list): A list of dictionaries representing the segmented documents, each containing 'title' and 'content'.
    embeddings (list): A list of embeddings for each document chunk.
    total_chunks (int): The count of document chunks in the collection.
    index (hnswlib.Index): The vector index used for fast retrieval, or None when there was nothing to index.
    search_limit (int): Maximum number of chunks fetched from the index per query.
    refine_limit (int): Maximum number of chunks kept after reranking.

    Methods:
    process_documents(): Loads the data and divides the content into smaller chunks.
    segment_text(): Cleans a text and splits it into size-limited segments.
    generate_embeddings(): Creates embeddings for each chunk using the Cohere API.
    build_index(): Builds an index of document chunks for efficient searching.
    search(): Retrieves relevant document chunks based on a user query.
    """

    def __init__(self, source_data: List[Dict[str, str]]):
        self.source_data = source_data
        self.chunks = []
        self.embeddings = []
        self.total_chunks = 0
        self.index = None
        self.search_limit = 10
        self.refine_limit = 3
        self.process_documents()
        self.generate_embeddings()
        self.build_index()


    def process_documents(self) -> None:
        """
        Processes the raw document text and divides it into chunks.

        Rebuilds `self.chunks` from scratch, so calling it again does not
        duplicate chunks.
        """
        logger.info("Processing documents...")

        self.chunks = []
        for source in self.source_data:
            title = source["title"]
            content = source["text"]

            for segment in self.segment_text(content):
                self.chunks.append(
                    {
                        "title": title,
                        "content": str(segment)
                    }
                )
        logger.info(f"Processed {len(self.chunks)} document chunks.")

    def segment_text(self, content: str, max_size: int = 500) -> List[str]:
        """
        Cleans the text and splits it into segments of at most `max_size` characters.

        Segments are cut on word boundaries so that no word is split across
        two chunks; only a single word longer than `max_size` is cut hard.

        Parameters:
        content (str): The raw text to split.
        max_size (int): Maximum number of characters per segment. Must be positive.

        Returns:
        List[str]: The segments, in document order. Empty when the cleaned text has no words.

        Raises:
        ValueError: If `max_size` is not positive.
        """
        if max_size <= 0:
            raise ValueError("max_size must be a positive integer")

        words = preprocess_text(content).split()

        segments = []
        current_words = []
        current_length = 0
        for word in words:
            # A single word that cannot fit in any segment is cut hard.
            while len(word) > max_size:
                if current_words:
                    segments.append(" ".join(current_words))
                    current_words = []
                    current_length = 0
                segments.append(word[:max_size])
                word = word[max_size:]

            # +1 accounts for the joining space when the segment is non-empty.
            added_length = len(word) + (1 if current_words else 0)
            if current_words and current_length + added_length > max_size:
                segments.append(" ".join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length += added_length

        if current_words:
            segments.append(" ".join(current_words))
        return segments


    def generate_embeddings(self) -> None:
        """
        Generates embeddings for each text chunk using the Cohere API.

        Rebuilds `self.embeddings` from scratch and updates `self.total_chunks`.
        Chunks are sent in batches of `batch_limit` texts per request.
        """
        logger.info("Generating embeddings for document chunks...")

        batch_limit = 90
        self.embeddings = []
        self.total_chunks = len(self.chunks)
        for start in range(0, self.total_chunks, batch_limit):
            batch = self.chunks[start : start + batch_limit]
            contents = [item["content"] for item in batch]
            chunk_embeddings = co.embed(
                texts=contents, model="embed-english-v3.0", input_type="search_document"
            ).embeddings
            self.embeddings.extend(chunk_embeddings)

    def build_index(self) -> None:
        """
        Builds a vector index on document chunks for efficient retrieval.

        When there are no embeddings, no index is created and `self.index`
        is set to None; `search` then returns an empty list.
        """
        logger.info("Building index for document chunks...")

        if not self.embeddings:
            logger.warning("No embeddings available; index was not built.")
            self.index = None
            return

        # Take the dimensionality from the data instead of hard-coding it,
        # so a different embedding model does not silently mismatch.
        dimension = len(self.embeddings[0])
        element_count = len(self.embeddings)

        self.index = hnswlib.Index(space="ip", dim=dimension)
        self.index.init_index(max_elements=element_count, ef_construction=512, M=64)
        self.index.add_items(self.embeddings, list(range(element_count)))

        logger.info(f"Index built with {self.index.get_current_count()} document chunks.")

    def search(self, query: str) -> List[Dict[str, str]]:
        """
        Searches for relevant document chunks based on the input query.

        Parameters:
        query (str): The search query.

        Returns:
        List[Dict[str, str]]: A list of dictionaries representing the retrieved document chunks, each with 'title' and 'content'. Empty when the index holds no chunks.
        """
        if self.index is None:
            return []
        indexed_count = self.index.get_current_count()
        if indexed_count == 0:
            return []

        # Initial retrieval
        query_embedding = co.embed(
            texts=[query], model="embed-english-v3.0", input_type="search_query"
        ).embeddings

        # Asking the index for more neighbours than it holds raises an error,
        # so never request more than the number of indexed chunks.
        neighbour_count = min(self.search_limit, indexed_count)
        labels = self.index.knn_query(query_embedding, k=neighbour_count)[0][0]
        retrieved_ids = [int(label) for label in labels]

        # Refining the results
        refine_fields = ["title", "content"]  # Using title and content fields for reranking

        docs_to_refine = [self.chunks[doc_id] for doc_id in retrieved_ids]
        refined_results = co.rerank(
            query=query,
            documents=docs_to_refine,
            top_n=min(self.refine_limit, len(docs_to_refine)),
            model="rerank-english-v3.0",
            rank_fields=refine_fields
        )

        # result.index is a position within docs_to_refine; map it back to
        # the id of the chunk in self.chunks.
        refined_ids = [retrieved_ids[result.index] for result in refined_results.results]

        return [
            {
                "title": self.chunks[doc_id]["title"],
                "content": self.chunks[doc_id]["content"],
            }
            for doc_id in refined_ids
        ]

In [9]:
def load_pdf(file_path):
    """
    Extracts the text of every page of a PDF file.

    Parameters:
    file_path: Path of the PDF file to read.

    Returns:
    str: The concatenated text of all pages, in page order. If the file
    cannot be opened or read, the error is logged and a single space is
    returned instead of raising.
    """
    try:
        # The context manager closes the document even if extraction fails,
        # so the file handle is not leaked.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort by design: log and hand back a placeholder string.
        logger.error(f"Error reading PDF: {e}")
        return " "

# Location of the EU AI Act PDF (placeholder: replace before running).
file_path = 'YOUR-FILEPATH-TO-THE-EU-AI-ACT'

# Pull the text out of the PDF and wrap it as the single source document.
pdf_text = load_pdf(file_path)
source_data = [dict(title="Artifical_Intelligence_Act", text=pdf_text)]

# Chunk, embed and index the document.
vectorstore = Vectorstore(source_data)

In [31]:
def chatbot_query(query: str, vectorstore: Vectorstore) -> str:
    """
    Answers a question using chunks retrieved from the vector store as context.

    Parameters:
    query (str): The user's question.
    vectorstore (Vectorstore): The store searched for relevant chunks.

    Returns:
    str: The text of the first generation returned by the Cohere API.
    """
    # Search vectorstore for relevant chunks
    relevant_chunks = vectorstore.search(query)
    # Concatenate chunk contents for context
    context = " ".join(chunk["content"] for chunk in relevant_chunks)
    # Append context to the query
    full_query = f"{context}\n\n{query}"

    # Reuse the module-level Cohere client `co` instead of building a new
    # client (with a second hard-coded key) on every call.
    response = co.generate(
        model='command-xlarge-nightly',
        prompt=full_query,
        max_tokens=150
    )
    return response.generations[0].text

In [None]:
# Example queries 1-10, asked in order against the same vectorstore.
# Kept as one list and one loop instead of ten copy-pasted cells so that a
# question can be added or changed in a single place.
EXAMPLE_QUERIES = [
    "What is the EU AI act?",
    "What is high risk and minimal risk?",
    "What is impact on health, safety, and fundamental rights considered for EU AI act?",
    "What is impact on health, safety, and fundamental rights considered?",
    "What year was the EU AI Act proposed?",
    "What year was the EU AI Act passed?",
    "Explain the EU AI act risk-based approach",
    "What did AlgorithmWatch underline?",
    "What are 3 prohibited AI practices?",
    "What did Ebers et al. stress that the AI act lacks?",
]

for example_number, query in enumerate(EXAMPLE_QUERIES, start=1):
    response_text = chatbot_query(query, vectorstore)
    # Label each answer so the output can be matched to its question.
    print(f"# Example query {example_number}: {query}")
    print(response_text)
    print()