<a href="https://colab.research.google.com/github/mehdihoore/AstraChatbot/blob/main/chunkAstradb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import uuid
from pathlib import Path
from typing import Any, List, Tuple, Optional
from dataclasses import dataclass
from langchain.vectorstores import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import Docx2txtLoader  # Updated import path for DocxLoader
from langchain.document_loaders import PyPDFLoader  # Updated import path for PDFLoader
from langchain.document_loaders import TextLoader
from astrapy import DataAPIClient
import re
from google.colab import userdata
from astrapy.constants import VectorMetric
from astrapy.database import Database
from astrapy.collection import Collection
from astrapy.exceptions import CollectionAlreadyExistsException


# Constants and configurations
# File extensions (lowercase, without the leading dot) accepted by
# DocumentProcessor.load_file; anything else raises ValueError there.
TEXT_FILE_TYPES = ["txt", "docx", "pdf"]
# Credentials and endpoint are looked up by key through google.colab's
# userdata.get, so they are not hard-coded in this file.
ASTRA_DB_APPLICATION_TOKEN = userdata.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_API_ENDPOINT = userdata.get("ASTRA_DB_API_ENDPOINT")
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
# Default embedding model name used by OpenAIEmbeddingsComponent.
# NOTE(review): AstraDBManager.get_or_create_collection defaults to
# dimension=1536; confirm that still matches if this model is changed.
OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-ada-002"  # Change if needed

@dataclass
class Data:
    """Plain record holding the text loaded from one source file."""

    # Text of the loaded document; stays "" when nothing was loaded.
    content: str = ""
    # Reference string for the document; defaults to "" and is not populated
    # anywhere in the visible code.
    references: str = ""

class DocumentProcessor:
    """Load one txt/docx/pdf file from disk into a ``Data`` record.

    Attributes:
        path: Location of the file to load, exactly as given by the caller.
        silent_errors: Stored on the instance; not consulted by this class.
        status: Free-form status string, initialised to "".
    """

    def __init__(self, path: str, silent_errors: bool = False):
        self.path = path
        self.silent_errors = silent_errors
        self.status = ""

    def resolve_path(self, path: str) -> str:
        """Return ``path`` converted to an absolute path."""
        return os.path.abspath(path)

    def load_file(self) -> Tuple[Data, str]:
        """Read the file at ``self.path`` and return its text and base name.

        The loader is chosen from the (lower-cased) file extension. Every
        document returned by the loader contributes to the result: their
        ``page_content`` values are joined with newlines, so loaders that
        return several documents for one file are not truncated to the
        first one.

        Returns:
            A ``(Data, stem)`` tuple where ``stem`` is the file name without
            its extension, or ``(Data(), "")`` when the loader returns
            nothing.

        Raises:
            ValueError: If no path was provided or the extension is not in
                ``TEXT_FILE_TYPES``.
        """
        if not self.path:
            raise ValueError("Please, upload a file to use this component.")
        resolved_path = self.resolve_path(self.path)

        extension = Path(resolved_path).suffix[1:].lower()
        if extension not in TEXT_FILE_TYPES:
            raise ValueError(f"Unsupported file type: {extension}")

        if extension == "docx":
            loader = Docx2txtLoader(resolved_path)
        elif extension == "pdf":
            loader = PyPDFLoader(resolved_path)
        else:  # Treat as text file
            loader = TextLoader(resolved_path)

        documents = loader.load()
        if not documents:
            return Data(), ""

        # Keep the text of every returned document, not just the first one;
        # previously anything after documents[0] was silently dropped.
        content = "\n".join(doc.page_content for doc in documents)
        return Data(content=content), Path(resolved_path).stem

class RecursiveCharacterTextSplitterComponent:
    """Holds chunking settings and applies them with RecursiveCharacterTextSplitter."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, separators: Optional[List[str]] = None):
        # A missing or empty separator list falls back to "." and newline.
        self.separators = separators or [".", "\n"]
        self.chunk_overlap = chunk_overlap
        self.chunk_size = chunk_size

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into chunks using the configured settings.

        A new splitter is built on each call from the current attribute
        values, and its ``split_text`` result is returned unchanged.
        """
        options = {
            "separators": self.separators,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
        }
        return RecursiveCharacterTextSplitter(**options).split_text(text)

class OpenAIEmbeddingsComponent:
    """Holds an API key and model name and embeds texts with OpenAIEmbeddings."""

    def __init__(self, api_key: str, model: str = OPENAI_EMBEDDING_MODEL_NAME):
        self.model = model
        self.api_key = api_key

    def build_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Return the result of ``embed_documents`` for ``texts``.

        A new ``OpenAIEmbeddings`` client is constructed on every call from
        the stored model name and API key.
        """
        embedder = OpenAIEmbeddings(model=self.model, api_key=self.api_key)
        vectors = embedder.embed_documents(texts)
        return vectors

class AstraDBManager:
    """Manage one Astra DB collection and insert embedded chunks into it.

    Attributes:
        api_endpoint: Endpoint passed to ``DataAPIClient.get_database``.
        token: Application token used to build the client.
        collection_name: Collection that ``add_documents`` writes to.
        namespace: Stored on the instance ("default_namespace" when not
            given); it is not passed to ``get_database`` in this class.
        client: The ``DataAPIClient`` built from ``token``.
        database: Database handle obtained from ``client``.
    """

    def __init__(self, api_endpoint: str, token: str, collection_name: str, namespace: Optional[str] = None):
        self.api_endpoint = api_endpoint
        self.token = token
        self.collection_name = collection_name
        self.namespace = namespace or "default_namespace"
        self.client = DataAPIClient(token)
        self.database = self.client.get_database(api_endpoint)

    def get_or_create_collection(self, collection_name: str, dimension: int = 1536):
        """Return the named collection, creating it as a vector collection if absent.

        Args:
            collection_name: Name of the collection to look up or create.
            dimension: Vector dimension used only when the collection is
                created.

        Returns:
            The collection object from ``get_collection`` or
            ``create_collection``.

        Raises:
            Exception: Any error other than "already exists" is printed and
                re-raised.
        """
        try:
            print(f"Checking for collection {collection_name} in database.")
            # Compare against collection *names*. Testing the string against
            # the result of list_collections() never matched, so every call
            # fell through to create_collection and relied on the
            # "already exists" exception instead.
            existing_names = self.database.list_collection_names()
            if collection_name in existing_names:
                print(f"* Collection {collection_name} already exists.")
                return self.database.get_collection(collection_name)
            else:
                print(f"* Collection {collection_name} does not exist. Creating...")
                collection = self.database.create_collection(
                    name=collection_name,  # Use 'name' instead of 'collection_name'
                    dimension=dimension,
                    metric=VectorMetric.COSINE,
                )
                print(f"* Collection {collection_name} created successfully.")
                return collection
        except CollectionAlreadyExistsException:
            # Still possible if the collection appears between the name
            # check and the create call.
            print(f"* Collection {collection_name} already exists. Skipping creation.")
            return self.database.get_collection(collection_name)
        except Exception as e:
            print(f"Error handling collection {collection_name}: {e}")
            raise

    def add_documents(self, embeddings: List[List[float]], doc_name: str, references: str, chunks: List[str]):
        """Insert one document per (chunk, embedding) pair into the collection.

        Each document gets a fresh UUID as ``_id``, the chunk text as
        ``content``, the embedding as ``$vector`` and ``doc_name`` /
        ``references`` under ``metadata``. Insert failures are printed and
        do not stop the remaining inserts.

        Args:
            embeddings: One vector per chunk, in the same order as ``chunks``.
            doc_name: Document name stored in each record's metadata.
            references: Reference string stored in each record's metadata.
            chunks: Text chunks the embeddings were computed from.

        Raises:
            ValueError: If ``embeddings`` and ``chunks`` differ in length.
        """
        # Fail before touching the database: a length mismatch would pair
        # vectors with the wrong text or leave part of the input unstored.
        if len(embeddings) != len(chunks):
            raise ValueError(
                f"Got {len(embeddings)} embeddings for {len(chunks)} chunks; "
                "they must have the same length."
            )

        collection = self.get_or_create_collection(collection_name=self.collection_name)

        for chunk, embedding in zip(chunks, embeddings):
            doc_id = str(uuid.uuid4())
            document = {
                "_id": doc_id,
                "content": chunk,
                "$vector": embedding,
                "metadata": {
                    "doc_name": doc_name,
                    "references": references,
                }
            }
            try:
                result = collection.insert_one(document)
                if result.inserted_id:
                    print(f"Inserted document {doc_id}")
                else:
                    print(f"Failed to insert document {doc_id}")
            except Exception as e:
                # Best-effort: report the failure and continue with the rest.
                print(f"Error processing document {doc_id}: {e}")


def extract_references(text: str) -> str:
    """Return the first reference of the form "2-8-7-22" found in ``text``.

    A reference is four runs of digits separated by hyphens. When the text
    contains none, the literal string "No references" is returned.
    """
    first_hit = re.search(r'\d+-\d+-\d+-\d+', text)
    if first_hit is None:
        return "No references"
    return first_hit.group(0)

def main(file_paths: List[str]):
    """Load, chunk, embed and store every file in ``file_paths``.

    For each path the file is loaded, its text is split into chunks, the
    chunks are embedded, a single reference string is extracted from the
    full text, and everything is written to the "nationbuildingcodes"
    collection. Progress is printed before and after each file.
    """
    # Shared components are built once and reused for all files.
    splitter = RecursiveCharacterTextSplitterComponent()
    embedder = OpenAIEmbeddingsComponent(api_key=OPENAI_API_KEY)
    store = AstraDBManager(
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
        collection_name="nationbuildingcodes",
    )

    for file_path in file_paths:
        print(f"Processing file: {file_path}")

        # Read the file and cut its text into chunks.
        data, doc_name = DocumentProcessor(file_path).load_file()
        chunks = splitter.split_text(data.content)

        # One vector per chunk.
        vectors = embedder.build_embeddings(chunks)

        # The reference is taken from the whole document, so every chunk of
        # a file is stored with the same value.
        references = extract_references(data.content)

        store.add_documents(vectors, doc_name, references, chunks)
        print(f"Finished processing file: {file_path}")

# Script entry point: ingest each listed text file by passing the list to main().
if __name__ == "__main__":
    # Input files Mabhas_02 .. Mabhas_22, all under /content/drive/MyDrive
    # (presumably a Google Drive mounted in Colab; verify before running).
    file_paths = [
        "/content/drive/MyDrive/Mabahes/Mabhas_02.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_03.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_04.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_05.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_06.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_07.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_08.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_09.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_10.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_11.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_12.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_13.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_14.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_15.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_16.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_17.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_18.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_19.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_20.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_21.txt",
        "/content/drive/MyDrive/Mabahes/Mabhas_22.txt",

    ]
    main(file_paths)


Processing file: /content/drive/MyDrive/Mabahes/Mabhas_02.txt
Checking for collection nationbuildingcodes in database.
* Collection nationbuildingcodes does not exist. Creating...
* Collection nationbuildingcodes already exists. Skipping creation.
Inserted document 1426d5f7-f209-47dc-886c-30c0ff7a2fb8
Inserted document 205d82e6-276d-4f8d-88ad-65c3c955b864
Inserted document ae8a42b7-e9bd-42a7-a058-7f7ca137122b
Inserted document 9e54eb66-0e98-4456-bb56-6435674a9205
Inserted document 7d6af745-4ca1-42ed-bd8b-3240e7000793
Inserted document baa3848c-4410-43bb-8c61-3607cf25b5da
Inserted document b48586a0-6f23-43a8-bdd8-9668194ce4e8
Inserted document a1926696-3f1d-41c0-8baa-d16629193ffa
Inserted document 31ad2bad-d244-4c35-8489-9961f2617ba2
Inserted document c73fb076-8385-4947-9ff5-1a2750ac120f
Inserted document 57113a28-d9ec-4611-bfcc-60f1b07a14a7
Inserted document ba4c4bd4-97ce-4fee-9f70-9b8ac83fdd50
Inserted document f21ac25b-61f0-4c6e-9853-823e322c4b2b
Inserted document 2d784dad-bc3a-4783-

In [None]:
import os
import json
import uuid
from pathlib import Path
from typing import Any, List, Tuple, Optional
from dataclasses import dataclass
from langchain.vectorstores import VectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import Docx2txtLoader  # Updated import path for DocxLoader
from langchain.document_loaders import PyPDFLoader  # Updated import path for PDFLoader
from langchain.document_loaders import TextLoader
from astrapy import DataAPIClient
import re
from google.colab import userdata
from astrapy.constants import VectorMetric


# Constants and configurations
# Supported file extensions, lowercase and without the leading dot.
TEXT_FILE_TYPES = ["txt", "docx", "pdf"]
# Credentials and endpoint are looked up by key through google.colab's
# userdata.get, so they are not hard-coded in this file.
ASTRA_DB_APPLICATION_TOKEN = userdata.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_API_ENDPOINT = userdata.get("ASTRA_DB_API_ENDPOINT")
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
# Name of the OpenAI embedding model to use.
OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-ada-002"  # Change if needed


In [None]:
from astrapy.exceptions import CollectionAlreadyExistsException