In [36]:
import copy
import inspect
import io
import os
import shutil
from pathlib import Path

from langchain_text_splitters import RecursiveCharacterTextSplitter

os.chdir("/app")
from uuid import UUID

from IPython.lib.display import IFrame
from sqlmodel.ext.asyncio.session import AsyncSession
from starlette.datastructures import State

from apps.inners.models.daos.document import Document
from apps.inners.models.dtos.contracts.responses.managements.documents.file_document_response import \
    FileDocumentResponse
from apps.inners.models.dtos.contracts.responses.managements.documents.text_document_response import \
    TextDocumentResponse
from apps.inners.models.dtos.contracts.responses.managements.documents.web_document_response import WebDocumentResponse
from apps.inners.use_cases.managements.document_management import DocumentManagement
from apps.inners.use_cases.managements.file_document_management import FileDocumentManagement
from apps.inners.use_cases.managements.text_document_management import TextDocumentManagement
from apps.inners.use_cases.managements.web_document_management import WebDocumentManagement
import hashlib
from typing import List
from typing import TypedDict, Tuple, Dict, Any

import dotenv
import litellm
from datasets import load_dataset
from dotenv import find_dotenv
from langchain_community.chat_models import ChatLiteLLMRouter
from langchain_community.vectorstores.milvus import Milvus
from langchain_core.runnables.base import RunnableLike
from langgraph.graph import StateGraph
from langgraph.graph.graph import CompiledGraph
from ragas import evaluate
from starlette import status
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition
from unstructured.partition.utils.constants import PartitionStrategy

from apps.inners.models.dtos.contracts.result import Result
from apps.inners.use_cases.embeddings.hugging_face_e5_instruct_embedding import HuggingFaceE5InstructEmbeddings
from apps.outers.datastores.four_datastore import FourDatastore
from apps.outers.datastores.one_datastore import OneDatastore
from apps.outers.datastores.three_datastore import ThreeDatastore
from apps.outers.datastores.two_datastore import TwoDatastore
from apps.outers.repositories.file_document_repository import FileDocumentRepository
from apps.outers.repositories.text_document_repository import TextDocumentRepository
from apps.outers.repositories.web_document_repository import WebDocumentRepository
from tests.containers.test_container import TestContainer
from tests.seeders.all_seeder import AllSeeder
import tests

from tools.caller_tool import Caller
from tools import caller_tool
import inspect
import sys
# Display the filename recorded on the code object of the frame executing this cell.
inspect.currentframe().f_code.co_filename

'/tmp/ipykernel_284/3766366053.py'

In [32]:
class BaseException(Exception):
    """Exception that records where it was constructed.

    NOTE: the name shadows the built-in ``BaseException`` inside this notebook;
    it is kept because other cells subclass it under this name.

    Attributes:
        class_name: Name of the class whose method created the exception, or
            ``None`` when the creating frame has no ``self`` local.
        function_name: Name of the function that created the exception, or
            ``None`` when no calling frame exists.
    """

    def __init__(self, *args):
        super().__init__(*args)
        frame = sys._getframe().f_back
        # Skip constructor frames of subclasses that override ``__init__``:
        # in those frames ``self`` is the exception being built, not the raiser.
        while frame is not None and frame.f_locals.get("self", None) is self:
            frame = frame.f_back

        if frame is None:
            self.class_name = None
            self.function_name = None
            return

        caller_instance = frame.f_locals.get("self", None)
        if caller_instance is None:
            self.class_name = None
        else:
            self.class_name = caller_instance.__class__.__name__
        self.function_name = frame.f_code.co_name
        # Drop the frame reference promptly so it does not keep caller locals alive.
        del frame
        
class TestException(BaseException):
    """Concrete exception type used by the demo classes in this cell."""

class test_2:
    """Innermost demo class: its method is the one that raises."""

    def test(self):
        """Always raise ``TestException``; never returns."""
        raise TestException()


class test_1:
    """Outer demo class that delegates to ``test_2``."""

    def test(self):
        """Create a ``test_2`` and return whatever its ``test`` returns (or let it raise)."""
        inner = test_2()
        return inner.test()


# Trigger the nested call and show what the caught exception recorded.
try:
    test_1().test()
except Exception as e:
    print(type(e).__name__)
    print(vars(e))

TestException
{'class_name': 'test_2', 'function_name': 'test'}


In [113]:
# import tensorflow
# 
# tensorflow.config.list_physical_devices('GPU')

In [114]:
# import torch
# 
# torch.cuda.is_available()

In [115]:
# Load environment variables from the .env file located by find_dotenv().
dotenv.load_dotenv(find_dotenv())


def Frame(src, width=700, height=500):
    """Embed ``src`` in an ``IFrame`` for inline display.

    Args:
        src: Source handed to ``IFrame`` unchanged.
        width: Frame width; defaults to 700.
        height: Frame height; defaults to 500.

    Returns:
        The constructed ``IFrame``.
    """
    return IFrame(src, width=width, height=height)


In [116]:
# Everything below is resolved from a single test container instance.
test_container = TestContainer()

# Short-hands for the provider groups used in this cell.
_datastores = test_container.applications.datastores
_repositories = test_container.applications.repositories
_managements = test_container.applications.use_cases.managements

# Datastores.
one_datastore: OneDatastore = _datastores.one()
two_datastore: TwoDatastore = _datastores.two()
three_datastore: ThreeDatastore = _datastores.three()
four_datastore: FourDatastore = _datastores.four()
temp_datastore: ThreeDatastore = _datastores.temp()

# Repositories, one per document kind.
file_document_repository: FileDocumentRepository = _repositories.file_document()
text_document_repository: TextDocumentRepository = _repositories.text_document()
web_document_repository: WebDocumentRepository = _repositories.web_document()

# Management use cases.
document_management: DocumentManagement = _managements.document()
file_document_management: FileDocumentManagement = _managements.file_document()
text_document_management: TextDocumentManagement = _managements.text_document()
web_document_management: WebDocumentManagement = _managements.web_document()

# Seeder providing the mock records referenced by later cells.
all_seeder: AllSeeder = test_container.seeders.all()

In [117]:
# Run the seeder's up(); later cells read mock records from all_seeder
# (presumably this is what persists them to the datastores — TODO confirm).
await all_seeder.up()

In [8]:
# NOTE(review): presumably the inverse of all_seeder.up(). Cells further down look up
# seeded documents, so running this during a top-to-bottom run would likely break them —
# confirm, and consider moving this teardown to the end of the notebook.
await all_seeder.down()

In [5]:
# Smoke-test the second datastore: write key "test" with value "test" and ex=10
# (presumably a Redis-style client where ex is an expiry in seconds — TODO confirm).
await two_datastore.client.set("test", "test", ex=10)

True

In [118]:
# Load the "english_v2" configuration of the explodinggradients/amnesty_qa dataset.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's own
# loading script run locally — confirm this is acceptable for the environment.
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2", trust_remote_code=True)

In [119]:
# Display the loaded dataset's splits, features and row counts.
amnesty_qa

DatasetDict({
    eval: Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 20
    })
})

In [120]:
class BaseDocumentProcessor:
    """Turn stored documents into ``unstructured`` elements and post-process them.

    A document is looked up through the management use cases, its file / text /
    web payload is run through ``unstructured``'s ``partition``, and helpers are
    offered to bucket the resulting elements and to split text into chunks.

    The public methods (``partition``, ``categorize_elements``, ``split_text``)
    catch every exception and report it as a ``Result`` with status 500; the
    private ``_partition_*`` helpers do not catch and rely on ``partition`` for that.
    """

    def __init__(
            self,
            document_management: DocumentManagement,
            file_document_management: FileDocumentManagement,
            text_document_management: TextDocumentManagement,
            web_document_management: WebDocumentManagement,
    ):
        """Keep references to the management use cases used for look-ups."""
        self.document_management = document_management
        self.file_document_management = file_document_management
        self.text_document_management = text_document_management
        self.web_document_management = web_document_management

    async def _partition_file(self, state: State, found_document: Document) -> Result[List[Element]]:
        """Partition the stored file behind ``found_document``.

        Args:
            state: Request state forwarded to the management layer.
            found_document: Document whose ``id`` identifies the file document.

        Returns:
            200 with the partitioned elements, or the look-up's status code and
            ``data=None`` when the file document could not be found.
        """
        found_file_document: Result[FileDocumentResponse] = await self.file_document_management.find_one_by_id(
            state=state,
            id=found_document.id
        )
        if found_file_document.status_code != status.HTTP_200_OK:
            result: Result[List[Element]] = Result[List[Element]](
                status_code=found_file_document.status_code,
                message=f"BaseDocumentProcessor._partition_file: Failed, {found_file_document.message}.",
                data=None
            )
            return result

        file_data: bytes = self.file_document_management.file_document_repository.get_object_data(
            object_name=found_file_document.data.file_name
        )
        # Extracted images go to a directory named after the file's content hash.
        extract_image_path: Path = self.file_document_management.file_document_repository.file_path / found_file_document.data.file_data_hash
        # Start from a clean slate: remove whatever an earlier run left behind.
        # The directory is deliberately left absent afterwards.
        # NOTE(review): `partition` is expected to create it when it writes images — confirm.
        if extract_image_path.exists():
            shutil.rmtree(extract_image_path)
        elements = partition(
            metadata_filename=found_file_document.data.file_name,
            file=io.BytesIO(file_data),
            extract_images_in_pdf=True,
            extract_image_block_output_dir=str(extract_image_path),
            strategy=PartitionStrategy.HI_RES,
            hi_res_model_name="yolox"
        )
        result: Result[List[Element]] = Result[List[Element]](
            status_code=status.HTTP_200_OK,
            message="BaseDocumentProcessor._partition_file: Succeed.",
            data=elements,
        )
        return result

    async def _partition_text(self, state: State, found_document: Document) -> Result[List[Element]]:
        """Partition the stored text behind ``found_document``.

        Args:
            state: Request state forwarded to the management layer.
            found_document: Document whose ``id`` identifies the text document.

        Returns:
            200 with the partitioned elements, or the look-up's status code and
            ``data=None`` when the text document could not be found.
        """
        found_text_document: Result[TextDocumentResponse] = await self.text_document_management.find_one_by_id(
            state=state,
            id=found_document.id
        )
        if found_text_document.status_code != status.HTTP_200_OK:
            result: Result[List[Element]] = Result[List[Element]](
                status_code=found_text_document.status_code,
                message=f"BaseDocumentProcessor._partition_text: Failed, {found_text_document.message}.",
                data=None
            )
            return result

        elements = partition(
            text=found_text_document.data.text
        )
        result: Result[List[Element]] = Result[List[Element]](
            status_code=status.HTTP_200_OK,
            message="BaseDocumentProcessor._partition_text: Succeed.",
            data=elements,
        )
        return result

    async def _partition_web(self, state: State, found_document: Document) -> Result[List[Element]]:
        """Partition the web page behind ``found_document``.

        Args:
            state: Request state forwarded to the management layer.
            found_document: Document whose ``id`` identifies the web document.

        Returns:
            200 with the partitioned elements, or the look-up's status code and
            ``data=None`` when the web document could not be found.
        """
        found_web_document: Result[WebDocumentResponse] = await self.web_document_management.find_one_by_id(
            state=state,
            id=found_document.id
        )
        if found_web_document.status_code != status.HTTP_200_OK:
            result: Result[List[Element]] = Result[List[Element]](
                status_code=found_web_document.status_code,
                message=f"BaseDocumentProcessor._partition_web: Failed, {found_web_document.message}.",
                data=None
            )
            return result

        elements = partition(
            url=found_web_document.data.url
        )
        result: Result[List[Element]] = Result[List[Element]](
            status_code=status.HTTP_200_OK,
            message="BaseDocumentProcessor._partition_web: Succeed.",
            data=elements,
        )
        return result

    async def partition(self, state: State, document_id: UUID) -> Result[List[Element]]:
        """Look a document up and partition it according to its type.

        Args:
            state: Request state forwarded to the management layer.
            document_id: Identifier of the document to partition.

        Returns:
            200 with the elements on success; the failing look-up's status code
            when the document or its typed payload is missing; 400 for a
            ``document_type_id`` other than "file", "text" or "web"; 500 when
            any exception is raised along the way. ``data`` is ``None`` on failure.
        """
        try:
            found_document: Result[Document] = await self.document_management.find_one_by_id(
                state=state,
                id=document_id
            )
            if found_document.status_code != status.HTTP_200_OK:
                result: Result[List[Element]] = Result[List[Element]](
                    status_code=found_document.status_code,
                    message=f"BaseDocumentProcessor.partition: Failed, {found_document.message}.",
                    data=None
                )
                return result

            # Dispatch on the document type.
            if found_document.data.document_type_id == "file":
                result_elements: Result[List[Element]] = await self._partition_file(
                    state=state,
                    found_document=found_document.data
                )
            elif found_document.data.document_type_id == "text":
                result_elements: Result[List[Element]] = await self._partition_text(
                    state=state,
                    found_document=found_document.data
                )
            elif found_document.data.document_type_id == "web":
                result_elements: Result[List[Element]] = await self._partition_web(
                    state=state,
                    found_document=found_document.data
                )
            else:
                result: Result[List[Element]] = Result[List[Element]](
                    status_code=status.HTTP_400_BAD_REQUEST,
                    message=f"BaseDocumentProcessor.partition: Failed, invalid document type {found_document.data.document_type_id}.",
                    data=None
                )
                return result

            if result_elements.status_code != status.HTTP_200_OK:
                result: Result[List[Element]] = Result[List[Element]](
                    status_code=result_elements.status_code,
                    message=f"BaseDocumentProcessor.partition: Failed, {result_elements.message}.",
                    data=None
                )
                return result

            result: Result[List[Element]] = Result[List[Element]](
                status_code=status.HTTP_200_OK,
                message="BaseDocumentProcessor.partition: Succeed.",
                data=result_elements.data,
            )
        except Exception as e:
            # Anything raised by the look-ups or by `partition` itself ends up here.
            result: Result[List[Element]] = Result[List[Element]](
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                message=f"BaseDocumentProcessor.partition: Failed, {e}",
                data=None,
            )

        return result

    async def categorize_elements(self, elements: List[Element]) -> Result[Dict[str, List[Element]]]:
        """Bucket elements into "text", "table" and "image" lists.

        Matching is done on substrings of ``str(type(element))`` rather than with
        ``isinstance``, so only the listed type names are selected; any other
        element type is skipped and a message is printed.

        Args:
            elements: Elements to categorize.

        Returns:
            200 with a dict holding the three lists, or 500 with ``data=None``
            when an exception is raised (for example when ``elements`` is not iterable).
        """
        try:
            data: Dict[str, List[Element]] = {
                "text": [],
                "table": [],
                "image": [],
            }

            for element in elements:
                type_repr: str = str(type(element))
                if (
                        "unstructured.documents.elements.Text" in type_repr
                        or "unstructured.documents.elements.NarrativeText" in type_repr
                ):
                    data["text"].append(element)
                elif "unstructured.documents.elements.Table" in type_repr:
                    data["table"].append(element)
                elif "unstructured.documents.elements.Image" in type_repr:
                    data["image"].append(element)
                else:
                    print(f"BaseDocumentProcessor.categorize_elements: Ignoring element type {type(element)}.")

            result: Result[Dict[str, List[Element]]] = Result[Dict[str, List[Element]]](
                status_code=status.HTTP_200_OK,
                message="BaseDocumentProcessor.categorize_elements: Succeed.",
                data=data,
            )
        except Exception as e:
            result: Result[Dict[str, List[Element]]] = Result[Dict[str, List[Element]]](
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                message=f"BaseDocumentProcessor.categorize_elements: Failed, {e}",
                data=None,
            )

        return result

    def split_text(self, text: str, chunk_size: int = 4000, chunk_overlap: int = 0) -> Result[List[str]]:
        """Split ``text`` into chunks with a ``RecursiveCharacterTextSplitter``.

        The splitter is built with ``from_tiktoken_encoder``, so the sizes are
        presumably measured in tokens rather than characters — TODO confirm.

        Args:
            text: Text to split.
            chunk_size: Maximum chunk size handed to the splitter.
            chunk_overlap: Overlap between consecutive chunks handed to the splitter.

        Returns:
            200 with the list of chunks, or 500 with ``data=None`` when an
            exception is raised.
        """
        try:
            text_splitter: RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            data: List[str] = text_splitter.split_text(
                text=text
            )
            result: Result[List[str]] = Result[List[str]](
                status_code=status.HTTP_200_OK,
                message="BaseDocumentProcessor.split_text: Succeed.",
                data=data,
            )
        except Exception as e:
            result: Result[List[str]] = Result[List[str]](
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                message=f"BaseDocumentProcessor.split_text: Failed, {e}",
                data=None,
            )

        return result


# Processor instance shared by the following cells, wired with the management
# use cases resolved from the test container.
base_document_processor: BaseDocumentProcessor = BaseDocumentProcessor(
    document_management=document_management,
    file_document_management=file_document_management,
    text_document_management=text_document_management,
    web_document_management=web_document_management,
)

In [101]:
# Filled in by `handler` below; stays None until the handler has run.
result_elements: Result[List[Element]] = None


async def handler(session: AsyncSession) -> None:
    """Partition the first mock file document using the given session.

    The outcome is stored in the module-level `result_elements` rather than
    returned, because the function is passed to `one_datastore.retryable`.
    """
    global result_elements
    # The management layer reads the database session from the request state.
    state: State = State()
    state.session = session
    result_elements = await base_document_processor.partition(
        state=state,
        document_id=all_seeder.file_document_seeder.file_document_mock.data[0].id
    )


# Hand the handler to the datastore (presumably it supplies the session and
# retries on failure — TODO confirm against OneDatastore.retryable).
await one_datastore.retryable(handler)

# Display the partition result.
result_elements

Result[List[unstructured.documents.elements.Element]](status_code=200, message='BaseDocumentProcessor.partition: Succeed.', data=[<unstructured.documents.elements.Image object at 0x7fb050140c10>, <unstructured.documents.elements.NarrativeText object at 0x7fb050140370>, <unstructured.documents.elements.NarrativeText object at 0x7fb050141420>, <unstructured.documents.elements.Title object at 0x7fb0501410c0>, <unstructured.documents.elements.Text object at 0x7fb1e36bb550>, <unstructured.documents.elements.NarrativeText object at 0x7fb050141450>, <unstructured.documents.elements.NarrativeText object at 0x7fb050141f90>, <unstructured.documents.elements.ListItem object at 0x7fb050141480>, <unstructured.documents.elements.NarrativeText object at 0x7fb050141090>, <unstructured.documents.elements.NarrativeText object at 0x7fb0501425c0>, <unstructured.documents.elements.NarrativeText object at 0x7fb0501423e0>, <unstructured.documents.elements.NarrativeText object at 0x7fb050141fc0>, <unstructure

In [102]:
result_categorized_elements: Result[Dict[str, List[Element]]] = await base_document_processor.categorize_elements(
    result_elements.data
)

string_from_text_and_table_elements = "".join(
    [element.text for element in result_categorized_elements.data["text"]] +
    [element.text for element in result_categorized_elements.data["table"]]
)
split_text_result: Result[List[str]] = base_document_processor.split_text(
    text=
)
result_categorized_elements
split_text_result

BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.Title'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.ListItem'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.Header'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.FigureCaption'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.Header'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.Header'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.Header'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class 'unstructured.documents.elements.Title'>.
BaseDocumentProcessor.categorize_elements: Ignoring element type <class '

Result[List[str]](status_code=200, message='BaseDocumentProcessor.split_text: Succeed.', data=['Received April 5, 2020, accepted April 14, 2020, date of publication April 17, 2020, date of current version May 5, 2020.'])

In [9]:
# Compare three copies of the same mock PDF by SHA-256:
#   hash 1 - the file on disk under the tests package,
#   hash 2 - the bytes held by the file document seeder,
#   hash 0 - the object stored in the third datastore's bucket.
file_path_1 = Path(tests.__name__) / "mocks" / "files" / "file.pdf"
# read_bytes() opens and closes the file itself, so no handle is left open.
file_data_1 = file_path_1.read_bytes()
file_data_hash_1 = hashlib.sha256(file_data_1).hexdigest()
file_data_2 = all_seeder.file_document_seeder.file_document_mock.file_data[0]
file_data_hash_2 = hashlib.sha256(file_data_2).hexdigest()

response = three_datastore.client.get_object(
    bucket_name="research-assistant-backend.file-documents",
    object_name=all_seeder.file_document_seeder.file_document_mock.data[0].file_name,
)
try:
    file_data_0: bytes = response.read()
finally:
    # Release the HTTP response even when reading fails.
    # NOTE(review): release_conn() assumes a MinIO/urllib3-style response object — confirm.
    response.close()
    response.release_conn()
file_data_hash_0 = hashlib.sha256(file_data_0).hexdigest()
print(file_data_hash_0, file_data_hash_1, file_data_hash_2)

8d6a191a8bb02b3cd77352306b856e2269394cc5f43110df19c9ae11384637e5 8d6a191a8bb02b3cd77352306b856e2269394cc5f43110df19c9ae11384637e5 8d6a191a8bb02b3cd77352306b856e2269394cc5f43110df19c9ae11384637e5


'http://192.168.137.2:9000/research-assistant-backend.file-documents/2f55fc1e-58ae-4b2f-829d-120c401febe0.pdf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=user%2F20240327%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240327T233708Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=b0623a40f901aba4192e79912791d7d1cdde84284bd6ec96dba228d22dca3440'

In [None]:
class GraphState(TypedDict):
    """State dictionary handed to and returned by the graph's nodes."""
    # Free-form payload; the invocation in this notebook stores "retriever_setting",
    # "question" and "document" under this key.
    data: Dict[str, Any]


class GraphLongFormQa:
    """Single-node graph skeleton for long-form question answering.

    The graph currently consists of one retrieval node that is both the entry
    and the finish point.
    """

    def __init__(
            self,
            four_datastore: FourDatastore,
    ):
        """Keep the datastore from which the retriever client is obtained."""
        self.four_datastore = four_datastore

    def _retrieve(self, input_state: GraphState) -> GraphState:
        """Retrieval node: build the retriever client and pass the state through.

        Args:
            input_state: State whose ``data["retriever_setting"]`` provides
                ``embedding_function`` and ``collection_name``.

        Returns:
            A deep copy of ``input_state``.
        """
        output_state: GraphState = copy.deepcopy(input_state)
        # `GraphState` only declares the `data` key, so the settings live under it.
        retriever_setting: Dict[str, Any] = input_state["data"]["retriever_setting"]
        # NOTE(review): the client is created but not queried yet; the node is a stub.
        retriever: Milvus = self.four_datastore.get_client(
            embedding_function=retriever_setting["embedding_function"],
            collection_name=retriever_setting["collection_name"],
        )
        return output_state

    def _get_node(self, node: RunnableLike) -> Tuple[str, RunnableLike]:
        """Return ``(node.__name__, node)`` so the node is registered under its own name."""
        return node.__name__, node

    def compile(self) -> CompiledGraph:
        """Assemble and compile the graph.

        Returns:
            The compiled graph with ``_retrieve`` as entry and finish point.
        """
        graph: StateGraph = StateGraph(GraphState)
        # `add_node` takes the name and the action as two arguments, so unpack the pair.
        graph.add_node(*self._get_node(self._retrieve))
        graph.set_entry_point(self._retrieve.__name__)
        graph.set_finish_point(self._retrieve.__name__)
        compiled_graph: CompiledGraph = graph.compile()
        return compiled_graph


# Build and compile the long-form QA graph, then run it once.
graph_lfqa = GraphLongFormQa(
    four_datastore=four_datastore,
)
compiled_graph_lfqa = graph_lfqa.compile()
input_state: GraphState = GraphState(
    data={
        "retriever_setting": {
            # NOTE(review): this passes the embeddings class, not an instance — confirm
            # which of the two `four_datastore.get_client` expects.
            "embedding_function": HuggingFaceE5InstructEmbeddings,
            "collection_name": all_seeder.account_seeder.account_mock.data[0].id,
        },
        "question": "what is artificial intelligence?",
        # NOTE(review): the original expression was cut off after `three_datastore.client.`;
        # a presigned download URL for the first mock file document is assumed here,
        # using the bucket the file documents are read from elsewhere — confirm.
        "document": three_datastore.client.presigned_get_object(
            bucket_name="research-assistant-backend.file-documents",
            object_name=all_seeder.file_document_seeder.file_document_mock.data[0].file_name,
        ),
    }
)
compiled_graph_lfqa.invoke(input_state)


In [19]:
# Keep litellm quiet while the evaluation runs.
litellm.set_verbose = False

# Chat model used as the evaluation judge; the API key is read from the environment.
llm_settings: Dict[str, Any] = {
    "anthropic_api_key": os.getenv("ANTHROPIC_API_KEY"),
    "model": "claude-3-haiku-20240307",
    "streaming": True,
    "temperature": 0,
}
llm = ChatLiteLLMRouter(**llm_settings)

# Embedding model loaded from a local directory and run on the GPU.
embedding_settings: Dict[str, Any] = {
    "model_name": "/mnt/c/Data/Apps/research-assistant-infrastructure/data/models/infloat/multilingual-e5-large-instruct",
    "model_kwargs": {'device': 'cuda'},
    "encode_kwargs": {'normalize_embeddings': True},
    "query_instruction": "Given the question, retrieve the answer from the context.",
}
embeddings = HuggingFaceE5InstructEmbeddings(**embedding_settings)

In [30]:
# Evaluate on a one-row subset (index 0) of the "eval" split to keep the run short.
eval_data = amnesty_qa["eval"].select(range(1))
eval_data

Dataset({
    features: ['question', 'ground_truth', 'answer', 'contexts'],
    num_rows: 1
})

In [21]:
# Score the subset with ragas using the judge LLM and embeddings configured above.
# No `metrics` argument is passed, so ragas falls back to its defaults; the explicit
# list below is kept commented out for reference.
# NOTE(review): `metrics` is not imported in this notebook — import it before enabling the list.
result = evaluate(
    eval_data,
    llm=llm,
    embeddings=embeddings,
    # metrics=[
    #     metrics.faithfulness,
    #     metrics.answer_relevancy, 
    #     metrics.context_recall,
    #     metrics.context_precision,
    #     metrics.answer_correctness,
    #     metrics.context_relevancy,
    #     metrics.context_entity_recall,
    # ],
)

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


Invalid JSON response. Expected dictionary with key 'Attributed'
  value = np.nanmean(self.scores[cn])


In [22]:
# Display the aggregated metric scores returned by evaluate().
result

{'answer_relevancy': 0.9599, 'context_precision': 1.0000, 'faithfulness': 0.5000, 'context_recall': nan}