In [23]:
import os

from magic import Magic

# Switch to the project root BEFORE the first `apps.*` import. The original
# cell imported `GeneratorSetting` ahead of this call, which only works when the
# kernel already happens to start in /app.
# NOTE(review): presumably `apps` / `tools` / `tests` are resolved relative to
# the working directory — confirm how the kernel's sys.path is set up.
os.chdir("/app")
from apps.inners.models.dtos.contracts.requests.long_form_qas.input_setting_body import (
    GeneratorSetting,
    InputSettingBody as LongFormQaInputSettingBody,
)
from apps.inners.models.dtos.contracts.requests.passage_searches.input_setting_body import InputSettingBody as PassageSearchInputSettingBody, LlmSetting, \
    PreprocessorSetting, EmbedderSetting, RetrieverSetting, RerankerSetting
from apps.inners.models.dtos.contracts.requests.passage_searches.process_body import ProcessBody as PassageSearchProcessBody
from apps.inners.models.dtos.contracts.requests.long_form_qas.process_body import ProcessBody as LongFormQaProcessBody

from langchain_anthropic import ChatAnthropic
from ragas.testset import TestsetGenerator, evolutions

from apps.inners.use_cases.graphs.preparation_graph import PreparationGraph
from apps.inners.use_cases.document_processor.category_document_processor import CategoryDocumentProcessor
from apps.inners.use_cases.document_processor.partition_document_processor import PartitionDocumentProcessor
from apps.inners.use_cases.document_processor.summary_document_processor import SummaryDocumentProcessor

from langchain_community.embeddings.infinity import InfinityEmbeddings

from tools import cache_tool

import gc

from sqlmodel.ext.asyncio.session import AsyncSession
from starlette.datastructures import State

import dotenv
from datasets import load_dataset
from dotenv import find_dotenv
from ragas import evaluate, metrics

from tests.containers.test_container import TestContainer
from apps.inners.models.dtos.graph_state import LongFormQaGraphState, PreparationGraphState


In [2]:
# !pip show flagembedding
# !pip show langchain-anthropic
# !pip show pymilvus
# !pip show opencv-python
# !libreoffice --help
# !wkhtmltopdf

In [3]:
# Environment sanity check: list the GPU devices TensorFlow can see.
import tensorflow

visible_gpus = tensorflow.config.list_physical_devices(device_type="GPU")
visible_gpus

2024-04-14 15:03:45.948932: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-14 15:03:45.953177: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-14 15:03:45.953211: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# Environment sanity check: does PyTorch report a usable CUDA device?
import torch

cuda_ready = torch.cuda.is_available()
cuda_ready

True

In [5]:
# Load environment variables from the `.env` file located by `find_dotenv()`.
# This has to run before `TestContainer()` is built if the container settings
# read from the environment — NOTE(review): presumably they do; confirm.
dotenv.load_dotenv(find_dotenv())


True

In [2]:
# Instantiate the test container once and resolve every collaborator the later
# cells rely on. Providers are called in the same order as before; only the
# repeated attribute chains are factored into short aliases.
test_container = TestContainer()
applications = test_container.applications

# Settings objects (LLM credentials, embedding endpoint).
settings = applications.settings
one_llm_setting = settings.one_llm()
one_embedding_setting = settings.one_embedding()

# Datastore handles.
datastores = applications.datastores
one_datastore = datastores.one()
two_datastore = datastores.two()
three_datastore = datastores.three()
four_datastore = datastores.four()
temp_datastore = datastores.temp()

# Document repositories.
repositories = applications.repositories
file_document_repository = repositories.file_document()
text_document_repository = repositories.text_document()
web_document_repository = repositories.web_document()

# Use cases: converters, managements, graphs and process entry points.
use_cases = applications.use_cases
libre_office_document_converter = use_cases.document_converter.libre_office()
marker_document_converter = use_cases.document_converter.marker()

managements = use_cases.managements
document_management = managements.document()
file_document_management = managements.file_document()
text_document_management = managements.text_document()
web_document_management = managements.web_document()

long_form_qa_graph = use_cases.graphs.long_form_qa()
passage_search_graph = use_cases.graphs.passage_search()

process_passage_search = use_cases.passage_searches.process()
process_long_form_qa = use_cases.long_form_qas.process()

# Aggregate seeder that owns the fake fixtures used throughout the notebook.
all_seeder = test_container.seeders.all()

In [3]:
# Run the aggregate seeder's `up()` coroutine.
# NOTE(review): presumably this writes the fake fixtures (`all_seeder.*_fake`)
# into the datastores so the cells below can reference them — confirm.
await all_seeder.up()

In [19]:
# Run the aggregate seeder's `down()` coroutine (presumably removes the seeded
# fixtures — confirm).
# NOTE(review): this cell sits above the cells that read seeded data; on a
# top-to-bottom run it most likely belongs at the very end of the notebook.
await all_seeder.down()

In [5]:
# Connectivity smoke test for the second datastore: write key "test" with value
# "test" and `ex=10`.
# NOTE(review): `ex` looks like a Redis-style expiry in seconds — confirm.
await two_datastore.async_client.set("test", "test", ex=10)

True

In [None]:
# Hand-built document processors, used by the partition / categorisation cells
# and by `PreparationGraph` further down. This cell must be executed before
# those cells, otherwise they fail with NameError.
document_managements = dict(
    document_management=document_management,
    file_document_management=file_document_management,
    text_document_management=text_document_management,
    web_document_management=web_document_management,
)
partition_document_processor: PartitionDocumentProcessor = PartitionDocumentProcessor(**document_managements)

# The category processor takes the summary processor as its collaborator.
summary_document_processor: SummaryDocumentProcessor = SummaryDocumentProcessor()
category_document_processor: CategoryDocumentProcessor = CategoryDocumentProcessor(
    summary_document_processor=summary_document_processor,
)

In [4]:
# Request-like state object handed to every use case below: it carries the
# first seeded session as the authorised session and a session obtained from
# the first datastore.
# NOTE(review): the session is never explicitly closed in this notebook —
# confirm whether `get_session()` needs a matching close/rollback.
state: State = State()
state.authorized_session = all_seeder.session_seeder.session_fake.data[0]
state.session = one_datastore.get_session()


In [3]:
# await state.session.rollback()
# magic: Magic = Magic()
# magic.from_buffer(
#     all_seeder.file_document_seeder.file_document_fake.file_data[0]
# )

4360bfde-f309-418e-b1a9-cf65d16a2f5b


In [6]:
passage_search_process_body = PassageSearchProcessBody(
    input_setting=PassageSearchInputSettingBody(
        document_ids=[
            all_seeder.document_seeder.document_fake.data[0].id,
            all_seeder.document_seeder.document_fake.data[1].id,
            all_seeder.document_seeder.document_fake.data[2].id
        ],
        llm_setting=LlmSetting(
            model_name="claude-3-haiku-20240307",
            max_token=500
        ),
        preprocessor_setting=PreprocessorSetting(
            is_force_refresh_categorized_element=False,
            is_force_refresh_categorized_document=False,
            chunk_size=500,
            overlap_size=50,
            is_include_table=False,
            is_include_image=False
        ),
        embedder_setting=EmbedderSetting(
            is_force_refresh_embedding=False,
            is_force_refresh_document=False,
            model_name="BAAI/bge-m3",
            query_instruction="Given the question, retrieve passage that answer the question."
        ),
        retriever_setting=RetrieverSetting(
            is_force_refresh_relevant_document=False,
            top_k=50
        ),
        reranker_setting=RerankerSetting(
            is_force_refresh_re_ranked_document=False,
            model_name="BAAI/bge-reranker-v2-m3",
            top_k=5
        ),
        question="what is political science?"
    )
)
passage_search_process_response = await process_passage_search.process(
    state=state,
    body=passage_search_process_body
)



BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocument

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseDocumentProcessor.categorize_elements: Ignoring element type Title.
BaseDocumentProcessor.categorize_elements: Ignoring element type ListItem.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type FigureCaption.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Title.
BaseDocumentProcessor.categorize_elements: Ignoring element type FigureCaption.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elem

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

convert /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/da36a217-4c0a-44ea-b931-05fe101e8bd4.txt -> /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/da36a217-4c0a-44ea-b931-05fe101e8bd4.pdf using filter : writer_pdf_Export


In [7]:
# Display the passage-search response (the re-ranked documents with their scores).
# NOTE(review): the full repr is very long; consider showing only the top entries.
passage_search_process_response

ProcessResponse(re_ranked_documents=[{'page_content': 'Political science is the scientific study of politics. It is a social science dealing with systems of governance and power, and the analysis of political activities, political thoughts, political behavior, and political structures.', 'metadata': {'re_ranked_score': 0.9999151889582765, 'id': 'd257b88f-23de-4aa4-b408-b675e52af897', 'category': 'text', 'orig_metadata': [{'languages': ['eng'], 'filetype': 'text/plain'}], 'document_id': 'da36a217-4c0a-44ea-b931-05fe101e8bd4', 'relevancy_score': 0.032786883413791656}, 'type': 'Document'}, {'page_content': 'science whose pursuits are aimed at solving different cognitive problems commonly associated with the human intelligence, such as learning, problem solving, and pattern recognition, and subsequently adapting [11]. As a the- ory, Chassignol et al. deﬁned AI as a theoretical framework guiding the development and use of computer systems with the capabilities of human beings, more particul

In [24]:
long_form_qa_process_body = LongFormQaProcessBody(
    input_setting=LongFormQaInputSettingBody(
        document_ids=[
            all_seeder.document_seeder.document_fake.data[0].id,
            all_seeder.document_seeder.document_fake.data[1].id,
            all_seeder.document_seeder.document_fake.data[2].id
        ],
        llm_setting=LlmSetting(
            model_name="claude-3-haiku-20240307",
            max_token=500
        ),
        preprocessor_setting=PreprocessorSetting(
            is_force_refresh_categorized_element=False,
            is_force_refresh_categorized_document=False,
            chunk_size=500,
            overlap_size=50,
            is_include_table=False,
            is_include_image=False
        ),
        embedder_setting=EmbedderSetting(
            is_force_refresh_embedding=False,
            is_force_refresh_document=False,
            model_name="BAAI/bge-m3",
            query_instruction="Given the question, retrieve passage that answer the question."
        ),
        retriever_setting=RetrieverSetting(
            is_force_refresh_relevant_document=False,
            top_k=50
        ),
        reranker_setting=RerankerSetting(
            is_force_refresh_re_ranked_document=False,
            model_name="BAAI/bge-reranker-v2-m3",
            top_k=5
        ),
        question="what is political science?",
        generator_setting=GeneratorSetting(
            is_force_refresh_generated_answer=False,
            is_force_refresh_generated_question=False,
            is_force_refresh_generated_hallucination_grade_hash=False,
            is_force_refresh_generated_answer_relevancy_grade_hash=False,
            prompt_text="""Instruction: Create a concise and informative answer for a given question based solely on the given passages. You must only use information from the given passages. Use an unbiased and journalistic tone. Do not repeat text. Cite at least one passage in each sentence. Cite the passages using passage number notation like "[number]". If multiple passages contain the answer, cite those passages like "[number, number, etc.]". If the passages do not contain the answer to the question, then say that answering is not possible given the available information with the explanation. Ensure the output is only the answer without re-explain the instruction.
            Passages:
            {% for passage in passages %}
            [{{ loop.index }}]={{ passage.page_content }}
            {% endfor %}
            Question: {{ question }}
            Answer:"""
        ),
        transform_question_max_retry=3
    )
)
long_form_qa_process_response = await process_long_form_qa.process(
    state=state,
    body=long_form_qa_process_body
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  warn_beta(


In [6]:
# Display the long-form QA response.
# NOTE(review): the captured output and execution count for this cell look
# stale (they predate the cell that assigns this variable) — re-run to refresh.
long_form_qa_process_response

True

In [8]:
converted_document_data = await libre_office_document_converter.convert(
    state=state,
    document_id=all_seeder.document_seeder.document_fake.data[1].id,
    output_format="pdf"
)
marked_document_data = await marker_document_converter.convert(
    input_file_data=converted_document_data,
    highlights=[("label", "Political science")]
)

convert /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/051e9e65-0bcb-4cd2-a0a4-5ba64cc425cf.txt -> /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/051e9e65-0bcb-4cd2-a0a4-5ba64cc425cf.pdf using filter : writer_pdf_Export


In [ ]:
elements = await partition_document_processor.partition(
    state=state,
    document_id=all_seeder.document_seeder.document_fake.data[0].id
)

In [9]:
categorized_elements = await category_document_processor.categorize_elements(
    elements=elements
)
categorized_elements.texts = categorized_elements.texts[:1]
categorized_documents = await category_document_processor.get_categorized_documents(
    categorized_elements=categorized_elements,
    summarization_model=ChatAnthropic(
        anthropic_api_key=one_llm_setting.LLM_ONE_ANTHROPIC_API_KEY_ONE,
        model="claude-3-haiku-20240307",
        max_tokens=100,
        streaming=True,
        temperature=0
    ),
    is_include_table=False,
    is_include_image=False,
    chunk_size=100,
    overlap_size=50,
)
categorized_documents.texts

NameError: name 'category_document_processor' is not defined

In [None]:
# Housekeeping cell: release cached resources between experiments.
# cache_tool.clear_cache()
# NOTE(review): `handler` is not defined anywhere in the visible notebook, so
# this line raises NameError on a fresh kernel — define it or drop the call.
await one_datastore.retryable(handler)
# Empty PyTorch's CUDA cache, then run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()
# Last expression: display whatever `cache_tool.get_cache()` returns.
cache_tool.get_cache()

In [None]:
document_id = all_seeder.file_document_seeder.file_document_fake.data[0].id
output_state: PreparationGraphState

state: State = State()
state.authorized_session = all_seeder.session_seeder.session_fake.data[0]
state.session = one_datastore.get_session()

input_state: PreparationGraphState = {
    "state": state,
    "document_ids": [
        # all_seeder.document_seeder.document_fake.data[0].id,
        all_seeder.document_seeder.document_fake.data[1].id,
        all_seeder.document_seeder.document_fake.data[2].id
    ],
    "llm_setting": {
        "model_name": "claude-3-haiku-20240307",
        "max_token": 500,
        "model": None,
    },
    "preprocessor_setting": {
        "is_force_refresh_categorized_element": False,
        "is_force_refresh_categorized_document": False,
        "chunk_size": 500,
        "overlap_size": 50,
        "is_include_table": False,
        "is_include_image": False,
    },
    "categorized_element_hashes": None,
    "categorized_documents": None,
    "categorized_document_hashes": None,
    "next_document_id": None
}
preparation_graph = PreparationGraph(
    one_llm_setting=one_llm_setting,
    two_datastore=two_datastore,
    partition_document_processor=partition_document_processor,
    category_document_processor=category_document_processor
)
output_state = await preparation_graph.compiled_graph.ainvoke(
    input=input_state
)

output_state

In [19]:
# --- Generate a synthetic test set from the categorised documents. ---

# Source documents: everything the preparation graph produced for `document_id`.
document_category = output_state["categorized_documents"][document_id]
documents = document_category.get_all()

# Both chat models use the same API key; only the model name differs.
anthropic_api_key = one_llm_setting.LLM_ONE_ANTHROPIC_API_KEY_ONE
generator_llm = ChatAnthropic(
    model="claude-3-haiku-20240307",
    anthropic_api_key=anthropic_api_key,
)
critic_llm = ChatAnthropic(
    model="claude-3-opus-20240229",
    anthropic_api_key=anthropic_api_key,
)
embeddings = InfinityEmbeddings(
    model="intfloat/multilingual-e5-large-instruct",
    infinity_api_url=one_embedding_setting.URL,
)

generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Share of each question-evolution type in the generated set.
evolution_distribution = {
    evolutions.simple: 0.5,
    evolutions.reasoning: 0.25,
    evolutions.multi_context: 0.25,
}
test_set = generator.generate_with_langchain_docs(
    documents=documents,
    test_size=1,
    distributions=evolution_distribution,
)

embedding nodes:   0%|          | 0/708 [00:00<?, ?it/s]

Exception in thread Thread-11:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.10/dist-packages/ragas/executor.py", line 96, in run
    results = self.loop.run_until_complete(self._aresults())
  File "/usr/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/usr/local/lib/python3.10/dist-packages/ragas/executor.py", line 84, in _aresults
    raise e
  File "/usr/local/lib/python3.10/dist-packages/ragas/executor.py", line 79, in _aresults
    r = await future
  File "/usr/lib/python3.10/asyncio/tasks.py", line 571, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "/usr/local/lib/python3.10/dist-packages/ragas/executor.py", line 38, in sema_coro
    return await coro
  File "/usr/local/lib/python3.10/dist-packages/ragas/executor.py", line 112, in wrapped_callable_async
    return counter, await callable(

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

In [27]:
# Convert the generated test set to a dataset and rename its "answer" column to
# "ground_truth".
# `rename_column` returns a NEW dataset rather than modifying in place, so the
# result must be assigned; the original cell discarded it and `eval_set` kept
# the old column name.
eval_set = test_set.to_dataset().rename_column(
    original_column_name="answer",
    new_column_name="ground_truth",
)
eval_set

NameError: name 'test_set' is not defined

In [166]:
for index, eval in enumerate(eval_set):
    state: State = State()
    state.authorized_session = all_seeder.session_seeder.session_fake.data[0]
    state.session = one_datastore.get_session()
    input_state: LongFormQaGraphState = {
        "state": state,
        "document_ids": [
            # all_seeder.document_seeder.document_fake.data[0].id,
            all_seeder.document_seeder.document_fake.data[1].id,
            all_seeder.document_seeder.document_fake.data[2].id
        ],
        "llm_setting": {
            "model_name": "claude-3-haiku-20240307",
            "max_token": 500,
            "model": None,
        },
        "preprocessor_setting": {
            "is_force_refresh_categorized_element": False,
            "is_force_refresh_categorized_document": False,
            "chunk_size": 500,
            "overlap_size:": 50,
            "is_include_table": False,
            "is_include_image": False,
        },
        "categorized_element_hashes": None,
        "categorized_documents": None,
        "categorized_document_hashes": None,
        "next_document_id": None,
        "embedder_setting": {
            "is_force_refresh_embedding": False,
            "is_force_refresh_document": False,
            # "model_name": "intfloat/multilingual-e5-large-instruct",
            "model_name": "BAAI/bge-m3",
            "query_instruction": "Given the question, retrieve passage that answer the question.",
        },
        "retriever_setting": {
            "is_force_refresh_relevant_document": False,
            "top_k": 50,
        },
        "reranker_setting": {
            "is_force_refresh_re_ranked_document": False,
            "model_name": "BAAI/bge-reranker-v2-m3",
            "top_k": 5,
        },
        "embedded_document_ids": None,
        "next_categorized_document": None,
        "relevant_documents": None,
        "relevant_document_hash": None,
        "re_ranked_documents": None,
        "re_ranked_document_hash": None,
        "question": "what is political science?",
        "generator_setting": {
            "is_force_refresh_generated_answer": False,
            "is_force_refresh_generated_question": False,
            "is_force_refresh_generated_hallucination_grade_hash": False,
            "is_force_refresh_generated_answer_relevancy_grade_hash": False,
            "prompt_text": """Instruction: Create a concise and informative answer for a given question based solely on the given passages. You must only use information from the given passages. Use an unbiased and journalistic tone. Do not repeat text. Cite at least one passage in each sentence. Cite the passages using passage number notation like "[number]". If multiple passages contain the answer, cite those passages like "[number, number, etc.]". If the passages do not contain the answer to the question, then say that answering is not possible given the available information with the explanation. Ensure the output is only the answer without re-explain the instruction.
            Passages:
            {% for passage in passages %}
            [{{ loop.index }}]={{ passage.page_content }}
            {% endfor %}
            Question: {{ question }}
            Answer:"""
        },
        "transform_question_max_retry": 0,
        "generated_answer": None,
        "generated_answer_hash": None,
        "generated_question": None,
        "generated_question_hash": None,
        "generated_hallucination_grade": None,
        "generated_hallucination_grade_hash": None,
        "generated_answer_relevancy_grade": None,
        "generated_answer_relevancy_grade_hash": None,
    }
    output_state = await long_form_qa_graph.compiled_graph.ainvoke(input_state)

    eval_set[index]["contexts"] = [document.page_content for document in
                                   output_state["categorized_documents"][document_id].get_all()]
    eval_set[index]["answer"] = output_state["generated_answer"]

NameError: name 'eval_set' is not defined

In [23]:
# Load the "english_v2" configuration of the `explodinggradients/amnesty_qa`
# dataset, used as a ready-made evaluation set.
# NOTE(review): `trust_remote_code=True` allows the dataset repository's loading
# script to run locally — only keep it for sources you trust.
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2", trust_remote_code=True)

Repo card metadata block was not found. Setting CardData to empty.


In [24]:
# Take only the first row of the "eval" split, then display the resulting
# dataset summary.
eval_set_2 = amnesty_qa["eval"].select(range(1))
eval_set_2

Dataset({
    features: ['question', 'ground_truth', 'answer', 'contexts'],
    num_rows: 1
})

In [25]:
# Score the one-row evaluation set with ragas. `critic_llm` and `embeddings`
# come from the test-set generation cell, so that cell must have been run.
selected_metrics = [
    metrics.faithfulness,
    metrics.answer_relevancy,
    metrics.context_recall,
    metrics.context_precision,
    # Currently disabled:
    #     metrics.answer_correctness,
    #     metrics.context_relevancy,
    #     metrics.context_entity_recall,
]
result = evaluate(
    dataset=eval_set_2,
    llm=critic_llm,
    embeddings=embeddings,
    metrics=selected_metrics,
)

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  for attr in assigned:
Task was destroyed but it is pending!
task: <Task pending name='Task-369' coro=<as_completed.<locals>.sema_coro() running at /usr/local/lib/python3.10/dist-packages/ragas/executor.py:37> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[as_completed.<locals>._on_completion() at /usr/lib/python3.10/asyncio/tasks.py:558]>
Task was destroyed but it is pending!
task: <Task pending name='Task-52' coro=<as_completed.<locals>.sema_coro() running at /usr/local/lib/python3.10/dist-packages/ragas/executor.py:38> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[as_completed.<locals>._on_completion() at /usr/lib/python3.10/asyncio/tasks.py:558]>
Task was destroyed but it is pending!
task: <Task pending name='Task-55' coro=<as_completed.<locals>.sema_coro() running at /usr/local/lib/python3.10/dist-packages/ragas/executor.py:38> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[as_completed.<locals>._on_completion() at /usr/lib/python3.10/asyncio/tasks.py:558]>


In [26]:
# Display the per-metric scores returned by `evaluate` for the one-row set.
result

{'faithfulness': 0.5714, 'answer_relevancy': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}