In [10]:
import os

from langchain_community.chat_models import ChatHuggingFace
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

os.chdir("/app")

from apps.inners.use_cases.embeddings.bge_m3_embedding import BgeM3Embedding

from apps.inners.use_cases.document_converters.libre_office_document_converter import LibreOfficeDocumentConverter
from apps.inners.use_cases.document_converters.marker_document_converter import MarkerDocumentConverter
from apps.inners.use_cases.graphs.long_form_qa_graph import LongFormQaGraph
from apps.inners.use_cases.graphs.passage_search_graph import PassageSearchGraph
from apps.inners.use_cases.long_form_qas.process_long_form_qa import ProcessLongFormQa
from apps.inners.use_cases.managements.document_management import DocumentManagement
from apps.inners.use_cases.managements.file_document_management import FileDocumentManagement
from apps.inners.use_cases.managements.text_document_management import TextDocumentManagement
from apps.inners.use_cases.managements.web_document_management import WebDocumentManagement
from apps.inners.use_cases.passage_searches.process_passage_search import ProcessPassageSearch
from apps.outers.datastores.four_datastore import FourDatastore
from apps.outers.datastores.one_datastore import OneDatastore
from apps.outers.datastores.temp_datastore import TempDatastore
from apps.outers.datastores.three_datastore import ThreeDatastore
from apps.outers.datastores.two_datastore import TwoDatastore
from apps.outers.repositories.file_document_repository import FileDocumentRepository
from apps.outers.repositories.text_document_repository import TextDocumentRepository
from apps.outers.repositories.web_document_repository import WebDocumentRepository
from apps.outers.settings.one_embedding_setting import OneEmbeddingSetting
from apps.outers.settings.one_llm_setting import OneLlmSetting
from tests.seeders.all_seeder import AllSeeder

from apps.inners.models.dtos.contracts.requests.long_form_qas.input_setting_body import GeneratorSetting

from apps.inners.models.dtos.contracts.requests.long_form_qas.input_setting_body import \
    InputSettingBody as LongFormQaInputSettingBody
from apps.inners.models.dtos.contracts.requests.passage_searches.input_setting_body import \
    InputSettingBody as PassageSearchInputSettingBody, LlmSetting, \
    PreprocessorSetting, EmbedderSetting, RetrieverSetting, RerankerSetting
from apps.inners.models.dtos.contracts.requests.passage_searches.process_body import \
    ProcessBody as PassageSearchProcessBody
from apps.inners.models.dtos.contracts.requests.long_form_qas.process_body import ProcessBody as LongFormQaProcessBody

from langchain_anthropic import ChatAnthropic
from ragas.testset import TestsetGenerator, evolutions

from apps.inners.use_cases.graphs.preparation_graph import PreparationGraph
from apps.inners.use_cases.document_processor.category_document_processor import CategoryDocumentProcessor
from apps.inners.use_cases.document_processor.partition_document_processor import PartitionDocumentProcessor
from apps.inners.use_cases.document_processor.summary_document_processor import SummaryDocumentProcessor

from tools import cache_tool

import gc

from starlette.datastructures import State

import dotenv
from datasets import load_dataset
from dotenv import find_dotenv
from ragas import evaluate, metrics

from tests.containers.test_container import TestContainer
from apps.inners.models.dtos.graph_state import LongFormQaGraphState, PreparationGraphState


In [14]:
# Run the test suite under `tests` on one xdist worker and write an HTML coverage report for `apps`.
!python3 -m pytest --cov=apps --cov-report=html tests -n 1 

platform linux -- Python 3.10.12, pytest-8.1.1, pluggy-1.4.0
rootdir: /app
plugins: cov-5.0.0, xdist-3.5.0, asyncio-0.23.6, anyio-4.3.0
asyncio: mode=strict
1 worker [43 items][0m[0m[1m[1m
[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[33m                              [100%][0m
  /usr/local/lib/python3.10/dist-packages/pydantic/_internal/_config.py:284: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/

../usr/local/lib/

In [2]:
# Environment diagnostics. The commented-out commands are ad-hoc checks (package versions,
# external tools, TensorRT library lookup); only the LangChain system report below is executed.
# !pip show flagembedding
# !pip show langchain-anthropic
# !pip show pymilvus
# !pip show opencv-python
# !libreoffice --help
# !wkhtmltopdf
# !apt install strace
# !strace -e open,openat python3 -c "import tensorflow as tf" 2>&1 | grep "libnvinfer\|TF-TRT"
# !echo $(echo $(dirname $(dirname $(python3 -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)")))/*/lib/ | sed -r 's/\s+/:/g')${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
# !ls /TensorRT-8.6.1.6
!python3 -m langchain_core.sys_info


System Information
------------------
> OS:  Linux
> OS Version:  #1 SMP Thu Mar 7 03:22:57 UTC 2024
> Python Version:  3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]

Package Information
-------------------
> langchain_core: 0.1.43
> langchain: 0.1.16
> langchain_community: 0.0.33
> langsmith: 0.1.48
> langchain_anthropic: 0.1.8
> langchain_openai: 0.1.3
> langchain_text_splitters: 0.0.1
> langgraph: 0.0.37

Packages not installed (Not Necessarily a Problem)
--------------------------------------------------
The following packages were not found:

> langserve


In [7]:
# Print the TensorRT version TensorFlow was linked against and the one it loaded,
# then display the GPUs TensorFlow can see.
import tensorflow
from tensorflow.compiler.tf2tensorrt._pywrap_py_utils import (
    get_linked_tensorrt_version,
    get_loaded_tensorrt_version,
)

linked_tensorrt_version = get_linked_tensorrt_version()
loaded_tensorrt_version = get_loaded_tensorrt_version()
print(f"Linked TensorRT version {linked_tensorrt_version}")
print(f"Loaded TensorRT version {loaded_tensorrt_version}")

tensorflow.config.list_physical_devices('GPU')

Linked TensorRT version (8, 6, 1)
Loaded TensorRT version (8, 6, 1)


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
# Sanity check if TensorRT is working.
import numpy as np
from tensorflow import keras


def get_model():
    # Create a simple model.
    inputs = keras.Input(shape=(32,))
    outputs = keras.layers.Dense(1)(inputs)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model


model = get_model()

# Train the model.
test_input = np.random.random((128, 32))
test_target = np.random.random((128, 1))
model.fit(test_input, test_target)

# Calling `save('my_model')` creates a SavedModel folder `my_model`.
model.save("my_model.keras")

!aria2c https://raw.githubusercontent.com/tensorflow/tensorrt/master/tftrt/blog_posts/Leveraging%20TensorFlow-TensorRT%20integration%20for%20Low%20latency%20Inference/tf2_inference.py
!python3 tf2_inference.py --use_tftrt_model --precision fp16

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2915  

04/16 06:43:07 [[1;32mNOTICE[0m] Downloading 1 item(s)

04/16 06:43:08 [[1;32mNOTICE[0m] File already exists. Renamed to /tf2_inference.1.py.

04/16 06:43:08 [[1;32mNOTICE[0m] Download complete: /tf2_inference.1.py

Download Results:
gid   |stat|avg speed  |path/URI
661654|[1;32mOK[0m  |   912KiB/s|/tf2_inference.1.py

Status Legend:
(OK):download completed.

Inference using: TF-TRT ...
Batch size: 512
Precision:  fp16

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
TrtConversionParams(max_workspace_size_bytes=8589934592, precision_mode='FP16', minimum_segment_size=3, maximum_cached_engines=100, use_calibration=True, allow_build_at_runtime=True)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Instructions for updating:
Use individual converter parameters instead



KeyboardInterrupt



In [4]:
import torch

# Check whether PyTorch can use a CUDA device.
torch.cuda.is_available()

True

In [2]:
# Load environment variables from the `.env` file located by `find_dotenv()`.
dotenv.load_dotenv(find_dotenv())


True

In [2]:
# Resolve every collaborator used by this notebook from the test container.
test_container: TestContainer = TestContainer()
application_providers = test_container.applications

# Settings.
setting_providers = application_providers.settings
one_llm_setting: OneLlmSetting = setting_providers.one_llm()
one_embedding_setting: OneEmbeddingSetting = setting_providers.one_embedding()

# Datastores.
datastore_providers = application_providers.datastores
one_datastore: OneDatastore = datastore_providers.one()
two_datastore: TwoDatastore = datastore_providers.two()
three_datastore: ThreeDatastore = datastore_providers.three()
four_datastore: FourDatastore = datastore_providers.four()
temp_datastore: TempDatastore = datastore_providers.temp()

# Repositories.
repository_providers = application_providers.repositories
file_document_repository: FileDocumentRepository = repository_providers.file_document()
text_document_repository: TextDocumentRepository = repository_providers.text_document()
web_document_repository: WebDocumentRepository = repository_providers.web_document()

# Use cases: converters, document managements, graphs and processing entry points.
use_case_providers = application_providers.use_cases
libre_office_document_converter: LibreOfficeDocumentConverter = (
    use_case_providers.document_converter.libre_office()
)
marker_document_converter: MarkerDocumentConverter = use_case_providers.document_converter.marker()
document_management: DocumentManagement = use_case_providers.managements.document()
file_document_management: FileDocumentManagement = use_case_providers.managements.file_document()
text_document_management: TextDocumentManagement = use_case_providers.managements.text_document()
web_document_management: WebDocumentManagement = use_case_providers.managements.web_document()

long_form_qa_graph: LongFormQaGraph = use_case_providers.graphs.long_form_qa()
passage_search_graph: PassageSearchGraph = use_case_providers.graphs.passage_search()

process_passage_search: ProcessPassageSearch = use_case_providers.passage_searches.process()
process_long_form_qa: ProcessLongFormQa = use_case_providers.long_form_qas.process()

# Seeder used to create and remove the fixture data.
all_seeder: AllSeeder = test_container.seeders.all()

In [3]:
# Run the seeder's `up()` step before the cells that read `all_seeder`'s fake data.
await all_seeder.up()

In [54]:
# Run the seeder's `down()` step.
# NOTE(review): in document order this cell follows `all_seeder.up()` directly, so a
# top-to-bottom run would presumably remove the seeded data before later cells use it —
# confirm this is meant to be run manually at the end of a session.
await all_seeder.down()

In [5]:
# Connectivity smoke test: store key "test" with value "test" via the second datastore's async client.
# NOTE(review): `ex=10` is presumably a 10-second expiry — confirm against the client in use.
await two_datastore.async_client.set("test", "test", ex=10)

True

In [5]:
# The category processor wraps a summary processor, so build that pair first.
summary_document_processor: SummaryDocumentProcessor = SummaryDocumentProcessor()
category_document_processor: CategoryDocumentProcessor = CategoryDocumentProcessor(
    summary_document_processor=summary_document_processor,
)

# The partition processor is given all four document-management use cases.
management_use_cases = dict(
    document_management=document_management,
    file_document_management=file_document_management,
    text_document_management=text_document_management,
    web_document_management=web_document_management,
)
partition_document_processor: PartitionDocumentProcessor = PartitionDocumentProcessor(
    **management_use_cases
)

In [6]:
# Build the `State` object that the use cases below receive as `state`.
state = State()
# Act as the first seeded session.
state.authorized_session = all_seeder.session_seeder.session_fake.data[0]
# Database session obtained from the first datastore.
state.session = one_datastore.get_session()


In [79]:
from typing import List
from apps.inners.models.daos.document import Document
from sqlalchemy import text, ScalarResult

session = state.session
await session.close()
account_id: int = state.authorized_session.account_id
filter: dict = {
    "name": "NAME1",
}
size = 1000
filter_expressions: List[str] = []
for key, value in filter.items():
    # filter_expressions.append(f"account_document.{key}::text like '%{value}%'")
    filter_expressions.append(f"SIMILARITY(account_document.{key}::text, '{value}')")
query: str = f"""
    SELECT *
    FROM (
        SELECT * 
        FROM document
        WHERE account_id = '{account_id}'
    ) AS account_document
    ORDER BY (({'+'.join(filter_expressions)})/{len(filter_expressions)}) DESC
    LIMIT {size};
""".replace('\n', ' ')

found_document_result: ScalarResult[Document] = await session.exec(
    text(query)
)
found_documents: List[Document] = list(found_document_result.all())
found_documents

[(UUID('1f119c48-7529-4672-9244-d5f9f5892180'), 'name1', 'description1', 'text', UUID('9ff3f8c1-5984-48db-a016-9d062c5c474b')),
 (UUID('c5c0d069-c885-4eae-8cdf-54479fd74582'), 'name2', 'description2', 'web', UUID('9ff3f8c1-5984-48db-a016-9d062c5c474b')),
 (UUID('94b8c808-396a-4310-be58-56fbec19d2e4'), 'name0', 'description0', 'file', UUID('9ff3f8c1-5984-48db-a016-9d062c5c474b'))]

In [9]:
# await state.session.rollback()
# magic: Magic = Magic()
# magic.from_buffer(
#     all_seeder.file_document_seeder.file_document_fake.file_data[0]
# )
found_documents = await document_management.find_many_with_authorization_and_pagination(
    state=state,
    page_position=1,
    page_size=1000
)
found_file_documents = await file_document_repository.find_many_by_id_and_account_id(
    session=state.session,
    ids=[found_document.id for found_document in found_documents],
    account_id=state.authorized_session.account_id
)

In [12]:
# Only the last expression of a cell is rendered automatically, so the first result is
# displayed explicitly; previously it was evaluated and silently dropped.
display(found_documents)
found_file_documents

[FileDocument(file_name='646cb7a0-86da-4fb9-be93-9284f06ad27f.pdf', id=UUID('bafa984d-a908-4575-b88d-44e093ea0664'), file_data_hash='8d6a191a8bb02b3cd77352306b856e2269394cc5f43110df19c9ae11384637e5')]

In [6]:
passage_search_process_body = PassageSearchProcessBody(
    input_setting=PassageSearchInputSettingBody(
        document_ids=[
            all_seeder.document_seeder.document_fake.data[0].id,
            all_seeder.document_seeder.document_fake.data[1].id,
            all_seeder.document_seeder.document_fake.data[2].id
        ],
        llm_setting=LlmSetting(
            model_name="claude-3-haiku-20240307",
            max_token=500
        ),
        preprocessor_setting=PreprocessorSetting(
            is_force_refresh_categorized_element=False,
            is_force_refresh_categorized_document=False,
            chunk_size=500,
            overlap_size=50,
            is_include_table=False,
            is_include_image=False
        ),
        embedder_setting=EmbedderSetting(
            is_force_refresh_embedding=False,
            is_force_refresh_document=False,
            model_name="BAAI/bge-m3",
            query_instruction="Given the question, retrieve passage that answer the question."
        ),
        retriever_setting=RetrieverSetting(
            is_force_refresh_relevant_document=False,
            top_k=50
        ),
        reranker_setting=RerankerSetting(
            is_force_refresh_re_ranked_document=False,
            model_name="BAAI/bge-reranker-v2-m3",
            top_k=5
        ),
        question="what is political science?"
    )
)
passage_search_process_response = await process_passage_search.process(
    state=state,
    body=passage_search_process_body
)



BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocument

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseDocumentProcessor.categorize_elements: Ignoring element type Title.
BaseDocumentProcessor.categorize_elements: Ignoring element type ListItem.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type FigureCaption.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Title.
BaseDocumentProcessor.categorize_elements: Ignoring element type FigureCaption.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elem

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

convert /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/da36a217-4c0a-44ea-b931-05fe101e8bd4.txt -> /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/da36a217-4c0a-44ea-b931-05fe101e8bd4.pdf using filter : writer_pdf_Export


In [7]:
# Display the response returned by `process_passage_search.process`.
passage_search_process_response

ProcessResponse(re_ranked_documents=[{'page_content': 'Political science is the scientific study of politics. It is a social science dealing with systems of governance and power, and the analysis of political activities, political thoughts, political behavior, and political structures.', 'metadata': {'re_ranked_score': 0.9999151889582765, 'id': 'd257b88f-23de-4aa4-b408-b675e52af897', 'category': 'text', 'orig_metadata': [{'languages': ['eng'], 'filetype': 'text/plain'}], 'document_id': 'da36a217-4c0a-44ea-b931-05fe101e8bd4', 'relevancy_score': 0.032786883413791656}, 'type': 'Document'}, {'page_content': 'science whose pursuits are aimed at solving different cognitive problems commonly associated with the human intelligence, such as learning, problem solving, and pattern recognition, and subsequently adapting [11]. As a the- ory, Chassignol et al. deﬁned AI as a theoretical framework guiding the development and use of computer systems with the capabilities of human beings, more particul

In [24]:
long_form_qa_process_body = LongFormQaProcessBody(
    input_setting=LongFormQaInputSettingBody(
        document_ids=[
            all_seeder.document_seeder.document_fake.data[0].id,
            all_seeder.document_seeder.document_fake.data[1].id,
            all_seeder.document_seeder.document_fake.data[2].id
        ],
        llm_setting=LlmSetting(
            model_name="claude-3-haiku-20240307",
            max_token=500
        ),
        preprocessor_setting=PreprocessorSetting(
            is_force_refresh_categorized_element=False,
            is_force_refresh_categorized_document=False,
            chunk_size=500,
            overlap_size=50,
            is_include_table=False,
            is_include_image=False
        ),
        embedder_setting=EmbedderSetting(
            is_force_refresh_embedding=False,
            is_force_refresh_document=False,
            model_name="BAAI/bge-m3",
            query_instruction="Given the question, retrieve passage that answer the question."
        ),
        retriever_setting=RetrieverSetting(
            is_force_refresh_relevant_document=False,
            top_k=50
        ),
        reranker_setting=RerankerSetting(
            is_force_refresh_re_ranked_document=False,
            model_name="BAAI/bge-reranker-v2-m3",
            top_k=5
        ),
        question="what is political science?",
        generator_setting=GeneratorSetting(
            is_force_refresh_generated_answer=False,
            is_force_refresh_generated_question=False,
            is_force_refresh_generated_hallucination_grade_hash=False,
            is_force_refresh_generated_answer_relevancy_grade_hash=False,
            prompt="""Instruction: Create a concise and informative answer for a given question based solely on the given passages. You must only use information from the given passages. Use an academic style. Do not repeat text. Cite at least one passage in each sentence. Cite the passages using passage number notation like "[number]". If multiple passages contain the answer, cite those passages like "[number, number, etc.]". If the passages do not contain the answer to the question, then say that answering is impossible given the available information with the explanation. Ensure the output is not re-explaining the instruction.
            Passages:
            {% for passage in passages %}
            [{{ loop.index }}]={{ passage.page_content }}
            {% endfor %}
            Question: {{ question }}
            Answer:"""
        ),
        transform_question_max_retry=3
    )
)
long_form_qa_process_response = await process_long_form_qa.process(
    state=state,
    body=long_form_qa_process_body
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  warn_beta(


In [6]:
# Display the response returned by `process_long_form_qa.process`.
long_form_qa_process_response

True

In [8]:
converted_document_data = await libre_office_document_converter.convert(
    state=state,
    document_id=all_seeder.document_seeder.document_fake.data[1].id,
    output_format="pdf"
)
marked_document_data = await marker_document_converter.convert(
    input_file_data=converted_document_data,
    highlights=[("label", "Political science")]
)

convert /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/051e9e65-0bcb-4cd2-a0a4-5ba64cc425cf.txt -> /app/apps/outers/datastores/temps/file_documents/libre_office_converted_documents/051e9e65-0bcb-4cd2-a0a4-5ba64cc425cf.pdf using filter : writer_pdf_Export


In [None]:
elements = await partition_document_processor.partition(
    state=state,
    document_id=all_seeder.document_seeder.document_fake.data[0].id
)

In [9]:
categorized_elements = await category_document_processor.categorize_elements(
    elements=elements
)
categorized_elements.texts = categorized_elements.texts[:1]
categorized_documents = await category_document_processor.get_categorized_documents(
    categorized_elements=categorized_elements,
    summarization_model=ChatAnthropic(
        anthropic_api_key=one_llm_setting.LLM_ONE_ANTHROPIC_API_KEY_ONE,
        model="claude-3-haiku-20240307",
        max_tokens=100,
        streaming=True,
        temperature=0
    ),
    is_include_table=False,
    is_include_image=False,
    chunk_size=100,
    overlap_size=50,
)
categorized_documents.texts

NameError: name 'category_document_processor' is not defined

In [None]:
# cache_tool.clear_cache()
# Release PyTorch's cached CUDA memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()
# Final expression: display whatever `cache_tool.get_cache()` returns.
cache_tool.get_cache()

In [7]:
input_state: PreparationGraphState = {
    "state": state,
    "document_ids": [
        all_seeder.document_seeder.document_fake.data[0].id,
        all_seeder.document_seeder.document_fake.data[1].id,
        all_seeder.document_seeder.document_fake.data[2].id
    ],
    "llm_setting": {
        "model_name": "claude-3-haiku-20240307",
        "max_token": 500,
        "model": None,
    },
    "preprocessor_setting": {
        "is_force_refresh_categorized_element": False,
        "is_force_refresh_categorized_document": False,
        "file_partition_strategy": "auto",
        "chunk_size": 500,
        "overlap_size": 50,
        "is_include_table": False,
        "is_include_image": False,
    },
    "categorized_element_hashes": None,
    "categorized_documents": None,
    "categorized_document_hashes": None,
}
preparation_graph: PreparationGraph = PreparationGraph(
    one_llm_setting=one_llm_setting,
    two_datastore=two_datastore,
    partition_document_processor=partition_document_processor,
    category_document_processor=category_document_processor
)
output_state: PreparationGraphState = await preparation_graph.compiled_graph.ainvoke(
    input=input_state
)

output_state

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseDocumentProcessor.categorize_elements: Ignoring element type Title.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type FigureCaption.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Title.
BaseDocumentProcessor.categorize_elements: Ignoring element type FigureCaption.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Header.
BaseDocumentProcessor.categorize_elements: Ignoring element type Title.
BaseDocumentProcessor.categorize_element



BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocumentProcessor.categorize_elements: Ignoring element type HTMLTitle.
BaseDocument

{'state': <starlette.datastructures.State at 0x7fcf2035dab0>,
 'document_ids': [UUID('b92456a7-e6cc-4429-b794-3538482ee983'),
  UUID('6679b57e-a910-401d-bb14-86117d2021a8'),
  UUID('fce3973e-c0f1-49d3-85db-40a8f1370f12')],
 'llm_setting': {'model_name': 'claude-3-haiku-20240307',
  'max_token': 500,
  'model': ChatAnthropic(model='claude-3-haiku-20240307', max_tokens=500, temperature=0.0, anthropic_api_key=SecretStr('**********'), streaming=True, _client=<anthropic.Anthropic object at 0x7fcf107eb430>, _async_client=<anthropic.AsyncAnthropic object at 0x7fcf107e8e50>)},
 'preprocessor_setting': {'is_force_refresh_categorized_element': False,
  'is_force_refresh_categorized_document': False,
  'file_partition_strategy': 'auto',
  'chunk_size': 500,
  'overlap_size': 50,
  'is_include_table': False,
  'is_include_image': False},
 'categorized_element_hashes': {UUID('6679b57e-a910-401d-bb14-86117d2021a8'): 'categorized_element/a1cb2ce354f6c39fabb13f6d30abe78af2deba243cb41a8b783ddba3cc1eaad

In [None]:
# Load a local Hugging Face chat model to act as both generator and critic, plus the
# embedding model used for test-set generation.
model_name = "SeaLLMs/SeaLLM-7B-v2.5"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name
)
# Precision and device placement are load-time options. `pipeline` only forwards
# `torch_dtype` / `device_map` to `from_pretrained` when it loads the model itself, so with
# an already-instantiated model they have to be given here to take effect.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=8192,
)
llm = HuggingFacePipeline(pipeline=pipe)
chat_llm = ChatHuggingFace(
    llm=llm
)

# The same local model plays both roles.
generator_llm = chat_llm
critic_llm = chat_llm

# Hosted alternative, kept for reference:
# generator_llm = ChatAnthropic(
#     model="claude-3-haiku-20240307",
#     anthropic_api_key=one_llm_setting.LLM_ONE_ANTHROPIC_API_KEY_ONE
# )
# critic_llm = ChatAnthropic(
#     model="claude-3-opus-20240229",
#     anthropic_api_key=one_llm_setting.LLM_ONE_ANTHROPIC_API_KEY_ONE
# )
embeddings = BgeM3Embedding(
    use_fp16=False,
    normalize_embeddings=False,
    return_colbert_vecs=False,
)

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [None]:
# Build the ragas test-set generator from the generator/critic LLMs and the embedding model.
generator = TestsetGenerator.from_langchain(
    embeddings=embeddings,
    critic_llm=critic_llm,
    generator_llm=generator_llm,
)

# Use the categorized documents that the preparation graph produced for the first input id.
document_id = input_state["document_ids"][0]
document_category = output_state["categorized_documents"][document_id]
documents = document_category.get_all()

# Weights passed as `distributions`, keyed by ragas evolution type.
evolution_distribution = {
    evolutions.simple: 0.5,
    evolutions.reasoning: 0.25,
    evolutions.multi_context: 0.25,
}
test_set = generator.generate_with_langchain_docs(
    documents=documents,
    test_size=1,
    distributions=evolution_distribution,
)

In [27]:
# Convert the generated test set to a dataset and rename "answer" to "ground_truth".
# `rename_column` returns a new dataset rather than modifying in place, so the result is
# assigned; previously it was discarded and `eval_set` kept the original column name.
eval_set = test_set.to_dataset().rename_column(
    original_column_name="answer",
    new_column_name="ground_truth"
)
eval_set

NameError: name 'test_set' is not defined

In [166]:
# Run the long-form QA graph once per evaluation row and record the contexts and the
# generated answer for that row.
#
# Indexing a `datasets.Dataset` (`eval_set[index]`) returns a fresh dict, so assigning
# into it is silently discarded. The per-row results are therefore collected in plain
# lists and written back as columns with `Dataset.map` once the loop has finished.
generated_contexts: list[list[str]] = []
generated_answers: list = []

# The loop variable is named `eval_row` so it does not shadow the builtin `eval`.
for eval_row in eval_set:
    state: State = State()
    state.authorized_session = all_seeder.session_seeder.session_fake.data[0]
    state.session = one_datastore.get_session()
    input_state: LongFormQaGraphState = {
        "state": state,
        "document_ids": [
            # all_seeder.document_seeder.document_fake.data[0].id,
            all_seeder.document_seeder.document_fake.data[1].id,
            all_seeder.document_seeder.document_fake.data[2].id
        ],
        "llm_setting": {
            "model_name": "claude-3-haiku-20240307",
            "max_token": 500,
            "model": None,
        },
        "preprocessor_setting": {
            "is_force_refresh_categorized_element": False,
            "is_force_refresh_categorized_document": False,
            "chunk_size": 500,
            # The key used to be "overlap_size:" (stray trailing colon), which cannot
            # match a setting named "overlap_size".
            "overlap_size": 50,
            "is_include_table": False,
            "is_include_image": False,
        },
        "categorized_element_hashes": None,
        "categorized_documents": None,
        "categorized_document_hashes": None,
        "next_document_id": None,
        "embedder_setting": {
            "is_force_refresh_embedding": False,
            "is_force_refresh_document": False,
            # "model_name": "intfloat/multilingual-e5-large-instruct",
            "model_name": "BAAI/bge-m3",
            "query_instruction": "Given the question, retrieve passage that answer the question.",
        },
        "retriever_setting": {
            "is_force_refresh_relevant_document": False,
            "top_k": 50,
        },
        "reranker_setting": {
            "is_force_refresh_re_ranked_document": False,
            "model_name": "BAAI/bge-reranker-v2-m3",
            "top_k": 5,
        },
        "embedded_document_ids": None,
        "next_categorized_document": None,
        "relevant_documents": None,
        "relevant_document_hash": None,
        "re_ranked_documents": None,
        "re_ranked_document_hash": None,
        # Ask the question of the row being evaluated; a fixed question would give every
        # row the same answer and make the evaluation meaningless.
        # NOTE(review): assumes the evaluation set has a "question" column — confirm.
        "question": eval_row["question"],
        "generator_setting": {
            "is_force_refresh_generated_answer": False,
            "is_force_refresh_generated_question": False,
            "is_force_refresh_generated_hallucination_grade": False,
            "is_force_refresh_generated_answer_relevancy_grade": False,
            "prompt": """Instruction: Create a concise and informative answer for a given question based solely on the given passages. You must only use information from the given passages. Use an academic style. Do not repeat text. Cite at least one passage in each sentence. Cite the passages using passage number notation like "[number]". If multiple passages contain the answer, cite those passages like "[number, number, etc.]". If the passages do not contain the answer to the question, then say that answering is impossible given the available information with the explanation. Ensure the output is not re-explaining the instruction.
            Passages:
            {% for passage in passages %}
            [{{ loop.index }}]={{ passage.page_content }}
            {% endfor %}
            Question: {{ question }}
            Answer:"""
        },
        "transform_question_max_retry": 0,
        "generated_answer": None,
        "generated_answer_hash": None,
        "generated_question": None,
        "generated_question_hash": None,
        "generated_hallucination_grade": None,
        "generated_hallucination_grade_hash": None,
        "generated_answer_relevancy_grade": None,
        "generated_answer_relevancy_grade_hash": None,
    }
    output_state = await long_form_qa_graph.compiled_graph.ainvoke(input_state)

    # NOTE(review): `document_id` is not assigned in this cell; it is whatever an
    # earlier cell left in the kernel and may not be one of the ids requested above.
    generated_contexts.append([
        document.page_content
        for document in output_state["categorized_documents"][document_id].get_all()
    ])
    generated_answers.append(output_state["generated_answer"])

# Write the collected results back as the "contexts" and "answer" columns.
eval_set = eval_set.map(
    lambda row, index: {
        "contexts": generated_contexts[index],
        "answer": generated_answers[index],
    },
    with_indices=True,
)

NameError: name 'eval_set' is not defined

In [23]:
# Load the "english_v2" configuration of the amnesty_qa dataset.
# `trust_remote_code=True` is passed through exactly as in the original call.
amnesty_qa = load_dataset(
    path="explodinggradients/amnesty_qa",
    name="english_v2",
    trust_remote_code=True,
)

Repo card metadata block was not found. Setting CardData to empty.


In [24]:
# Keep only the first row of the "eval" split, then display the resulting dataset.
eval_set_2 = amnesty_qa["eval"].select(
    indices=range(1),
)
eval_set_2

Dataset({
    features: ['question', 'ground_truth', 'answer', 'contexts'],
    num_rows: 1
})

In [25]:
# Score `eval_set_2` on four metrics, using `critic_llm` as the judging model and
# `embeddings` as the embedding model.
result = evaluate(
    metrics=[
        metrics.faithfulness,
        metrics.answer_relevancy,
        metrics.context_recall,
        metrics.context_precision,
        # Not enabled: answer_correctness, context_relevancy, context_entity_recall.
    ],
    embeddings=embeddings,
    llm=critic_llm,
    dataset=eval_set_2,
)

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  for attr in assigned:
Task was destroyed but it is pending!
task: <Task pending name='Task-369' coro=<as_completed.<locals>.sema_coro() running at /usr/local/lib/python3.10/dist-packages/ragas/executor.py:37> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[as_completed.<locals>._on_completion() at /usr/lib/python3.10/asyncio/tasks.py:558]>
Task was destroyed but it is pending!
task: <Task pending name='Task-52' coro=<as_completed.<locals>.sema_coro() running at /usr/local/lib/python3.10/dist-packages/ragas/executor.py:38> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[as_completed.<locals>._on_completion() at /usr/lib/python3.10/asyncio/tasks.py:558]>
Task was destroyed but it is pending!
task: <Task pending name='Task-55' coro=<as_completed.<locals>.sema_coro() running at /usr/local/lib/python3.10/dist-packages/ragas/executor.py:38> wait_for=<Future pending cb=[Task.task_wakeup()]> cb=[as_completed.<locals>._on_completion() at /usr/lib/python3.10/asyncio/tasks.py:558]>


In [26]:
# Display the per-metric scores held in `result` as the cell output.
result

{'faithfulness': 0.5714, 'answer_relevancy': 1.0000, 'context_recall': 1.0000, 'context_precision': 1.0000}