### LLMs

#### AWS

In [1]:
import boto3
import os

# Bedrock runtime client for the us-east-1 region.
# Each credential is looked up in the process environment; a variable that is
# not set is forwarded to boto3 as None.
bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    aws_session_token=os.environ.get("AWS_SESSION_TOKEN"),
)

from langchain_aws import ChatBedrock

# Identifier of the Bedrock model used as the chat LLM in this notebook.
model_id = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"

# LangChain chat model that issues its requests through `bedrock_runtime`.
# NOTE(review): the variable is called "claude_3_sonnet" while model_id names a
# "claude-3-5-sonnet" release; consider renaming it together with its usages.
claude_3_sonnet = ChatBedrock(model_id=model_id, client=bedrock_runtime)

### Document Loaders

In [2]:
# Relative path(s) of the source PDF(s) for this notebook.
file_paths = ["../data/OJ_L_202401689_EN_TXT.pdf"]

In [3]:
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader

# Load every file matching "*.pdf" in the data directory.
# No loader_kwargs are passed, so the loader's default settings apply; the
# previously tried options (strategy="hi_res", mode="elements",
# chunking_strategy="by_title") remain disabled.
path = "../data"
loader = DirectoryLoader(path, glob="*.pdf")
docs = loader.load()

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Text splitter configured with a chunk size of 1200, no overlap between
# chunks, and an explicit separator list.
child_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1200,
    chunk_overlap=0,
)

# Result of splitting the loaded documents; this is what the test-set
# generator consumes later in the notebook.
docs_splitter = child_splitter.split_documents(docs)

### Test Dataset Generation

In [5]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face embedding model; encode_kwargs asks for normalized embeddings
# and an encoding batch size of 8.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs=dict(normalize_embeddings=True, batch_size=8),
)

# Wrap the LangChain embedding model and chat model in the ragas adapters
# expected by the test-set generator.
generator_embeddings = LangchainEmbeddingsWrapper(embeddings)
generator_llm = LangchainLLMWrapper(claude_3_sonnet)

In [6]:
from ragas.testset import TestsetGenerator
from ragas.cost import CostCallbackHandler
from ragas import RunConfig

# Run configuration for the ragas test-set generation.
#
# An earlier run of this cell was rejected by Bedrock with ThrottlingException
# ("Too many requests"), which left nodes without summaries. max_workers caps
# the number of concurrent LLM calls so the retry/backoff settings below can
# absorb the service rate limit; raise it if the account quota allows.
run_config = RunConfig(
    timeout=180,
    max_retries=10,
    max_wait=180,
    max_workers=2,
    seed=42,
)

# Build the generator from the wrapped LLM and embedding model, then create a
# synthetic test set of 50 samples from the split documents.
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(
    docs_splitter,
    testset_size=50,
    run_config=run_config,
)

Applying SummaryExtractor:   0%|          | 0/570 [00:00<?, ?it/s]

Error raised by bedrock service
Traceback (most recent call last):
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/langchain_aws/llms/bedrock.py", line 918, in _prepare_input_and_invoke
    response = self.client.invoke_model(**request_options)
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/botocore/client.py", line 570, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/botocore/context.py", line 124, in wrapper
    return func(*args, **kwargs)
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/botocore/client.py", line 1031, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.errorfactory.ThrottlingException: An error occurred (ThrottlingException) when calling the InvokeModel operation (reached max retries: 4): Too many requests, please wait befo

Applying CustomNodeFilter:   0%|          | 0/666 [00:00<?, ?it/s]

Node 465ca2c1-97b0-4a9a-8c54-a428c28a6928 does not have a summary. Skipping filtering.
Node fca9961b-9a73-4c3e-a101-37cdca64baa8 does not have a summary. Skipping filtering.
Node d6ede271-0886-4039-bb9e-039a49ddae7a does not have a summary. Skipping filtering.
Node f23cfb2b-0af5-458a-801f-6bef729e6e70 does not have a summary. Skipping filtering.
Node 937ad79c-b652-4848-a8d5-952289ac40e3 does not have a summary. Skipping filtering.
Node b131a487-3e7d-4b8d-a42a-d28ba820e954 does not have a summary. Skipping filtering.
Node 3819ca15-418e-4f67-b2a5-bb27b86e1297 does not have a summary. Skipping filtering.
Node e39714b8-294a-45cf-ba98-fecbbbaf5312 does not have a summary. Skipping filtering.
Node 443cb018-23ad-4cb2-a137-bb1873c4f1e4 does not have a summary. Skipping filtering.
Node 48a3de25-28d1-464d-aa86-6c85ecde07a0 does not have a summary. Skipping filtering.
Node 40affdc6-0076-4fc6-8931-8af8610b65e0 does not have a summary. Skipping filtering.
Node d2733ec6-9b7c-468b-9602-08c3ae375b41 d

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/1902 [00:00<?, ?it/s]

Error raised by bedrock service
Traceback (most recent call last):
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/langchain_aws/llms/bedrock.py", line 918, in _prepare_input_and_invoke
    response = self.client.invoke_model(**request_options)
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/botocore/client.py", line 570, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/botocore/context.py", line 124, in wrapper
    return func(*args, **kwargs)
  File "/Users/pmalla01/Desktop/assignment_clifford/env/lib/python3.10/site-packages/botocore/client.py", line 1031, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.errorfactory.ThrottlingException: An error occurred (ThrottlingException) when calling the InvokeModel operation (reached max retries: 4): Too many requests, please wait befo

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Convert the generated test set to a pandas DataFrame and show it as the cell output.
dataset.to_pandas()

In [None]:
# Persist the generated test set as CSV.
# The target directory is created first because to_csv fails with OSError
# when the parent directory of the output path does not exist.
os.makedirs("../datasets", exist_ok=True)
dataset.to_pandas().to_csv("../datasets/synthetic_dataset.csv")