# *Import packages and modules*

In [None]:

from pydantic import BaseModel
from typing import Optional, List
import random
import numpy as np
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter
from custom_models import CustomLLModel, CustomEmbeddingModel
from synthetic_dataset_generate import DeepEvalSynthesizer
from utils.llm_con import get_chat_openai

# Chat model handle obtained from the project helper; reused by every
# prompt-driven cell below.
llm = get_chat_openai()

# Synthesizer built from the project's LLM and embedding adapters.
deep_e = DeepEvalSynthesizer(
    CustomLLModel(llm),
    CustomEmbeddingModel(),
)

# Separate embedding adapter instance used for the manual walkthrough.
embeddings = CustomEmbeddingModel()

# *Chunking*

In [None]:

# Single source of truth for the document being chunked. TextLoader reads the
# file as plain text, so this must be the .txt example (the previous .docx
# path was never used and would not have been loadable as text).
path = '../../Datasets/txt_example.txt'

# Token-based splitting with no overlap: each chunk holds up to 1000 tokens.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
loader = TextLoader(path)
raw_chunks = loader.load_and_split(text_splitter)

In [None]:
# Display the chunks returned by the loader/splitter for inspection.
raw_chunks


# *Embedding and Context Generation*

In [None]:
# Raw text of every chunk, in the same order as `raw_chunks`.
content = [rc.page_content for rc in raw_chunks]

# Keep the model and its output under different names. Previously the model
# variable `embeddings` was overwritten with its own result, so re-running
# this cell failed with AttributeError (the result has no `embed_texts`).
embedding_model = CustomEmbeddingModel()
embeddings = embedding_model.embed_texts(content)

In [None]:

# Pick one chunk at random as the reference, then gather every *other* chunk
# whose embedding is close to it (cosine similarity) as additional context.
if len(embeddings) == 0:
    raise ValueError("No embeddings available: the document produced no chunks.")

reference_index = random.randint(0, len(embeddings) - 1)
reference_embedding = embeddings[reference_index]
contexts = [content[reference_index]]

similarity_threshold = 0.7
# The reference norm does not change between iterations, so compute it once.
reference_norm = np.linalg.norm(reference_embedding)

similar_indices = []
for i, embedding in enumerate(embeddings):
    if i == reference_index:
        # The reference chunk is already the first context entry; comparing it
        # with itself always yields similarity 1.0 and would duplicate it.
        continue
    norm = reference_norm * np.linalg.norm(embedding)
    if norm == 0:
        # Cosine similarity is undefined for a zero vector; treat as dissimilar.
        continue
    similarity = np.dot(reference_embedding, embedding) / norm
    if similarity >= similarity_threshold:
        similar_indices.append(i)

contexts.extend(content[i] for i in similar_indices)

# *Query Generation*

In [None]:
# NOTE(review): ChatOpenAI is not referenced anywhere in the visible code;
# the cell uses the `llm` object created earlier instead — confirm whether
# this import is still needed.
from langchain_openai import ChatOpenAI

# Prompt asking the model to produce candidate inputs (questions or
# statements) answerable from the selected contexts. `contexts` is
# interpolated using the list's string representation.
prompt = f"""I want you act as a copywriter. Based on the given context,
which is list of strings, please generate a list of JSON objects
with a `input` key. The `input` can either be a question or a
statement that can be addressed by the given context.

contexts:
{contexts}"""

# Send the prompt to the chat model; the reply text is read from `.content`.
query = llm.invoke(prompt)

In [None]:
# Print the text of the model's reply.
print(query.content)

# *Query Evolution*

In [None]:
# Hand-picked example query used as the starting point for evolution.
example_generated_query = "what is recipe of Apple Turnovers"
context = contexts
original_input = example_generated_query
num_evolution_steps = 3

# The templates below are deliberately plain strings, NOT f-strings. The
# `{context}` and `{original_input}` placeholders must survive until
# `evolve_query` fills them in on every step. As f-strings they were
# interpolated once, right here, so each evolution step re-sent the original
# query instead of the output of the previous step.
multi_context_template = """
I want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`.

1. `Input` should require information from all `Context` elements.
2. `Rewritten Input` must be concise and fully answerable from `Context`.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` should not exceed 15 words.

Context: {context}
Input: {original_input}
Rewritten Input:
"""

reasoning_template = """
I want you to rewrite the given `input` so that it explicitly requests multi-step reasoning.

1. `Rewritten Input` should require multiple logical connections or inferences.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.

Context: {context}
Input: {original_input}
Rewritten Input:
"""

hypothetical_scenario_template = """
I want you to rewrite the given `input` to incorporate a hypothetical or speculative scenario.

1. `Rewritten Input` should encourage applying knowledge from `Context` to deduce outcomes.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.

Context: {context}
Input: {original_input}
Rewritten Input:
"""

# Pool from which one template is drawn at random for each evolution step.
evolution_templates = [multi_context_template, reasoning_template, hypothetical_scenario_template]


def evolve_query(original_input: str, context, steps: int) -> str:
    """Evolve a query by rewriting it repeatedly with randomly chosen templates.

    Each step draws one template from the module-level ``evolution_templates``,
    fills any ``{context}`` / ``{original_input}`` placeholders it contains,
    sends the prompt to the module-level ``llm`` and uses the reply as the
    input of the next step.

    Args:
        original_input: Query to start from.
        context: Context handed to the model; inserted via ``str(context)``.
        steps: Number of rewrite rounds. With zero (or a negative value) the
            LLM is never called and ``original_input`` is returned unchanged.

    Returns:
        The ``content`` of the model reply from the last step.
    """
    import re

    placeholder = re.compile(r"\{(context|original_input)\}")
    context_text = str(context)  # invariant across steps
    current_input = original_input
    for _ in range(steps):
        values = {"context": context_text, "original_input": current_input}
        chosen_template = random.choice(evolution_templates)
        # Substitute both placeholders in a single pass. Chained str.replace
        # calls would re-scan the context text that was just inserted and
        # clobber any literal "{original_input}" occurring inside it.
        evolved_prompt = placeholder.sub(
            lambda match: values[match.group(1)], chosen_template
        )
        print("Prompt sent to LLM:\n", evolved_prompt)

        response = llm.invoke(evolved_prompt)
        current_input = response.content

    return current_input


# Run the evolution loop on the example query against the selected contexts.
evolved_query = evolve_query(
    original_input=original_input,
    context=context,
    steps=num_evolution_steps,
)

In [None]:
# Display the query returned by the evolution loop.
evolved_query

# *Expected Output Generation*

In [None]:
# Plain (non-f) template: the placeholders are filled exactly once, below.
# Previously this was an f-string, so the values were interpolated immediately
# and the follow-up `.replace` calls were dead code that could still rewrite
# context text containing a literal "{evolved_query}".
expected_output_template = """
I want you to generate an answer for the given `input`. This answer has to be factually aligned to the provided context.

Context: {context}
Input: {evolved_query}
Answer:
"""

# str.format substitutes both fields in one pass and never re-scans the
# inserted values.
prompt = expected_output_template.format(context=context, evolved_query=evolved_query)

expected_output = llm.invoke(prompt)

In [None]:
# Print the text of the generated answer.
print(expected_output.content)

In [None]:


class SyntheticData(BaseModel):
    """One synthetic evaluation record: a query, its answer and its context."""

    # The (evolved) input query.
    query: str
    # Reference answer. The explicit `None` default keeps this field truly
    # optional on every pydantic version: without it, pydantic v1 implies a
    # `None` default while pydantic v2 treats the field as required.
    expected_output: Optional[str] = None
    # Context passages the query and the answer are grounded in.
    context: List[str]


def as_str(x):
    """Return ``x.content`` when ``x`` has that attribute, otherwise ``x`` itself."""
    return getattr(x, "content", x)


# Bundle the evolved query, the generated answer (unwrapped to its text when
# it carries a `content` attribute) and the contexts into one record.
synthetic_data = SyntheticData(
    context=context,
    expected_output=as_str(expected_output),
    query=evolved_query,
)

# The dataset starts out holding just this single record.
synthetic_dataset = [synthetic_data]

In [None]:
# Display the context stored on the first record.
synthetic_dataset[0].context

# *Generating Synthetic Datasets Using DeepEval*

In [None]:
# Plain-text example document handed to the synthesizer.
path = '../../Datasets/txt_example.txt'

# NOTE(review): `document_paths` is plural but receives a single string here.
# Confirm whether the project wrapper accepts a bare path or expects a list.
result = deep_e.generate_goldens_from_documents(document_paths=path)

In [None]:
# Display the synthesizer's dataframe view (presumably of the generated goldens — TODO confirm).
deep_e.to_dataframe()