In [None]:
# Databricks notebook source
# Install (or upgrade to the latest release of) the packages used by the
# evaluation cells below. `-q` keeps pip's output short.
!pip install -q -U tqdm

# COMMAND ----------

!pip install -q -U datasets

# COMMAND ----------

# NOTE(review): `--upgrade` and `-U` are the same pip flag, so one of the two
# is redundant here.
!pip install --upgrade -q -U ragas

# COMMAND ----------

# Restart the notebook's Python process — presumably so the packages
# installed above are the ones that get imported in the next cell.
dbutils.library.restartPython()

# COMMAND ----------

from transformers import pipeline
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from deepeval import assert_test
from deepeval.metrics import (
    GEval, AnswerRelevancyMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)
import openai
from openai import OpenAI, AzureOpenAI
import os
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
#from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)
from langchain.chains import LLMChain
from langchain.evaluation import load_evaluator, EvaluatorType
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader, TextLoader
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_chroma import Chroma
from langchain_community.vectorstores import DocArrayInMemorySearch
import pandas as pd
import giskard
import os
from giskard.rag import KnowledgeBase, generate_testset, evaluate
from giskard.llm.client.openai import OpenAIClient
from giskard.llm import set_llm_model
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain.chains import RetrievalQA
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from tqdm import tqdm
from datasets import Dataset
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

# COMMAND ----------

# Set up the environment and the model credentials.
# NOTE(review): the endpoint is read from scope "kvsecretscope" while the API
# key is read from "akvsecretscope" — confirm both scope names are intended.
environment = dbutils.secrets.get(scope="kvsecretscope", key="Environment")
client_id = dbutils.secrets.get("kvsecretscope", "OpenAI")  # used as the Azure endpoint below
client_secret = dbutils.secrets.get("akvsecretscope", "APIKey")  # used as the API key below

# Module-level settings of the `openai` package.
openai.api_type = "azure"
openai.api_version = "2023-03-1"
openai.azure_endpoint = client_id
openai.api_key = client_secret

# The same values exported as environment variables, so that clients created
# without explicit endpoint/version arguments can read them from there.
os.environ.update(
    {
        'AZURE_OPENAI_API_KEY': client_secret,
        'AZURE_OPENAI_ENDPOINT': client_id,
        'OPENAI_API_VERSION': "2023-03-1",
        'OPENAI_API_KEY': client_secret,
        'DEEPEVAL_API_KEY': client_secret,
    }
)

# COMMAND ----------

# Chat model shared by the RAG chains and by the evaluators further down.
# It is built from the endpoint, key and API version configured above and
# targets the "gpt-35-turbo" deployment.
custom_model = AzureChatOpenAI(
    azure_endpoint=client_id,
    azure_deployment="gpt-35-turbo",
    openai_api_key=client_secret,
    openai_api_version=openai.api_version,
)

# COMMAND ----------

# Load the data, split it into chunks, embed the chunks and store them in a
# vector store.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=5)
# The loader fetches the page over HTTP and therefore needs an absolute URL;
# without the scheme the request is rejected before it is even sent.
loader = WebBaseLoader("https://yahoo.com/en-nl")
documents = loader.load_and_split(text_splitter)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(documents, embedding_function)
# NOTE(review): this second, in-memory index is not queried in this cell, and
# the name `vectorstore` is reassigned in the parent-document-retriever cell —
# confirm whether it is still needed.
vectorstore = DocArrayInMemorySearch.from_documents(documents, embedding=embedding_function)
# The retriever used by all RAG chains below is backed by the Chroma index.
retriever = db.as_retriever()
# Quick retrieval check against the freshly built index.
retriever.get_relevant_documents("What is ML engineering?")

# COMMAND ----------

def answer_fn(question, history=None):
    """Answer ``question`` with the notebook-level ``chain``.

    ``history`` is accepted but not used. ``chain`` is a module-level name
    that is looked up when this function is called, so it must be bound to a
    chain accepting ``{"question": ...}`` at that moment.
    """
    payload = {"question": question}
    return chain.invoke(payload)

# Giskard evaluation, part 1: turn the document chunks into a knowledge base
# and let Giskard generate a small question test set from it.
set_llm_model('gpt-35-turbo')
chunk_texts = [doc.page_content for doc in documents]
df = pd.DataFrame(chunk_texts, columns=["text"])
knowledge_base = KnowledgeBase.from_pandas(df, columns=["text"])
testset = generate_testset(
    knowledge_base,
    num_questions=5,
    agent_description="A chatbot answering questions about the Machienlearning and AI",
)
test_set_df = testset.to_pandas()

# Preview the first three generated samples.
preview = test_set_df.head(3)
for number, (_, sample) in enumerate(preview.iterrows(), start=1):
    print(f"Question {number}: {sample['question']}")
    print(f"Reference answer: {sample['reference_answer']}")
    print("Reference context:")
    print(sample['reference_context'])
    print("******************", end="\n\n")

# Keep the generated test set on disk.
testset.save("test-set.jsonl")

# Prompt shared by the RAG chains: the model is asked to answer from the
# supplied context and to fall back to a fixed reply otherwise.
template = """
Answer the question based on the context below. If you can't
answer the question, reply "Ask my Boss".
Context: {context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# print(prompt.format(context="Here is some context", question="Here is a question"))

# Small accessors reused by both chains.
question_of = itemgetter("question")
context_of = itemgetter("context")

# Chain used for the RAGAS runs: yields the model's message under "response"
# together with the retrieved documents under "context".
retrieval_augmented_qa_chain = (
    {"context": question_of | retriever, "question": question_of}
    | RunnablePassthrough.assign(context=context_of)
    | {"response": prompt | custom_model, "context": context_of}
)

# Chain used by answer_fn: same retrieval step, but the output is parsed
# down to a plain string.
chain = (
    {"context": question_of | retriever, "question": question_of}
    | prompt
    | custom_model
    | StrOutputParser()
)
chain.invoke({"question": "What is RAG LLM?"})

# Giskard evaluation, part 2: compare the answers produced through answer_fn
# with the reference answers in the generated test set.
report = giskard.rag.evaluate(answer_fn, knowledge_base=knowledge_base, testset=testset)
report.correctness_by_question_type()
# display(report)

# COMMAND ----------

# Ground-truth generation, step 1: ask the model for one context-specific
# question per document chunk. The collected question/context pairs are stored
# in `qac_triples`.
question_schema = ResponseSchema(
    name="question",
    description="A question about the context",
)
question_response_schemas = [question_schema]
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()
question_generation_llm = AzureChatOpenAI(model="gpt-35-turbo", openai_api_key=client_secret)

# Pass-through prompt: the already formatted messages are handed to the model
# as the single "{content}" variable.
bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

qa_template = """\
You are a University Professor creating a test for advanced students. For each context,
create a question that is specific to the context. Avoid creating generic or general questions.

question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

# NOTE(review): qa_template has no {format_instructions} placeholder, so the
# value passed below presumably never reaches the rendered prompt — confirm
# whether the parser's instructions were meant to be part of the template.
messages = prompt_template.format_messages(
    context=documents[0],
    format_instructions=format_instructions
)

question_generation_chain = bare_template | question_generation_llm

# One-off run on the first chunk to check the chain and the parser end to end.
response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

qac_triples = []

for text in tqdm(documents[:10]):
    messages = prompt_template.format_messages(
        context=text,
        format_instructions=format_instructions
    )
    response = question_generation_chain.invoke({"content" : messages})
    try:
        output_dict = question_output_parser.parse(response.content)
    except Exception as e:
        # Best effort: a chunk whose reply cannot be parsed is skipped, but
        # the skip is reported so a shrinking data set does not go unnoticed.
        # tqdm.write keeps the message from garbling the progress bar.
        tqdm.write(f"Skipping chunk: could not parse the generated question ({e!r})")
        continue
    output_dict["context"] = text
    qac_triples.append(output_dict)

# Ground-truth generation, step 2: ask the model for an answer to every
# generated question, then persist question/context/ground_truth rows.
answer_generation_llm = AzureChatOpenAI(model="gpt-35-turbo", openai_api_key=client_secret)

answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

answer_qa_template = """\
You are a University Professor creating a test for advanced students. For each question and context, create an answer.

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""

answer_prompt_template = ChatPromptTemplate.from_template(template=answer_qa_template)

# NOTE(review): answer_qa_template has no {format_instructions} placeholder,
# so the value passed below presumably never reaches the rendered prompt.
messages = answer_prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

answer_generation_chain = bare_template | answer_generation_llm

# One-off run on the first triple to check the chain and the parser end to end.
answer_response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(answer_response.content)

for triple in tqdm(qac_triples):
    messages = answer_prompt_template.format_messages(
        context=triple["context"],
        question=triple["question"],
        format_instructions=format_instructions
    )
    response = answer_generation_chain.invoke({"content" : messages})
    try:
        output_dict = answer_output_parser.parse(response.content)
    except Exception as e:
        # Best effort: report the failure instead of hiding it. The triple is
        # left without an "answer" key and is filtered out right below.
        tqdm.write(f"No answer for {triple['question']!r}: could not parse the reply ({e!r})")
        continue
    triple["answer"] = output_dict["answer"]

# Keep only complete triples. An unanswered one would otherwise show up as a
# missing ground truth in the data frame built from this list.
qac_triples = [triple for triple in qac_triples if "answer" in triple]
if not qac_triples:
    raise ValueError(
        "No question/context/answer triple could be generated; "
        "cannot build the ground-truth set."
    )

ground_truth_qac_set = pd.DataFrame(qac_triples)
# The context column holds document objects; keep only their text.
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})


eval_dataset = Dataset.from_pandas(ground_truth_qac_set)
eval_dataset.to_csv("./groundtruth_eval_dataset.csv")

# COMMAND ----------

def create_ragas_dataset(rag_pipeline, eval_dataset):
    """Run ``rag_pipeline`` on every question of ``eval_dataset`` and collect the outcome.

    Each row of ``eval_dataset`` is read through its ``"question"`` and
    ``"ground_truth"`` entries. The pipeline is invoked with
    ``{"question": ...}``; its result is read as a mapping whose
    ``"response"`` entry has a ``.content`` attribute and whose ``"context"``
    entry is an iterable of objects with ``.page_content``.

    Returns a ``Dataset`` with the columns ``question``, ``answer``,
    ``contexts`` and ``ground_truths``.
    """

    def _to_record(sample):
        # One pipeline call per sample; the answer text and the retrieved
        # chunk texts are taken from the pipeline output.
        output = rag_pipeline.invoke({"question": sample["question"]})
        return {
            "question": sample["question"],
            "answer": output["response"].content,
            "contexts": [doc.page_content for doc in output["context"]],
            "ground_truths": [sample["ground_truth"]],
        }

    records = [_to_record(sample) for sample in tqdm(eval_dataset)]
    return Dataset.from_pandas(pd.DataFrame(records))

from ragas import evaluate

def evaluate_ragas_dataset(ragas_dataset, llm=None, embeddings=None):
    """Score ``ragas_dataset`` with this notebook's standard set of RAGAS metrics.

    Args:
        ragas_dataset: Dataset to score, e.g. the output of
            ``create_ragas_dataset``.
        llm: Optional model forwarded to ``ragas.evaluate``. ``None`` (the
            default) leaves the choice to RAGAS.
        embeddings: Optional embedding model forwarded to ``ragas.evaluate``.
            ``None`` (the default) leaves the choice to RAGAS.

    Returns:
        The result object returned by ``ragas.evaluate``.
    """
    # Imported locally under an alias: the bare name ``evaluate`` is bound by
    # several imports in this notebook (giskard, ragas, deepeval), so the
    # global would depend on which cell ran last.
    from ragas import evaluate as ragas_evaluate

    return ragas_evaluate(
        ragas_dataset,
        # Each metric is listed once; a repeated entry only repeats the work.
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
            answer_correctness,
            answer_similarity,
        ],
        llm=llm,
        embeddings=embeddings,
    )

# COMMAND ----------

# Run the basic RAG chain over the ground-truth questions and keep a copy of
# the resulting data set on disk.
basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, eval_dataset)
basic_qa_ragas_dataset.to_csv("./basic_qa_ragas_dataset.csv")

# COMMAND ----------

# The bare name `evaluate` is bound by several imports in this notebook
# (giskard, ragas, deepeval); the alias makes sure the RAGAS one is used no
# matter which cell ran last.
from ragas import evaluate as ragas_evaluate

# basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)
# Each metric is listed once; a repeated entry only repeats the work.
metrics = [
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_correctness,
    answer_similarity,
]

# Score the data set with the Azure chat model as judge and the notebook's
# sentence-transformer embeddings.
result = ragas_evaluate(
    basic_qa_ragas_dataset, metrics=metrics, llm=custom_model, embeddings=embedding_function
)

result

# COMMAND ----------

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

def create_qa_chain(retriever):
    """Build a question-answering chain on top of ``retriever``.

    The chain takes ``{"question": ...}`` and yields a mapping with the
    model's message under ``"response"`` and the retrieved documents under
    ``"context"``. It uses the notebook-level ``prompt``, which is looked up
    when this function is called.
    """
    primary_qa_llm = AzureChatOpenAI(model="gpt-35-turbo", openai_api_key=client_secret, temperature=0)

    pick_question = itemgetter("question")
    pick_context = itemgetter("context")

    # Stage 1: retrieve documents for the question and keep the question.
    retrieve = {"context": pick_question | retriever, "question": pick_question}
    # Stage 2: re-assign the context onto the running mapping.
    keep_context = RunnablePassthrough.assign(context=pick_context)
    # Stage 3: answer with the model and pass the context through.
    respond = {"response": prompt | primary_qa_llm, "context": pick_context}

    return retrieve | keep_context | respond


# Parent-document retrieval: small child chunks are embedded for search,
# while the larger parent chunks are kept in the docstore.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

vectorstore = Chroma(collection_name="split_parents", embedding_function=embedding_function)

store = InMemoryStore()

parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
parent_document_retriever_qa_chain = create_qa_chain(parent_document_retriever)
# Quick check that the new chain answers at all.
parent_document_retriever_qa_chain.invoke({"question" : "What is RAG?"})["response"].content
pdr_qa_ragas_dataset = create_ragas_dataset(parent_document_retriever_qa_chain, eval_dataset)
pdr_qa_ragas_dataset.to_csv("./pdr_qa_ragas_dataset.csv")

# The bare name `evaluate` is bound by several imports in this notebook
# (giskard, ragas, deepeval); the alias makes sure the RAGAS one is used no
# matter which cell ran last.
from ragas import evaluate as ragas_evaluate

# Each metric is listed once; a repeated entry only repeats the work.
metrics = [
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_correctness,
    answer_similarity,
]

pdr_qa_result = ragas_evaluate(
    pdr_qa_ragas_dataset, metrics=metrics, llm=custom_model, embeddings=embedding_function
)


# COMMAND ----------

pdr_qa_result

# COMMAND ----------

# Using DeepEval
class AzureOpenAI(DeepEvalBaseLLM):
    """Adapter that exposes a chat model through the ``DeepEvalBaseLLM`` interface.

    NOTE(review): this class has the same name as the ``AzureOpenAI`` client
    imported from ``openai`` at the top of the file and replaces that name
    from here on.
    """

    def __init__(self, model):
        # The wrapped model is used through ``invoke``/``ainvoke`` and the
        # ``.content`` attribute of what those calls return.
        self.model = model

    def load_model(self):
        """Return the wrapped model."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the model and return the content of its reply."""
        reply = self.load_model().invoke(prompt)
        return reply.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronous counterpart of ``generate``."""
        reply = await self.load_model().ainvoke(prompt)
        return reply.content

    def get_model_name(self):
        """Return the fixed display name of this model."""
        return "Custom Azure OpenAI Model"

# Judge model for all DeepEval metrics below.
azure_openai = AzureOpenAI(model=custom_model)

answer_rel_metric = AnswerRelevancyMetric(threshold=0.7, model=azure_openai)
# evaluates the reranker in retriever
contextual_precision = ContextualPrecisionMetric(model=azure_openai)
# evaluates the embedding model in retriever
contextual_recall = ContextualRecallMetric(model=azure_openai)
# evaluates the text chunk size and top-K of retriever
contextual_relevancy = ContextualRelevancyMetric(model=azure_openai)

# NOTE(review): the criteria and steps below refer to an 'expected output',
# but evaluation_params only lists INPUT and ACTUAL_OUTPUT and the test case
# carries no expected_output — confirm this is intended.
correctness_steps = [
    "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
    "You should also heavily penalize omission of detail",
    "Vague language, or contradicting OPINIONS, are OK"
]
correctness_metric = GEval(
    name="Correctness",
    model=azure_openai,
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_steps=correctness_steps,
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

# Single hand-written test case used to try the metric out.
actual_output = "We offer a 30-day full refund at no extra cost."
test_case = LLMTestCase(
    input="What is Desicion tree?",  # "The dog chased the cat up the tree, who ran up the tree?"
    actual_output=actual_output,  # , expected_output="The cat."
)

correctness_metric.measure(test_case)
# answer_rel_metric.measure(test_case)
# print(correctness_metric.score)
# print(correctness_metric.reason)
# print(metric.score)
# print(metric.reason)
# correctness_metric.is_successful()

from deepeval import evaluate

# Answer a couple of queries with a RetrievalQA chain and score every answer
# with the DeepEval correctness metric.
qa = RetrievalQA.from_chain_type(
    llm=custom_model,
    chain_type="stuff",
    retriever=retriever,
)
queries = ["Tell me about the Eiffel Tower.", "What is RAG?"]

evaluation_results = []
for query in queries:
    response = qa.run(query)
    # NOTE(review): the retrieval context is a fixed placeholder string, not
    # what the retriever actually returned for the query.
    test_case = LLMTestCase(
        input=query,
        actual_output=response,
        retrieval_context=[
            """If you are an AI engineer you would know that this is teh best solution ever"""
        ],
    )
    # evaluation = g_eval(response, query)
    evaluation = evaluate([test_case], [correctness_metric])
    evaluation_results.append((query, response, evaluation))

# The loop target is deliberately not named `result`: that name holds the
# RAGAS scores of the basic RAG chain from an earlier cell and must not be
# overwritten here.
for query, response, evaluation in evaluation_results:
    print(f"New Query: {query}")
    print(f"New Response: {response}")
    print(f"New Evaluation: {evaluation}\n")
    # print(f"Correctness score: {score:.2f}")

# COMMAND ----------

# Criteria-based evaluation with LangChain's built-in evaluators.
#
# The names used in this cell are chosen so that they do not rebind
# `template`, `prompt` or `chain`: answer_fn and create_qa_chain read those
# module-level names when they are called and would break once this cell ran.


def _one_sentence_per_line(text):
    """Return ``text`` without line breaks, one '.'-separated segment per line.

    Only empty segments are dropped, so a last sentence without a trailing
    period (or a reply with no period at all) is still shown.
    """
    segments = text.replace('\n', '').split('.')
    return '\n'.join(segment for segment in segments if segment.strip())


def _print_evaluation(question, prediction, eval_result):
    """Print one question, the model's answer and the evaluator's verdict."""
    print("\nPROMPT :", question)
    print("\nRESULT :", _one_sentence_per_line(prediction))
    print("\nVALUE :", eval_result['value'])
    print("\nSCORE :", eval_result['score'])
    print("\nREASON :", _one_sentence_per_line(eval_result['reasoning']))


teacher_system_template = "You are a teacher. Give a brief answer to anything asked"
system_message_prompt = SystemMessagePromptTemplate.from_template(teacher_system_template)
human_template = "{text}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
teacher_chain = LLMChain(llm=custom_model, prompt=chat_prompt)

# Reference-free criteria: each criterion is checked on its own questions.
prompts = {'conciseness':['What is machien learning?','What is the capital of the Netherlands?'],
           'relevance':['what are the ingredients of  pizza margarita'],
           'coherence':['What is machien learning?','What is the capital of the Netherlands?']}
for criteria, questions in prompts.items():
    evaluator = load_evaluator(EvaluatorType.CRITERIA, llm=custom_model, criteria=criteria)
    print(f"\n**{criteria.upper()}**")
    for question in questions:
        prediction = teacher_chain.run(question)
        eval_result = evaluator.evaluate_strings(
            prediction=prediction,
            input=question
        )
        _print_evaluation(question, prediction, eval_result)

# Labeled criteria: the prediction is judged against a reference answer.
# The stray backslash that used to sit inside the second reference answer was
# an invalid escape sequence and put a literal backslash into the reference.
prompts2 = {'correctness':{'prompt':"How many players are required for chess?",'answer':'2'},
           'relevance':{'prompt':"What is a data science?",'answer':'Data science is an interdiciplinary field that combines math, statistics, programming, advanced analytics and artificial intelligence(AI)'}}
for criteria, labeled in prompts2.items():
    evaluator = load_evaluator("labeled_criteria", llm=custom_model, criteria=criteria)
    print(f"\n**{criteria.upper()}**")
    prediction = teacher_chain.run(labeled['prompt'])
    eval_result = evaluator.evaluate_strings(
        prediction=prediction,
        input=labeled['prompt'],
        reference=labeled['answer']
    )
    _print_evaluation(labeled['prompt'], prediction, eval_result)

# COMMAND ----------

# MAGIC %environment
# MAGIC "client": "1"
# MAGIC "base_environment": ""
