In [None]:
# Databricks notebook source
from transformers import pipeline
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from deepeval import assert_test
from deepeval.metrics import (
    GEval, AnswerRelevancyMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)
import openai
from openai import OpenAI, AzureOpenAI
import os
from langchain_openai import AzureChatOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# COMMAND ----------

# Read the "Environment" secret from the "kvsecretscope" secret scope.
# NOTE(review): `environment` is not referenced again in the visible part of
# this notebook — confirm whether it is still needed.
environment = dbutils.secrets.get(scope="kvsecretscope", key="Environment")

# COMMAND ----------

# Configure the module-level `openai` client to talk to Azure OpenAI.
openai.api_type = "azure"
# Azure OpenAI API versions are dated `YYYY-MM-DD[-preview]`. The previous
# value "2023-03-1" does not match that format, so requests built with it
# cannot target a real API version. "2023-05-15" is a GA version that serves
# chat completions.
# NOTE(review): confirm the version enabled for the target Azure resource.
openai.api_version = "2023-05-15"
# NOTE: despite their names, `client_id` holds the "OpenAI" secret (used below
# as the Azure endpoint) and `client_secret` holds the "APIKey" secret (used
# as the API key). The names are kept because later cells reference them.
client_id = dbutils.secrets.get("akvsecretscope01", "OpenAI")
client_secret = dbutils.secrets.get("akvsecretscope01", "APIKey")
openai.azure_endpoint = client_id
openai.api_key = client_secret

# COMMAND ----------

# Smoke test: send one chat request to the "gpt-35-turbo" deployment using the
# module-level `openai` configuration, then print the text of the first choice.
completion = openai.chat.completions.create(
    model="gpt-35-turbo",
    messages=[
        {
            "role": "user",
            "content": "What are the differences between Azure Machine Learning and Azure AI services?",
        },
    ],
    temperature=0,
    max_tokens=500,
    n=1,
    stop=None,
)
print(completion.choices[0].message.content)
# To inspect the full response payload instead:
# print(completion.to_json())

# COMMAND ----------

class AzureOpenAI(DeepEvalBaseLLM):
    """Adapter exposing a chat model through the `DeepEvalBaseLLM` interface.

    The wrapped object must provide `invoke(prompt)` and `ainvoke(prompt)`
    whose results carry a `.content` attribute; this notebook passes a
    LangChain `AzureChatOpenAI` instance.

    NOTE: this class shadows the `AzureOpenAI` client imported from `openai`
    at the top of the notebook; after this cell runs, the name refers to this
    adapter.
    """

    def __init__(self, model):
        # Only the wrapped chat model is stored; the base-class initializer
        # is not invoked here.
        self.model = model

    def load_model(self):
        """Return the wrapped chat model."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Synchronously send `prompt` to the model and return the reply text."""
        return self.load_model().invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronously send `prompt` to the model and return the reply text."""
        reply = await self.load_model().ainvoke(prompt)
        return reply.content

    def get_model_name(self):
        """Return the display name reported for this model."""
        return "Custom Azure OpenAI Model"

# COMMAND ----------

# LangChain chat model bound to the "gpt-35-turbo" Azure deployment. It reuses
# the endpoint, API key and API version set up in the OpenAI configuration
# cell earlier in this notebook.
custom_model = AzureChatOpenAI(
    azure_endpoint=client_id,
    azure_deployment="gpt-35-turbo",
    openai_api_key=client_secret,
    openai_api_version=openai.api_version,
)

# Wrap the chat model in the deepeval adapter and run one prompt through it
# as a connectivity check.
azure_openai = AzureOpenAI(model=custom_model)
print(azure_openai.generate("Write me a joke"))

# COMMAND ----------

# Hugging Face question-answering pipeline used by the sample chatbot below.
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-uncased-distilled-squad",
)

# COMMAND ----------

# Sample Chatbot Class using Hugging Face model
class HuggingFaceChatBot:
    """Minimal question-answering bot that answers from one fixed passage.

    Args:
        qa_pipeline: Callable invoked as
            `qa_pipeline(question=..., context=...)` that returns a mapping
            with an `"answer"` key.
        context: Optional passage to answer from. When omitted (or `None`),
            `DEFAULT_CONTEXT` is used.
    """

    # Built with implicit string concatenation rather than backslash
    # continuations inside one literal: the latter embeds the source
    # indentation of every continued line into the passage itself.
    DEFAULT_CONTEXT = (
        "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. "
        "It is named after the engineer Gustave Eiffel, whose company designed and built the tower. "
        "Constructed from 1887 to 1889 as the entrance arch for the 1889 World's Fair, it was initially criticized "
        "by some of France's leading artists and intellectuals for its design, but it has become a global cultural icon "
        "of France and one of the most recognizable structures in the world. "
        "The Eiffel Tower is the most-visited paid monument in the world; 6.91 million people ascended it in 2015. "
        "The tower is 330 meters (1,083 ft) tall, about the same height as an 81-story building, and the tallest structure "
        "in Paris. Its base is square, measuring 125 meters (410 ft) on each side."
    )

    def __init__(self, qa_pipeline, context=None):
        self.qa_pipeline = qa_pipeline
        # `is None` (not truthiness) so an explicitly supplied passage is
        # always honoured.
        self.context = self.DEFAULT_CONTEXT if context is None else context

    def get_response(self, query):
        """Return the pipeline's `"answer"` for `query` against the passage."""
        response = self.qa_pipeline(question=query, context=self.context)
        return response["answer"]

# COMMAND ----------


# Answer-relevancy check for a single hand-written question/answer pair,
# judged by the Azure-backed evaluator model.
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    # Stand-in for the real output of the LLM application under test.
    actual_output="We  dont care",
)
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=azure_openai)

# Alternative that asserts on the result instead of printing it:
# assert_test(test_case, [answer_relevancy_metric])
answer_relevancy_metric.measure(test_case)
# `score` and `reason` are read from the metric after `measure` has run.
print(answer_relevancy_metric.score)
print(answer_relevancy_metric.reason)

# COMMAND ----------

# Second answer-relevancy example: an evasive answer to a question whose
# expected answer is "The cat.".
test_case = LLMTestCase(
    input="The dog chased the cat up the tree, who ran up the tree?",
    actual_output="It depends, some might consider the cat, while others might argue the dog.",
    expected_output="The cat.",
)
metric = AnswerRelevancyMetric(model=azure_openai)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# COMMAND ----------

# Retriever-side metrics, all judged by the same Azure-backed evaluator model.

# Contextual precision: evaluates the reranker in the retriever.
contextual_precision = ContextualPrecisionMetric(model=azure_openai)

# Contextual recall: evaluates the embedding model in the retriever.
contextual_recall = ContextualRecallMetric(model=azure_openai)

# Contextual relevancy: evaluates the text chunk size and top-K of the retriever.
contextual_relevancy = ContextualRelevancyMetric(model=azure_openai)


# COMMAND ----------

# `evaluate` is called below but is not among the names imported at the top of
# the notebook, so this cell previously failed with NameError. Import it here
# so the cell is runnable on its own.
from deepeval import evaluate

# One RAG test case: the generated answer (30 days) disagrees with both the
# expected answer and the retrieved chunk (60 days).
test_case = LLMTestCase(
    input="I'm on an F-1 visa, gow long can I stay in the US after graduation?",
    # the final generation of your RAG pipeline
    actual_output="You can stay up to 30 days after completing your degree.",
    expected_output="You can stay up to 60 days after completing your degree.",
    # the retrieved text chunks during the retrieval step
    retrieval_context=[
        """If you are in the U.S. on an F-1 visa, you are allowed to stay for 60 days after completing
        your degree, unless you have applied for and been approved to participate in OPT."""
    ],
)

# Run the three retriever metrics defined in the previous cell against it.
evaluate(
    test_cases=[test_case],
    metrics=[contextual_precision, contextual_recall, contextual_relevancy],
)

# COMMAND ----------

# G-Eval "Correctness" metric judged by the Azure-backed evaluator model.
correctness_metric = GEval(
    name="Correctness",
    model=azure_openai,
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK",
    ],
    # The criteria and steps compare the actual output against the expected
    # output, so EXPECTED_OUTPUT must be listed here; it was previously
    # omitted, which kept the expected output out of the judged parameters.
    # Test cases measured with this metric therefore need `expected_output`.
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

# Sample case for the metric: an evasive answer versus the expected "The cat.".
test_case = LLMTestCase(
    input="The dog chased the cat up the tree, who ran up the tree?",
    actual_output="It depends, some might consider the cat, while others might argue the dog.",
    expected_output="The cat.",
)

# To score the sample case:
# correctness_metric.measure(test_case)
# print(correctness_metric.score)
# print(correctness_metric.reason)

# COMMAND ----------

# Bot under evaluation, backed by the question-answering pipeline loaded above.
chatbot = HuggingFaceChatBot(qa_pipeline=qa_pipeline)

# Queries sent to the bot in the evaluation loop below.
queries = [
    "Tell me about the Eiffel Tower.",
    "What's the weather like today?",
]

# COMMAND ----------

# Run the evaluation: ask the bot each query and score its answer with the
# correctness metric.
evaluation_results = []
for query in queries:
    response = chatbot.get_response(query)
    # The metric is driven through `measure(LLMTestCase)` and then read back
    # via `.score` / `.reason`, as in the earlier cells; the previous
    # `correctness_metric.evaluate(response)` handed it a bare string.
    # NOTE(review): there are no curated reference answers for these queries,
    # so the bot's own context passage is supplied as `expected_output`.
    # Replace with real reference answers when they exist.
    query_case = LLMTestCase(
        input=query,
        actual_output=response,
        expected_output=chatbot.context,
    )
    correctness_metric.measure(query_case)
    evaluation = {
        "score": correctness_metric.score,
        "reason": correctness_metric.reason,
    }
    evaluation_results.append((query, response, evaluation))

# Print evaluation results
for query, response, result in evaluation_results:
    print(f"Query: {query}")
    print(f"Response: {response}")
    print(f"Evaluation: {result}\n")
