# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debugging)
* LLM-assisted evaluation
* LangChain evaluation platform

In [1]:
# import os

# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv()) # read local .env file

## Create our QandA application

In [2]:
from langchain.chains import RetrievalQA
# from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [3]:
# Load the product catalog CSV. `loader` is reused when building the vector
# index, and `data` holds the documents returned by the loader
# (presumably one document per CSV row — confirm against CSVLoader).
file = 'OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file, encoding='utf-8')
data = loader.load()

In [4]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from config.config import GEMINI_API_KEY
# Embedding model passed to the vector index creator; the API key is read
# from the project-local config module rather than hard-coded here.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GEMINI_API_KEY)




In [None]:
# Build a vector index over the loader's documents, using the in-memory
# DocArray store and the `embeddings` model configured above.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch, embedding=embeddings
).from_loaders([loader])

In [None]:
from llm_util.gemini_llm import Gemini_llm
# Project-local wrapper. `llm` is the object handed to every chain in this
# notebook (presumably a LangChain-compatible Gemini model — confirm in
# llm_util.gemini_llm).
llm_obj: Gemini_llm = Gemini_llm()
llm = llm_obj.get_langchain_llm()

In [7]:
# llm = ChatOpenAI(temperature = 0.0)  # OpenAI alternative, disabled; the `llm` created earlier is used instead
# Question-answering chain: documents come from the vector index's retriever
# and are passed, together with the question, to `llm`.
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    chain_type_kwargs = {
        # Marker placed between retrieved documents (presumably inside the
        # combined prompt — confirm against the installed LangChain version).
        "document_separator": "<<<<>>>>>"
    }
)

### Coming up with test datapoints

In [None]:
# Display one loaded document as source material for a hand-written test question.
data[10]

In [None]:
# Display a second loaded document as source material for a hand-written test question.
data[11]

### Hard-coded examples

In [10]:
# Hand-written evaluation examples: each entry pairs a question about the
# catalog ("query") with its expected reference answer ("answer").
#
# The queries are built from adjacent string literals. A backslash
# continuation *inside* a literal keeps the next line's leading indentation,
# which would embed a long run of spaces in the middle of each question.
examples = [
    {
        "query": (
            "Do the Cozy Comfort Pullover Set "
            "have side pockets?"
        ),
        "answer": "Yes",
    },
    {
        "query": (
            "What collection is the Ultra-Lofty "
            "850 Stretch Down Hooded Jacket from?"
        ),
        "answer": "The DownTek collection",
    },
]

### LLM-Generated examples

* `QAGenerateChain` takes documents and creates a question/answer pair from each document

In [11]:
from langchain.evaluation.qa import QAGenerateChain


In [12]:
# Chain that uses `llm` to generate question/answer pairs from documents.
example_gen_chain = QAGenerateChain.from_llm(llm)

In [None]:
# Preview the first five documents; these are the ones fed to the QA generator.
data[:5]

In [None]:
# Ask the generation chain for a parsed QA example for each of the first
# five catalog documents; every input is wrapped under the "doc" key.
new_examples = example_gen_chain.apply_and_parse(
    [
        {"doc": document}
        for document in data[:5]
    ]
)

In [None]:
# Each generated item keeps its QA dict under 'qa_pairs'
# (e.g. new_examples[0]['qa_pairs']). Unwrap those dicts and drop any item
# whose 'qa_pairs' value is empty.
new_examples = [
    generated['qa_pairs']
    for generated in new_examples
    if generated['qa_pairs']
]
new_examples

In [16]:
# new_examples[1]

In [None]:
# Display the first document (presumably to compare it with the QA pair generated from it).
data[0]

### Combine examples

In [None]:
# Append the LLM-generated examples to the hand-written list, in place.
# NOTE: running this cell again appends the generated examples a second time.
examples.extend(new_examples)
examples

In [None]:
# Run the QA chain on the first example's question as a quick check.
qa.run(examples[0]["query"])

## Manual Evaluation

In [20]:
import langchain
# Switch on LangChain's global debug flag for the next chain run
# (expected to print detailed chain internals — confirm for the installed version).
langchain.debug = True

In [None]:
# Re-run the same question, this time with the debug flag enabled.
qa.run(examples[0]["query"])

In [22]:
# Turn debug mode back off so the remaining chain runs are not affected by it.
langchain.debug = False

#### In this manner, we could run every query and check by hand whether each answer is correct.
#### But that is tedious.
#### Can we use an LLM to help evaluate the generated answers?

## LLM assisted evaluation

In [None]:
# Display the combined example list that is about to be evaluated.
examples

In [None]:
# Run the QA chain over every example. Each prediction is later read through
# its 'query', 'answer' and 'result' keys.
predictions = qa.apply(examples)

In [25]:
from langchain.evaluation.qa import QAEvalChain

In [26]:
# llm = ChatOpenAI(temperature=0)  # OpenAI alternative, disabled; the existing `llm` is reused
# Chain that uses `llm` to grade the predicted answers for the examples.
eval_chain = QAEvalChain.from_llm(llm)

In [None]:
# Grade every prediction against its example; each grade is later read from
# the 'results' key of the corresponding output.
graded_outputs = eval_chain.evaluate(examples, predictions)
graded_outputs

In [None]:
# Print, for every evaluated example, the question, the reference answer,
# the chain's predicted answer and the grade assigned by the evaluation chain.
#
# Each prediction already carries the example's 'query' and 'answer' next to
# the chain's 'result', so predictions are paired directly with their grades
# instead of indexing several lists by position.
for i, (prediction, graded) in enumerate(zip(predictions, graded_outputs)):
    print(f"Example {i}:")
    # Question and reference answer come from the example itself
    # (hand-written or LLM-generated from the documents).
    print("Question: " + prediction['query'])
    print("Real Answer: " + prediction['answer'])
    # Answer produced by the retrieval QA chain.
    print("Predicted Answer: " + prediction['result'])
    # Verdict produced by the LLM-backed evaluation chain.
    print("Predicted Grade: " + graded['results'])
    print()

### End of the example
### Consider the two strings below:
* Real Answer: Yes
* Predicted Answer: Yes, the Cozy Comfort Pullover Set has side pockets. 
### The two strings are not identical, but they mean the same thing.
### An LLM can understand the context and recognize that the predicted answer is correct, which an exact string comparison would miss.
### Without knowing the context, it is difficult to tell whether an answer is correct.