In [8]:
import os
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import Chroma

def vct_db_filename_gen(file_path):
    # Derive vector DB filename from CSV filename
    base_name = os.path.basename(file_path)
    db_file_name = os.path.splitext(base_name)[0] + ".vecdb"

    return os.path.join(os.path.dirname(file_path), db_file_name)

def check_and_load_vector_db(file_path, embedding):
    """
    Checks if a vector db file exists for the given file_path, 
    loads it if exists, otherwise creates it from the csv and saves it.
    """
    # Derive vector DB filename from CSV filename
    db_file_path = vct_db_filename_gen(file_path)

    # Check if the vector DB file exists
    if os.path.exists(db_file_path):
        print(f"Loading existing vector DB from {db_file_path}")
        db = Chroma(persist_directory=db_file_path, embedding_function=embedding)
    else:
        print(f"Vector DB not found. Creating from {file_path}")
        # Load the CSV and create the vector DB
        loader = CSVLoader(file_path=file_path)
        documents = loader.load()
        # Save the newly created vector DB
        db = Chroma.from_documents(documents, embedding, persist_directory=db_file_path)
        print(f"Saved new vector DB to {db_file_path}")
    
    return db

In [9]:
class ResultsData:
    def __init__(self, chain_type, time=None, tokens_used=None, example_number=None, predicted_query=None, predicted_answer=None, answer=None, result=None):
        self.chain_type = chain_type
        self.eval = []
        if example_number is not None:
            self.append_evaluation(time, tokens_used, example_number, predicted_query, answer, predicted_answer, result)
    
    def append_evaluation(self, time, tokens_used, example_number, predicted_query, answer, predicted_answer, result):
        """Append a new evaluation result to the eval list."""
        self.eval.append({
            "time": time,
            "tokens_used": tokens_used,
            "example_number": example_number,
            "query": predicted_query,
            "predicted_answer": predicted_answer,
            "answer": answer,
            "result": result
        })


In [10]:
import datetime

# account for deprecation of LLM model

def llm_model():
    # Get the current date
    current_date = datetime.datetime.now().date()

    # Define the date after which the model should be set to "gpt-3.5-turbo"
    target_date = datetime.date(2024, 6, 12)

    # Set the model variable based on the current date
    if current_date > target_date:
        return "gpt-3.5-turbo"
    else:
        return "gpt-3.5-turbo-0301"


In [11]:
from langchain.evaluation.qa import QAGenerateChain
from langchain.evaluation.qa import QAEvalChain
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import CSVLoader
from langchain_openai import ChatOpenAI

def langchain_output_parser(qa_output):
    """
    Transforms the QA output from langchain into a dictionary format without the 'qa_pairs' field.
    
    Parameters:
    - qa_output: A list of dictionaries, where each dictionary contains 'qa_pairs' among other possible fields.

    Returns:
    - A list of dictionaries, where each dictionary directly contains 'query' and 'answer' fields.
    """
    parsed_output = []
    for item in qa_output:
        # Assuming each item in qa_output is a dictionary with a 'qa_pairs' key
        qa_pair = item.get('qa_pairs', {})
        # Repackage the qa_pair without the 'qa_pairs' field
        reformatted_item = {
            'query': qa_pair.get('query', ''),
            'answer': qa_pair.get('answer', '')
        }
        parsed_output.append(reformatted_item)
    return parsed_output


def generate_qas(file_path, db, llm, chain_type):
    # Load vector db to index
    loader = CSVLoader(file_path=file_path)
    data = loader.load()
    index = VectorStoreIndexWrapper(vectorstore=db)

    qa = RetrievalQA.from_chain_type(
        llm=llm, 
        chain_type=chain_type, 
        retriever=index.vectorstore.as_retriever(), 
        verbose=True,
        chain_type_kwargs = {
            "document_separator": "<<<<>>>>>"
        }
    ) 

    # LLM-Generated example Q&A pairs 
    example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model()))
    # the warning below can be safely ignored
    raw_examples = example_gen_chain.apply( # create raw examples
        [{"doc": t} for t in data[:5]],
    )

    # Parse the raw examples into required format
    examples = langchain_output_parser(raw_examples)

    # run for manual evaluation
    qa.run(examples[0]["query"])

    return qa, examples

def evaluate(chain_type, qa, examples, llm, results_data):
    # LLM assisted evaluation
    predictions = qa.apply(examples)
    eval_chain = QAEvalChain.from_llm(llm)
    graded_outputs = eval_chain.evaluate(examples, predictions)

    # turn to object and return
    # using llm as real answer and predicted answer are not similar in a string match sense, e.g. look at example_llm_eval.txt
    for i, eg in enumerate(examples):
        
        example_number = i
        query = predictions[i]['query']
        answer = predictions[i]['answer']
        predicted_answer = predictions[i]['result']
        result = graded_outputs[i]['results']
        
        print(f"Example {example_number}:")
        print("Question: " + query)
        print("Real Answer: " + answer)
        print("Predicted Answer: " + predicted_answer)
        print("Predicted Grade: " + result)
        print()

        results_data = add_to_results_list(results_data, chain_type, query, answer=answer, result=result)
    return results_data

def add_to_results_list(results_data, chain_type, query, td=None, tokens_used=None, number=None, response=None, answer=None, result=None):
    found = False
    for item in results_data:
        if item.chain_type == chain_type:
            # Update the existing dictionary
            item.append_evaluation(time=td, tokens_used=tokens_used, example_number=number, 
                         predicted_query=query, answer=response, predicted_answer=answer, result=result)
            found = True
            break

    if not found:
        # Append a new instance of ResultsData if no matching chain_type was found
        results_data.append(ResultsData(chain_type=chain_type, time=td, tokens_used=tokens_used, 
                                        example_number=number, predicted_query=query, 
                                        answer=response, predicted_answer=answer, 
                                        result=result))
        
    return results_data

In [12]:
from langchain.chains import RetrievalQA
from langchain.callbacks import get_openai_callback

def qa_analysis(llm, chain_type, retriever, verbose, query, number, results_data):
    """
    Initializes a QA analysis with a given language model, chain type, and retriever.
    Then, it runs the QA analysis, timing its execution and printing the response along with the execution time.
    """
    # Initialize the RetrievalQA object with the specified parameters.
    qa = RetrievalQA.from_chain_type(
        llm=llm, 
        chain_type=chain_type, 
        retriever=retriever, 
        verbose=verbose
    )

    # Measure number of tokens used
    with get_openai_callback() as cb:
        start = datetime.datetime.now()

        try:
            # Execute the QA analysis
            response = qa.invoke(query) #TODO: i've only added queries, no answers...
        except ValueError as e: 
            response = e

        end = datetime.datetime.now()
    
    tokens_used = cb.total_tokens

    # Calculate the difference between the end and start timestamps to get the execution duration.
    # The duration is converted to milliseconds for a more precise and readable format.
    td = (end - start).total_seconds() * 10**3
    
    print(f"Response: {response}\nThe time of execution of above program is : {td:.03f}ms")

    results_data = add_to_results_list(results_data, chain_type, query, td, tokens_used, number, response)

    print("\n\nTESTING\n:" + '\n'.join([str(item) for item in results_data]))

    return results_data


In [13]:
def results_data_to_markdown_table(results_data_list):
    # Define the header of the markdown table
    headers = ["Chain Type", "Eval Time", "Tokens Used", "Example Number", "Predicted Query", "Predicted Answer", "Answer", "Result"]
    # Create the markdown table header and separator rows
    markdown_table = "| " + " | ".join(headers) + " |\n"
    markdown_table += "| " + " | ".join(["---"] * len(headers)) + " |\n"
    
    # Iterate over each ResultsData instance
    for data in results_data_list:
        # And then iterate over each evaluation within the ResultsData instance
        for eval in data.eval:
            # Construct each row with the appropriate data
            row = [
                data.chain_type,  # Corrected from data.type to data.chain_type
                str(eval.get("time", "")),  # Using .get() for safer access to dictionary keys
                str(eval.get("tokens_used", "")),
                str(eval.get("example_number", "")),
                eval.get("query", ""),
                eval.get("predicted_answer", ""),
                eval.get("answer", ""),
                eval.get("result", "")
            ]
            markdown_table += "| " + " | ".join(row) + " |\n"
    
    return markdown_table

def write_markdown_table_to_file(markdown_table, filename):
    # Write the markdown table to the specified file
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown_table)
    
    print(f"Markdown table successfully written to {filename}")

In [14]:
from dotenv import load_dotenv, find_dotenv
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

def main():
    # Basic Setup
    _ = load_dotenv(find_dotenv()) # read local .env file
    results_data = []
    strategies = ["stuff", "map_reduce", "refine", "map_rerank"]

    # Load data into vector db or use existing one
    file_path = '../data/OutdoorClothingCatalog_1000.csv'
    embedding = OpenAIEmbeddings()  # Define embedding

    # Check if vector DB exists for the CSV, and load or create accordingly
    db = check_and_load_vector_db(file_path, embedding)

    queries = ["Please suggest a shirt with sunblocking", "Please suggest a shirt with sunblocking and tell me why this one", "Please suggest three shirts with sunblocking and tell me why. Give this back to me in markdown code as a table", "Please suggest three shirts with sunblocking and tell me why. Give this back to me in markdown code as a table, with a summary below outlining why sunblocking is important"]

    # Configure LLM for querying
    # layers vector db on llm to inform decisions and responses
    llm = ChatOpenAI(temperature = 0.0, model=llm_model())
    retriever = db.as_retriever()

    # Manual analysis - TODO: add answers
    # for index, query in enumerate(queries, start=1):
    #     results_data = qa_analysis(llm, "stuff", retriever, True, query, index, results_data)
    #     results_data = qa_analysis(llm, "map_reduce", retriever, True, query, index, results_data)
    #     results_data = qa_analysis(llm, "refine", retriever, True, query, index, results_data)
    #     results_data = qa_analysis(llm, "map_rerank", retriever, True, query, index, results_data)

    # LLM QA Gen AND Evaluate
    for strat in strategies:
        # Generate evaluation Q&As
        tuple = generate_qas(file_path, db, llm, strat)
        qa = tuple[0]
        examples = tuple[1]

        # Evaluate 
        results_data = evaluate(strat, qa, examples, llm, results_data)

    # Generate results in markdown
    md_table = results_data_to_markdown_table(results_data)

    # Write results to file
    write_markdown_table_to_file(md_table, "results.md")

if __name__ == '__main__':
    main()

Loading existing vector DB from ../data/OutdoorClothingCatalog_1000.vecdb


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Example 0:
Question: What is the approximate weight of the Women's Campside Oxfords per pair?
Real Answer: The approximate weight of the Women's Campside Oxfords per pair is 1 lb.1 oz.
Predicted Answer: The approximate weight of the Women's Campside Oxfords per pair is 1 lb. 1 oz.
Predicted Grade: CORRECT

Example 1:
Question: What are the dimensions of the small and medium Recycled Waterhog dog mats?
Real Answer: The small Recycled Waterhog dog mat has dimensions of 18" x 28", while

ValidationError: 1 validation error for MapReduceDocumentsChain
document_separator
  extra fields not permitted (type=value_error.extra)