## Task 0: All the Prereq Code (Midterm Tasks and Deliverables Below)

In [None]:
# Libs and Dependencies
!pip install -qU langchain langchain_openai langchain-community langgraph langchain-qdrant ragas qdrant-client pymupdf langchain-huggingface langchain_core==0.2.38 langchain_experimental

In [2]:
# API Keys
# Secrets are read interactively with getpass so they never appear in the notebook source.
import os
import getpass
from uuid import uuid4  # not used in this cell

# Key used by the OpenAI-backed models and embeddings in later cells
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# LangSmith tracing configuration (tracing flag, project name, API key)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "AIE4 - Midterm Testing"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangSmith API Key: ")

OpenAI API Key:··········
LangSmith API Key: ··········


In [3]:
# Set up async
# nest_asyncio patches asyncio so that event loops can be nested; the notebook
# kernel already runs a loop, and later cells call libraries that use asyncio.
import nest_asyncio
nest_asyncio.apply()

In [4]:
# Importing my util functions and constants
# On Colab, mount Google Drive and switch to the notebook folder so that
# constants.py and util.py are importable. Outside Colab the google.colab import
# fails, so the mount is skipped and both modules are expected to be importable
# from the current working directory.
import os

try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab

if drive is not None:
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/Colab Notebooks')

import constants
from util import generateDataset, generateChain, generateQdrantRetriever

Mounted at /content/drive


In [5]:
# Document Loading
from langchain_community.document_loaders import PyMuPDFLoader

# Collects the Document objects loaded from every PDF listed in constants.PDF_URLS
all_documents = []

for url in constants.PDF_URLS:
    # One loader per PDF URL; load() returns the documents parsed from that file
    loader = PyMuPDFLoader(file_path=url)
    documents = loader.load()
    all_documents += documents

In [6]:
# Checking docs loaded
# Sanity check: metadata of the first document, then the total document count.
print(all_documents[0].metadata, len(all_documents), sep="\n")

{'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 0, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': ''}
137


In [7]:
# Setting up chunking methods
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Default chunker: recursive splitting into chunks of at most 600 characters
# (length is measured with len) with a 60-character overlap between chunks.
recursiveChunker = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=60,
    length_function=len,
)

# Alternative chunker: semantic splitting driven by text-embedding-3-small
# embeddings, using the "percentile" breakpoint threshold type.
semanticChunker = SemanticChunker(
    OpenAIEmbeddings(model="text-embedding-3-small"),
    breakpoint_threshold_type="percentile",
)

In [8]:
# Creating the sets of chunked documents
# Both chunkers split the same loaded documents. The semantic chunker was built
# with an OpenAI embedding model, so it needs the OpenAI API key set above.
recursive_split_docs = recursiveChunker.split_documents(all_documents)
semantic_split_docs = semanticChunker.split_documents(all_documents)

In [9]:
# Checking split lengths
# Number of chunks produced by each chunking strategy.
print(f"recursive split docs length: {len(recursive_split_docs)}")
print(f"semantic split docs length: {len(semantic_split_docs)}")

recursive split docs length: 764
semantic split docs length: 322


In [134]:
# Creating embedders for use with the vector stores
from langchain_huggingface import HuggingFaceEmbeddings

# Starting with an untrained all-MiniLM-L6-v2
untrained_model = "sentence-transformers/all-MiniLM-L6-v2"
untrained_embeddings = HuggingFaceEmbeddings(model_name=untrained_model)

# Grabbing my trained version of all-MiniLM-L6-v2 for now; actual code for training and pushing to HF is below
# NOTE: this cell therefore depends on the fine-tuned model already being published
# under this Hugging Face repo id, even though the training code appears later.
trained_model = "pattonma/AIE4_midterm_tuned_embeddings"
trained_embeddings = HuggingFaceEmbeddings(model_name=trained_model)



In [135]:
# creating the 4 different retrievers
# One retriever per (chunking strategy, embedding model) pair. Each gets its own
# nameExt suffix (how util uses it is not visible here; presumably it names the
# vector collection - confirm in util.generateQdrantRetriever).

recursive_untrained_retriever = generateQdrantRetriever(
    documents=recursive_split_docs,
    embeddingModel=untrained_embeddings,
    nameExt="rec_un",
)
recursive_trained_retriever = generateQdrantRetriever(
    documents=recursive_split_docs,
    embeddingModel=trained_embeddings,
    nameExt="rec_tr",
)
semantic_untrained_retriever = generateQdrantRetriever(
    documents=semantic_split_docs,
    embeddingModel=untrained_embeddings,
    nameExt="sem_un",
)
semantic_trained_retriever = generateQdrantRetriever(
    documents=semantic_split_docs,
    embeddingModel=trained_embeddings,
    nameExt="sem_tr",
)

In [136]:
# Creating the 4 different chains
# Each retriever is wrapped in its own RAG chain by the shared util helper.

recursive_untrained_chain = generateChain(retriever=recursive_untrained_retriever)
recursive_trained_chain = generateChain(retriever=recursive_trained_retriever)
semantic_untrained_chain = generateChain(retriever=semantic_untrained_retriever)
semantic_trained_chain = generateChain(retriever=semantic_trained_retriever)

In [137]:
# Quickly testing the chains
# Smoke test: ask every chain the same question and print each answer behind a
# short tag (ru/rt/su/st = recursive|semantic + untrained|trained).

smoke_question = "What is the AI Bill of Rights?"
for tag, chain in (
    ("ru :", recursive_untrained_chain),
    ("rt :", recursive_trained_chain),
    ("su :", semantic_untrained_chain),
    ("st :", semantic_trained_chain),
):
    response = chain.invoke({"question" : smoke_question})
    print(tag + response["response"].content)

ru :The AI Bill of Rights is a set of five principles and associated practices designed to guide the design, use, and deployment of automated systems in order to protect the rights of the American public in the age of artificial intelligence. It aims to align automated systems with democratic values and protect civil rights, civil liberties, and privacy.
rt :The AI Bill of Rights is a set of five principles and associated practices designed to guide the design, use, and deployment of automated systems to protect the rights of the American public in the age of artificial intelligence. It aims to align automated systems with democratic values and safeguard civil rights, civil liberties, and privacy.
su :The AI Bill of Rights is a set of five principles and associated practices designed to guide the design, use, and deployment of automated systems to protect the rights of the American public in the age of artificial intelligence. It aims to align automated systems with democratic values a

## Task 1: Dealing with the Data

The default chunking strategy I will use for the RAG prototype will be recursive text splitting. This is a method that is actually recommended by LangChain themselves as a quick and simple way to start splitting documents, which is exactly what I'm after for this first quick RAG prototype. Mechanically, this strategy uses a list of separators and applies a series of splits in order through that list, resulting in a coarse-to-fine-grained splitting approach. If a chunk that it returns from a coarser split is too large, it recursively splits it using a "finer-grained" separator. This strategy results in more logically coherent and semantically meaningful chunks than a simpler strategy like fixed-size chunking, which is useful during our retrieval process later. This approach is also fairly tunable, allowing me to pick a chunk size and overlap very easily.

Associated code:
```python
recursiveChunker = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap = 60,
    length_function = len,
)
recursive_split_docs = recursiveChunker.split_documents(all_documents)
```

My second approach to chunking will be semantic chunking. This is a more advanced chunking strategy that splits chunks based on their semantic similarity. If embeddings of the document text are found to be sufficiently far apart, they are split into separate chunks. The goal of semantic chunking is to preserve as much coherence as possible in the individual chunks. This is also helpful because it is less reliant on the documents themselves being simply large blocks of text than the recursive  splitter. In this particular case, this is very helpful for the NIST paper, which has large sections of text in a tabular format, which may affect the recursive splitter. The downside of semantic chunking is that it is much more computationally intensive than recursive text-splitting, requiring an embedding model to be used during the chunking process itself. However, it has the possibility to be a much better chunking strategy for a production-level application.

Associated code:
```python
semanticChunker = SemanticChunker(
    OpenAIEmbeddings(model="text-embedding-3-small"),
    breakpoint_threshold_type="percentile"
)
semantic_split_docs = semanticChunker.split_documents(all_documents)
```

Why I have chosen these two specific chunking strategies essentially comes down to ease of implementation for this prototype. The recursive text splitting is very quick and simple to implement. It should answer any of the simple and moderately difficult questions that the stakeholders may ask it. For the simple proof-of-concept that will be shown to the SVP, this should suffice. The second strategy of semantic chunking was chosen because of the potential increase in chunk quality over the simpler recursive strategy. It should result in a much better set of chunks for the retrievers to work with. (Also, as Mark and not the AI Solutions Engineer, I just want to try it!) This should allow the production-level RAG application to answer much more detailed and difficult questions from stakeholders.

## Task 2: Building a Quick End-to-End Prototype

Full code and prototype application hosted separately at [this HuggingFace repo](https://huggingface.co/spaces/pattonma/AIE4_Midterm_Prototype_RAG).

Loom video showing demo of the prototype found [here](https://www.loom.com/share/4776ab3cd810434ba787c7b1b05998a1).

The prototype, at a high level, is a Langchain RAG application utilizing chainlit, which is then dockerized and hosted on a Hugging Face space for ease of access. Here's a detailed breakdown of the application stack, and why each piece was chosen:

1. **Application Framework:**

    I chose Chainlit to manage the chatbot interface and real-time messaging. The reason I picked Chainlit over something like Streamlit is that Chainlit is purpose-built for building chat-based LLM apps. I'm also more familiar with it as a tool than I am alternatives.

2. **Document Processing:**

    Document Loader (`PyMuPDFLoader`): I use `PyMuPDFLoader` from LangChain’s community module to load and parse PDFs. I selected it because it efficiently handles PDF loading and text extraction, and it's well-integrated with LangChain. While alternatives like `pdfplumber` or `PyPDF2` exist, once again I am more familiar with PyMuPDFLoader.

    Text Splitting (`RecursiveCharacterTextSplitter`): After loading the documents, I split the text into chunks of 600 characters using the RecursiveCharacterTextSplitter. This choice was made over something like a simple `CharacterTextSplitter` because `RecursiveCharacterTextSplitter` is much better at handling sentences, as chunks are broken at meaningful points (e.g., sentence boundaries) rather than strict character counts, which improves the relevance of the retrieved context for the RAG pipeline.

3. **Vector Store and Embedding Model:**

    Qdrant Vector Store (`QdrantVectorStore`): I decided to use Qdrant as my vector store. While I am personally more familiar with Pinecone, I chose Qdrant for its seamless integration with LangChain and its built-in support for features like the ability to store vectors in memory. Since this app is hosted on Hugging Face Spaces, I opted for an in-memory store `(:memory:)` to minimize resource use.

    Embeddings Model (`all-MiniLM-L6-v2`): For generating document embeddings, I used the sentence-transformers/all-MiniLM-L6-v2 model from Hugging Face. I chose this model over something like OpenAI's embedding models due to its lower cost (it's open-source), its efficiency, and the fact that it performs well for a wide range of tasks. It strikes a balance between speed and accuracy, which makes it suitable for real-time applications like this one. And because it's open-source, I have the ability to fine-tune it for our specific application.

4. **Large Language Model (LLM):**

    LLM (ChatOpenAI): I use the `GPT-4o-mini` model as my LLM for generating responses. I opted for 4o-mini because it is a competent and cheap LLM. This application does not need a sophisticated LLM to analyze retrieved context and answer the questions that stakeholders may ask it. Also, OpenAI models are well integrated into the Langchain ecosystem.

5. **Retrieval-Augmented Generation (RAG) Pipeline:**

    Langchain RAG Chain: The `RetrievalAugmentedQAPipeline` is constructed using LangChain’s built-in constructs for combining document retrieval and answer generation. Langchain allows easy customization and combination of different modules, tailored for our specific application. The pipeline first retrieves relevant document chunks from the Qdrant vector store based on the user's question. I opted for LangChain's built-in chain components because they allow for a flexible, modular design while abstracting away many complexities, such as managing how retrieved documents are passed along the chain.

6. **Hugging Face Integration:**

    Hosted on Hugging Face Spaces: I chose Hugging Face Spaces for deployment because it's an ideal platform for hosting NLP applications with minimal setup. Spaces provide a pre-built environment for running applications with GPU support, which can speed up LLM inference. Hosting on Hugging Face also allows the stakeholders to access the application from the publicly available web, rather than us hosting the application on our private local network, as who knows where in the world the stakeholders are.

7. **Dockerization (Hosting on Hugging Face Spaces):**

    I dockerized the application to ensure a consistent runtime environment. I chose Docker because it provides containerization, which guarantees that the application runs the same way in any environment. This also makes it easy to handle dependencies, especially when deploying the app to Hugging Face Spaces, which support Docker out of the box.


## Task 3: Creating a Golden Test Data Set

In [14]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Setting up the LLMs and embedding model for our SDG
# (SDG = synthetic data generation; OpenAIEmbeddings() is created with its default model)
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

# Creating the generator
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Setting our question type distribution. I'm expecting a healthy mix of questions from the stakeholders, but nothing too in depth.
# The three weights sum to 1.0.
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

In [None]:
# Generating the test data set from all of our available documents. I chose to generate 50 pieces of test data.
# Positional arguments: source documents, number of test items (50), question-type distribution.
# This step calls the generator and critic LLMs configured above.
testset = generator.generate_with_langchain_docs(all_documents, 50, distributions, with_debugging_logs=False)

In [16]:
# Converting to a pandas dataframe for easy manipulation later
# `testset` is rebound from the RAGAS test set object to a DataFrame. The guard
# keeps the cell safe to re-run: once converted, the object no longer has
# to_pandas(), and an unguarded second call would raise AttributeError.
if hasattr(testset, "to_pandas"):
    testset = testset.to_pandas()

In [17]:
from langsmith import Client

# LangSmith client and dataset that will house the synthetic test data
client = Client()

dataset_name = "AIE4 Midterm Questions"

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Questions AIE4's midterm pdfs"
)

# One LangSmith example per generated row: the question is the input and the
# ground truth is the expected answer.
# NOTE(review): the "context" metadata receives the DataFrame row index, not the
# context text - confirm this is intended.
for row_index, row in testset.iterrows():
    client.create_example(
        inputs={"question": row["question"]},
        outputs={"answer": row["ground_truth"]},
        metadata={"context": row_index},
        dataset_id=dataset.id,
    )

In [18]:
# Splitting out the questions and the ground truths for use in creating the evaluation datasets
# Plain Python lists, in the same row order as the test set DataFrame.
test_questions = testset["question"].tolist()
test_groundtruths = testset["ground_truth"].tolist()

In [138]:
# With our (default chunker and untrained embedding) chain, generate the eval dataset
# The chain is run over the synthetic test questions; ground truths are passed alongside for evaluation.
recursive_untrained_dataset = generateDataset(chain=recursive_untrained_chain,test_questions=test_questions, test_groundtruths=test_groundtruths)

In [139]:
from ragas import evaluate

# Run a RAGAS evaluation on the generated dataset
# The metric list comes from constants.METRICS; the results below report
# faithfulness, answer relevancy, context recall and context precision.
rc_ue_results = evaluate(recursive_untrained_dataset, constants.METRICS)

Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

In [140]:
# Display the results
# (rc_ue = recursive chunker + untrained embeddings)
rc_ue_results

{'faithfulness': 0.8371, 'answer_relevancy': 0.8621, 'context_recall': 0.7498, 'context_precision': 0.8878}

After evaluating my default RAG chain, using the default chunker (`RecursiveCharacterTextSplitter`) and the untrained `all-MiniLM-L6-v2` model, I have found it results in scores for the following metrics:
| Metric | Score |
| :------- | ------:|
| Faithfulness | 0.8371 |
| Answer Relevancy | 0.8621 |
| Context Recall | 0.7498 |
| Context Precision | 0.8878 |

Based on those scores, I can draw the following conclusions:
1. **Strong Relevance and Precision**: The pipeline is generally effective in generating relevant answers (with an answer relevancy score of 0.8621) and retrieving relevant document chunks (context precision of 0.8878). This suggests that our model’s retrieval and generation components are functioning well, but not perfectly optimized.

2. **Recall Gap**: The relatively lower context recall (0.7498) compared to the other metrics indicates that the retrieval process is missing some relevant information. This suggests that our default chunking strategy and untrained embedding model are not fully optimized, and some important context might be lost or not retrieved.

3. **Room for Improvement in Faithfulness and Recall**:
    - The faithfulness score of 0.8371 indicates that while answers are generally grounded in the retrieved context, there may still be hallucinations or deviations from the content: on average, roughly 16% of the claims made in a generated answer are not supported by the retrieved documents.
    - A context recall of 0.7498 suggests that improving the retrieval step (through better embeddings or chunking) could enhance the pipeline’s ability to pull more relevant information, which could also boost faithfulness and answer relevancy.

## Task 4: Fine-Tuning Open-Source Embeddings

In [34]:
import uuid

# setting up the text splitter for generating the tuning data set (different from the 'prod' splitter to avoid data leakage)
tuning_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)

training_documents = tuning_splitter.split_documents(all_documents)

# Tag every chunk with a unique string id. The ids later become dictionary keys
# in the JSON datasets, so they must always be str - including on the collision
# retry path, which previously assigned a raw UUID object.
id_set = set()

for document in training_documents:
  chunk_id = str(uuid.uuid4())
  while chunk_id in id_set:
    chunk_id = str(uuid.uuid4())
  id_set.add(chunk_id)
  document.metadata["id"] = chunk_id

# splitting training docs into the different training/validation/test sets
# (first 300 chunks train, next 50 validation, next 50 test; any chunks past index 400 are unused)
training_split_documents = training_documents[:300]
val_split_documents = training_documents[300:350]
test_split_documents = training_documents[350:400]

In [35]:
from langchain_core.prompts import ChatPromptTemplate

# generic question generation prompt
# Template variables: {n_questions} (how many questions to produce) and
# {context} (the chunk text the questions must be based on). The model is asked
# to answer as a numbered list.
qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [36]:
# creating the chain to generate the questions
# Pipes the prompt template into critic_llm (the gpt-4o-mini model defined in Task 3).
question_generation_chain = qa_prompt_template | critic_llm

In [38]:
from util import create_questions

# Creating the training questions and contexts from the training documents
# Two questions are requested per training chunk; the helper returns the
# questions and the mapping of each question to its relevant context(s).
training_questions, training_relevant_contexts = create_questions(documents=training_split_documents,n_questions=2,chain=question_generation_chain)

post: trace=36148058-a0d7-4193-ab78-668d4138c4ec,id=36148058-a0d7-4193-ab78-668d4138c4ec; trace=36148058-a0d7-4193-ab78-668d4138c4ec,id=696aa072-859e-4547-9eb8-1104588a6a61; trace=36148058-a0d7-4193-ab78-668d4138c4ec,id=e953962f-d94c-42ff-8823-189da499f94a
post: trace=ff657f2f-583a-49b8-b898-b76826d6b84c,id=ff657f2f-583a-49b8-b898-b76826d6b84c; trace=ff657f2f-583a-49b8-b898-b76826d6b84c,id=ccd8bd29-56ce-41a0-b38e-da80a42848c9; trace=ff657f2f-583a-49b8-b898-b76826d6b84c,id=b7f0748e-afc0-4dbf-a6ce-6ed66ec4fa76; trace=a217e4fa-4ae7-4f85-b653-06992e3ecdca,id=a217e4fa-4ae7-4f85-b653-06992e3ecdca; trace=a217e4fa-4ae7-4f85-b653-06992e3ecdca,id=f5f30570-80fd-41ab-8ac1-2f6ceebb74a6; trace=a217e4fa-4ae7-4f85-b653-06992e3ecdca,id=3810104d-2e34-4795-9cce-b53595caa131; trace=79948fbf-020c-4ddc-afae-7dd8cfa85e61,id=79948fbf-020c-4ddc-afae-7dd8cfa85e61; trace=79948fbf-020c-4ddc-afae-7dd8cfa85e61,id=a9a9bcd2-ee47-4990-99cd-02395f53ae20; trace=79948fbf-020c-4ddc-afae-7dd8cfa85e61,id=85c9993b-93a2-402e-

In [39]:
# Creating the validation questions and contexts from the validation documents
# Same procedure as for the training split: two questions per validation chunk.
val_questions, val_relevant_contexts = create_questions(documents=val_split_documents,n_questions=2,chain=question_generation_chain)

post: trace=96ee01bb-44b2-4e19-83ce-118245670471,id=b26232d9-1eee-492a-a823-e7f90aaf5ff8; trace=5718486c-6956-40fa-b0b4-ac6353f594b9,id=5718486c-6956-40fa-b0b4-ac6353f594b9; trace=5718486c-6956-40fa-b0b4-ac6353f594b9,id=8c329727-5e25-475d-868e-27828bc64be4; trace=5718486c-6956-40fa-b0b4-ac6353f594b9,id=387afbd3-e1f1-4b3a-81d9-6febbc56e79c; trace=5c72dc2a-51e7-4fbe-82bd-c19db31c0aaa,id=5c72dc2a-51e7-4fbe-82bd-c19db31c0aaa; trace=5c72dc2a-51e7-4fbe-82bd-c19db31c0aaa,id=daccabff-f986-49b9-b80e-2890fba68a72; trace=5c72dc2a-51e7-4fbe-82bd-c19db31c0aaa,id=a1bb0515-2ca7-440d-a594-5f430a306dfe; trace=a982338f-4753-4330-a658-1bc2aec0169e,id=a982338f-4753-4330-a658-1bc2aec0169e; trace=a982338f-4753-4330-a658-1bc2aec0169e,id=24db2d63-558e-49f0-a4df-6a641b68799a; trace=a982338f-4753-4330-a658-1bc2aec0169e,id=b1f6d1f8-d354-4f01-89d1-534c3ac9c345; trace=d0ef81ea-e194-41f9-9c06-aca3effbddc2,id=d0ef81ea-e194-41f9-9c06-aca3effbddc2; trace=d0ef81ea-e194-41f9-9c06-aca3effbddc2,id=e42dae25-e0fe-4706-b79b-

In [40]:
# Creating the test questions and contexts from the test documents
# NOTE: this rebinds `test_questions`, which earlier held the list of RAGAS test
# set questions from Task 3. Cells that need that list must re-derive it from
# `testset` before calling generateDataset.
test_questions, test_relevant_contexts = create_questions(documents=test_split_documents,n_questions=2,chain=question_generation_chain)

Processing documents: 100%|██████████| 50/50 [00:50<00:00,  1.00s/it]


In [41]:
import json

# transforming the training questions/context into a usable format for training
# corpus: chunk id -> chunk text for every training chunk
training_corpus = dict(
    (train_item.metadata["id"], train_item.page_content)
    for train_item in training_split_documents
)

train_dataset = dict(
    questions=training_questions,
    relevant_contexts=training_relevant_contexts,
    corpus=training_corpus,
)

# Written as a single JSON object via json.dump (despite the .jsonl extension)
with open("training_dataset.jsonl", "w") as f:
    json.dump(train_dataset, f)

In [42]:
# transforming the validation questions/context into a usable format for training
# corpus: chunk id -> chunk text for every validation chunk
val_corpus = dict(
    (val_item.metadata["id"], val_item.page_content)
    for val_item in val_split_documents
)

val_dataset = dict(
    questions=val_questions,
    relevant_contexts=val_relevant_contexts,
    corpus=val_corpus,
)

# Written as a single JSON object via json.dump (despite the .jsonl extension)
with open("val_dataset.jsonl", "w") as f:
    json.dump(val_dataset, f)

In [43]:
# transforming the test questions/context into the same format as the training and validation sets
# corpus: chunk id -> chunk text for every test chunk
test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}
# This dict used to be bound to the misleading name `train_corpus`; the old name
# is kept as an alias so that any cell still referring to it keeps working.
train_corpus = test_corpus

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

# Written as a single JSON object via json.dump (despite the .jsonl extension)
with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [103]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer

# Setting up our model for fine tuning (needs to be SentenceTransformer rather than the HuggingFaceEmbeddings)
# Same base checkpoint as the untrained embedder used for the baseline retriever.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)

# Number of (question, context) pairs per training batch
BATCH_SIZE = 12



In [104]:
# Pull the three parts of the training dataset into short names
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

# Creating the training data loader
# Every question is paired with the text of the first context listed as relevant for it.
examples = [
    InputExample(texts=[query, corpus[relevant_docs[query_id][0]]])
    for query_id, query in queries.items()
]

loader = DataLoader(examples, batch_size=BATCH_SIZE)

In [109]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss, ContrastiveLoss, CoSENTLoss

# Creating our loss functions
# MatryoshkaLoss wraps the inner loss and is configured with several embedding
# sizes, each with its own weight. 384 is the full all-MiniLM-L6-v2 embedding size;
# the smallest size (64) is down-weighted to 0.5.
matryoshka_dimensions = [384, 256, 128, 64]
matryoshka_weights = [1, 1, 1, 0.5]
inner_train_loss = MultipleNegativesRankingLoss(model)
# Alternative inner loss (disabled): inner_train_loss = CoSENTLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions, matryoshka_weights=matryoshka_weights
)

In [110]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Creating our validator
# NOTE: corpus / queries / relevant_docs are rebound here to the validation data;
# they previously held the training data used to build the loader.
corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [96]:
# Number of training epochs; also used to size the warmup schedule in the fit cell
EPOCHS = 10

In [111]:
# Warm up over the first 10% of all training steps (batches per epoch * epochs)
warmup_steps = int(len(loader) * EPOCHS * 0.1)

# Actually fitting the model
# The validation evaluator is run every 50 steps, and the fine-tuned model is
# written to the local 'finetuned_all-MiniLM-L6-v2' directory.
model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_all-MiniLM-L6-v2',
    optimizer_params={'lr': 2e-5},
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100
50,No log,No log,0.81,0.9,0.95,0.98,0.81,0.3,0.19,0.098,0.81,0.9,0.95,0.98,0.895311,0.868012,0.868647,0.81,0.9,0.95,0.98,0.81,0.3,0.19,0.098,0.81,0.9,0.95,0.98,0.895311,0.868012,0.868647
100,No log,No log,0.81,0.9,0.94,0.98,0.81,0.3,0.188,0.098,0.81,0.9,0.94,0.98,0.895616,0.868524,0.869077,0.81,0.9,0.94,0.98,0.81,0.3,0.188,0.098,0.81,0.9,0.94,0.98,0.895616,0.868524,0.869077
150,No log,No log,0.82,0.88,0.95,0.98,0.82,0.293333,0.19,0.098,0.82,0.88,0.95,0.98,0.893537,0.866373,0.866901,0.82,0.88,0.95,0.98,0.82,0.293333,0.19,0.098,0.82,0.88,0.95,0.98,0.893537,0.866373,0.866901
200,No log,No log,0.8,0.88,0.94,0.98,0.8,0.293333,0.188,0.098,0.8,0.88,0.94,0.98,0.883252,0.853028,0.853569,0.8,0.88,0.94,0.98,0.8,0.293333,0.188,0.098,0.8,0.88,0.94,0.98,0.883252,0.853028,0.853569
250,No log,No log,0.82,0.88,0.92,0.98,0.82,0.293333,0.184,0.098,0.82,0.88,0.92,0.98,0.890891,0.863528,0.864068,0.82,0.88,0.92,0.98,0.82,0.293333,0.184,0.098,0.82,0.88,0.92,0.98,0.890891,0.863528,0.864068
300,No log,No log,0.82,0.88,0.91,0.97,0.82,0.293333,0.182,0.097,0.82,0.88,0.91,0.97,0.888367,0.863206,0.864665,0.82,0.88,0.91,0.97,0.82,0.293333,0.182,0.097,0.82,0.88,0.91,0.97,0.888367,0.863206,0.864665
350,No log,No log,0.8,0.9,0.92,0.98,0.8,0.3,0.184,0.098,0.8,0.9,0.92,0.98,0.886092,0.856806,0.857378,0.8,0.9,0.92,0.98,0.8,0.3,0.184,0.098,0.8,0.9,0.92,0.98,0.886092,0.856806,0.857378
400,No log,No log,0.81,0.9,0.92,0.98,0.81,0.3,0.184,0.098,0.81,0.9,0.92,0.98,0.88929,0.861317,0.861895,0.81,0.9,0.92,0.98,0.81,0.3,0.184,0.098,0.81,0.9,0.92,0.98,0.88929,0.861317,0.861895
450,No log,No log,0.82,0.9,0.92,0.97,0.82,0.3,0.184,0.097,0.82,0.9,0.92,0.97,0.89009,0.865317,0.866799,0.82,0.9,0.92,0.97,0.82,0.3,0.184,0.097,0.82,0.9,0.92,0.97,0.89009,0.865317,0.866799
500,0.241300,No log,0.82,0.9,0.92,0.97,0.82,0.3,0.184,0.097,0.82,0.9,0.92,0.97,0.89009,0.865317,0.866804,0.82,0.9,0.92,0.97,0.82,0.3,0.184,0.097,0.82,0.9,0.92,0.97,0.89009,0.865317,0.866804


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [112]:
# Grabbing our fit model into a HuggingFaceEmbeddings embedder
# Loaded from the local output directory written by model.fit (not from the Hub).
trained_embeddings_temp = HuggingFaceEmbeddings(model_name="finetuned_all-MiniLM-L6-v2")

# Creating a new Qdrant vs using the new embedder
# Uses the same recursively split chunks as the baseline retriever, so only the embedding model differs.
recursive_trained_retriever_temp = generateQdrantRetriever(documents=recursive_split_docs, embeddingModel=trained_embeddings_temp, nameExt="temp")

In [113]:
# Creating a new chain from that retriever
# Built with the same util helper as the four main chains.
recursive_trained_chain_temp = generateChain(retriever=recursive_trained_retriever_temp)

In [77]:
# Testing the newly trained embeddings with our chain
# Same smoke-test question as used for the four main chains; the answer text is the cell output.
response = recursive_trained_chain_temp.invoke({"question" : "What is the AI Bill of Rights?"})
response["response"].content

'The AI Bill of Rights is a set of principles and associated practices aimed at guiding the design, use, and deployment of automated systems to protect the rights of the American public in the age of artificial intelligence. It envisions a future where the public is protected from potential harms and can fully enjoy the benefits of automated systems.'

In [114]:
# Re-derive the RAGAS questions and ground truths from the test set: `test_questions`
# was rebound to the fine-tuning test questions when the tuning data was generated.
test_questions = testset["question"].values.tolist()
test_groundtruths = testset["ground_truth"].values.tolist()

# Generating a new evaluation dataset with the new chain using the newly trained embeddings
temp_trained_data = generateDataset(chain=recursive_trained_chain_temp,test_questions=test_questions, test_groundtruths=test_groundtruths)

In [115]:
# Evaluate the new chain
# Same RAGAS metric list (constants.METRICS) as the baseline evaluation, so the scores are comparable.
temp_embed_results = evaluate(temp_trained_data, constants.METRICS)

Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]



In [116]:
# Print results of the newly trained embeddings' chain's evaluation
# (recursive chunker + locally fine-tuned embeddings)
temp_embed_results

{'faithfulness': 0.8910, 'answer_relevancy': 0.9064, 'context_recall': 0.7613, 'context_precision': 0.9056}

In [62]:
# Print the existing baseline for reference
# NOTE(review): rc_te_results is not defined in any cell above this one, so this
# cell raises NameError on Restart & Run All. It presumably holds the results for
# the recursive chunker + trained embeddings from a later cell - confirm and reorder.
rc_te_results

{'faithfulness': 0.8675, 'answer_relevancy': 0.8658, 'context_recall': 0.7385, 'context_precision': 0.9050}

In [117]:
# Save the model locally
# The directory name matches the Hugging Face repo name used when pushing the model.
model.save('./AIE4_midterm_tuned_embeddings')

In [118]:
from huggingface_hub import notebook_login

# When in the notebook, log into Hugging Face
# Renders an interactive login widget; logging in is needed before pushing the model.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Reload the embeddings that were saved to disk above and publish them to the Hugging Face Hub.
# The unused `from huggingface_hub import Repository` was dropped: the class is deprecated,
# the import fails on huggingface_hub releases that removed it, and push_to_hub does not need it.
# Assumes SentenceTransformer was imported in an earlier cell and notebook_login() has been run.
model = SentenceTransformer('./AIE4_midterm_tuned_embeddings')
model.push_to_hub('pattonma/AIE4_midterm_tuned_embeddings')

A link to my trained embeddings can be found [on Hugging Face](https://huggingface.co/pattonma/AIE4_midterm_tuned_embeddings).

I chose to use the model `sentence-transformers/all-MiniLM-L6-v2` because it is an open-source embedding model and it is widely downloaded off of Hugging Face, indicating to me that it is a generally well-performing and easy-to-train model. It's also intended for shorter paragraphs and sentences, which coincides well with the documents we're concerned with at the moment. Neither of the documents has particularly long, uninterrupted strings of text. Being tuned for sentences also makes it very good with semantic searches, which are highly relevant for Q+A tasks.

It's also a fairly small and compact model, with only ~23M parameters. This helps it strike a good balance between accuracy and speed, making it well-suited for our application, which only makes use of a few documents. Also, being small, it's quick and easy to tune on even modest hardware (no A100 required!), which is perfect for getting our prototype up and running quickly. Like the parameter count, the embedding vectors themselves are also fairly small, at only 384 dimensions. This means they take less storage in our vector store (Qdrant), which is good because we're just running Qdrant's storage in memory, and it keeps searches quick (because fewer dimensions per vector means quicker similarity computations). Overall, it's not powerful, but it gets the job done.

## Task 5: Assessing Performance

In [141]:
# Build evaluation datasets for the three remaining chunking/embedding combinations,
# to compare against our base Recursive+Untrained chain. Every chain is fed the same
# questions and ground truths so the scores are comparable.
shared_eval_inputs = {
    "test_questions": test_questions,
    "test_groundtruths": test_groundtruths,
}

recursive_trained_dataset = generateDataset(chain=recursive_trained_chain, **shared_eval_inputs)
semantic_untrained_dataset = generateDataset(chain=semantic_untrained_chain, **shared_eval_inputs)
semantic_trained_dataset = generateDataset(chain=semantic_trained_chain, **shared_eval_inputs)

In [142]:
# Evaluate the original Recursive Chunking strategy paired with the new trained embeddings,
# scoring against the metrics listed in constants.METRICS
rc_te_results = evaluate(recursive_trained_dataset, constants.METRICS)

Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]



In [143]:
# Evaluate the new Semantic Chunking strategy paired with the original untrained embeddings,
# scoring against the metrics listed in constants.METRICS
sc_ue_results = evaluate(semantic_untrained_dataset, constants.METRICS)

Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]



In [144]:
# Evaluate the new Semantic Chunking strategy paired with the new trained embeddings,
# scoring against the metrics listed in constants.METRICS
sc_te_results = evaluate(semantic_trained_dataset, constants.METRICS)

Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]



In [146]:
import pandas as pd


def _scores_frame(results, label):
    """Turn one evaluation result into a two-column frame: 'Metric' and `label`."""
    return pd.DataFrame(list(results.items()), columns=['Metric', label])


# One frame per chunking/embedding combination
df_ru = _scores_frame(rc_ue_results, 'Recursive+Untrained')
df_rt = _scores_frame(rc_te_results, 'Recursive+Trained')
df_su = _scores_frame(sc_ue_results, 'Semantic+Untrained')
df_st = _scores_frame(sc_te_results, 'Semantic+Trained')

# Join the four frames on the metric name so each combination becomes its own column
df_merged = (
    df_ru
    .merge(df_rt, on='Metric')
    .merge(df_su, on='Metric')
    .merge(df_st, on='Metric')
)

df_merged

Unnamed: 0,Metric,Recursive+Untrained,Recursive+Trained,Semantic+Untrained,Semantic+Trained
0,faithfulness,0.837098,0.83023,0.90054,0.894261
1,answer_relevancy,0.862088,0.868417,0.848056,0.909716
2,context_recall,0.749788,0.739153,0.887333,0.896095
3,context_precision,0.887778,0.899444,0.901111,0.905556


Testing all combinations of the Chunking Strategies (Recursive, Semantic) and Embedding Models (Untrained, Trained):

| Metric| Recursive+Untrained | Recursive+Trained | Semantic+Untrained | Semantic+Trained |
|:--|:-:|:-:|:-:|:-:|
| faithfulness | 0.837098 | 0.830230 | 0.900540 | 0.894261 |
| answer_relevancy | 0.862088 | 0.868417 | 0.848056 | 0.909716 |
| context_recall | 0.749788 | 0.739153 | 0.887333 | 0.896095 |
| context_precision | 0.887778 | 0.899444 | 0.901111 | 0.905556 |

Given the results of the various evaluations we've done, I would recommend using the SemanticChunker and Trained Embeddings combination for our production chain. 

It is the best all-around option, as it is the best combination on nearly every metric; the one exception is faithfulness, where it is essentially tied with the Semantic+Untrained chain. Its high faithfulness indicates this combination generates highly accurate responses based on the retrieved content. Its high answer relevancy means that the responses generated are extremely relevant to the user's questions. This metric is critical for a Q+A system because it directly reflects how well the model understands and responds to the user's query. The high context recall shows this combination is retrieving more relevant chunks than the other chains, and the high context precision also shows that the chunks it does retrieve are highly relevant to the query.

Why Not the Other Combinations:
1. Recursive Chunking + Untrained/Trained Embeddings:
    These approaches have lower scores in faithfulness and context recall, likely because the simpler chunking method is splitting important information or mixing unrelated content. This reduces the system’s ability to retrieve complete, meaningful chunks.
2. Semantic Chunking + Untrained Embeddings:
    While this combination retrieves faithful information, the untrained embeddings reduce its ability to fully understand the queries (lower answer relevancy).

## Task 6: Managing Your Boss and User Expectations

> ### Introducing Our Ethical AI Chatbot: Guiding the Future of AI in Enterprise
>
>Over the past several months, our technology team has been working tirelessly to address one of the most pressing and complex challenges we face as a company: **How do we build AI solutions that are not only powerful but also ethical and aligned with our company’s values?**
>
>Through our conversations with various internal stakeholders, it became clear that there is **a growing concern around the implications of AI**—especially as we navigate an election cycle that will inevitably influence AI regulation and policy. Many of you expressed **a need to better understand the evolving landscape of AI**, particularly as it relates to government policy and regulations that are likely to shape the future of our industry. And let’s be honest—keeping up with the pace of change in AI is hard. There are new developments every day, and many of them have the potential to reshape the way we operate, the products we build, and the services we provide.
>
>With this feedback in mind, we took action. Today, I’m proud to introduce a new **AI-powered chatbot** that will help all of us better understand the evolving AI landscape, particularly how it intersects with politics, regulation, and ethical considerations. This chatbot has been designed to provide clarity around these topics using critical and pertinent documents, straight from the experts who may end up writing the laws that dictate AI.
>
>These documents are the cornerstone of our chatbot’s knowledge base. Our team has developed a Retrieval-Augmented Generation (RAG) system, which uses these documents as the primary source of truth for answering your questions about AI ethics, government policies, and industry best practices.
>
>What sets this tool apart is that it’s not just a simple chatbot. We’ve fine-tuned it to our specific use case to ensure that the answers you receive are both relevant and grounded in the most critical, authoritative texts on the subject. Whether you’re asking about how AI regulations could impact our business or what ethical considerations we should be thinking about as we build new AI-powered tools, this chatbot is designed to provide thoughtful, reliable responses.
>
> #### Why This Matters Now
>This is more than just a tech solution—it’s part of our commitment to being a leader in the ethical deployment of AI. As we move forward, there will be political, social, and ethical questions that we will need to address head-on. This chatbot will help us as a company to navigate the evolving AI landscape, giving each of you the knowledge and tools to better understand how AI policies are being shaped—and how we can shape our own AI initiatives responsibly.
>
> #### How You Can Help Shape the Future
>In the next month, our AI Solutions team will be working with 50 internal stakeholders to test and refine this tool based on your feedback. This is an evolving project, and your input will be invaluable in ensuring the chatbot is not just answering questions, but truly helping guide our approach to ethical AI development. After this test phase, we’ll be making the chatbot available across the entire company.
>
> #### Moving Forward
>This is just the beginning. Our goal is to not only provide a tool that educates and informs but to spark conversations within the company about how we can ensure our AI systems are designed, deployed, and managed in ways that are aligned with our company’s values—and ultimately, the values of the society we serve.
>
>I encourage you all to engage with the chatbot once it’s rolled out, to ask tough questions, and to use it as a resource in your daily work. Together, we will ensure that our company remains at the forefront of responsible AI development, and that we are equipped to navigate the rapidly changing AI landscape with confidence and clarity.
>
>Thank you for your continued support, and I look forward to seeing how this tool will empower us all to lead in the future of AI.

For future updates, such as an updated or expanded list of pertinent documents (more White House briefs, executive orders, nonprofit research papers, etc.), we'd just have to modify the list of documents that we load at the start of the application. Ideally, we'd actually store them in a dedicated vector store, rather than reading them and storing them in memory at runtime, particularly if the list of context documents that we're working with ends up being quite large.

Other considerations for the future may include changing to a different embedding model (perhaps one with more parameters or a larger embedding dimension). This would be beneficial if, again, our list of documents started getting large. Further explorations into new chunking strategies, better fine-tuning (many hyperparameters can be changed that were not when I tuned my embeddings), better prompting of the chains (all prompts at the moment are very basic), and even UI improvements would be warranted.

One change that I honestly don't know if it's warranted or not would be to implement a fine-tuned LLM for the chains, as the simple Q+A nature of this application doesn't need a particularly powerful or specialized model (which is not to say that a model tuned to Q+A wouldn't be beneficial, but it seems like it may be a diminishing return depending on the time invested versus performance improvements it may yield). But, depending on how widely used this application may become, it might behoove the company to change from using the proprietary LLMs that it uses now and move to open-source models, just as a simple cost-cutting measure. That may also provide at least an opening to the opportunity of fine-tuning the model for Q+A purposes.