Importing Libraries

In [2]:
!pip install -qU langsmith langchain-core langchain-community langchain-openai langchain-qdrant chainlit
!pip install -qU pymupdf ragas huggingface_hub nbformat sentence-transformers torch


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m885.0 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.9/399.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.5/51.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document

In [4]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

OpenAI API Key:··········


In [5]:
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass('Enter your LangSmith API key: ')


Enter your LangSmith API key: ··········


TASK 1: Dealing with the Data

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load the first document
documents1 = PyMuPDFLoader(file_path="https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf").load()

# Load the second document
documents2 = PyMuPDFLoader(file_path="https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf").load()

Chunking the Data and Splitting

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

CHUNK_SIZE = 400
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,

)

documents1 = text_splitter.split_documents(documents1)
documents2 = text_splitter.split_documents(documents2)


In [8]:
len(documents1)

676

In [9]:
len(documents2)

486

Text Embedding Model to create Vector Store

In [10]:
from langchain_openai import OpenAIEmbeddings

EMBEDDING_MODEL = "text-embedding-3-small"

embeddings = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

Setting up Qdrant Vector Store

In [11]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

LOCATION = ":memory:"
COLLECTION_NAME = "ai_ethics_docs"
VECTOR_SIZE = 1536

Loading the Data into Qdrant Vector Store

In [12]:
qdrant_client = QdrantClient(
    location=LOCATION
)

qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings
)

qdrant_vector_store.add_documents(documents1)
qdrant_vector_store.add_documents(documents2)

['2f176c1000e04013b8b39444a7f112b6',
 '293359a57f18411daaaed2b763f2a14d',
 '01d283a617174e66bc425f24dd2ed542',
 '74563893e5ea45d6bb731c7b8e4a1932',
 '7088db22051b46ed9800daf3b48d64c8',
 '9e85e7a6d4394926bbe457127120a466',
 'd8345e2d41784fe2ad2dd114c0502b4c',
 'd68de121b0b341a3ab0c171ab73c1ff9',
 '088ed2c8210c47b581763280d2593b2c',
 '98f5dec423124a039221e531daad0421',
 'fcfb276d76a343fda858c9bdea98acbc',
 '322e1fff048f488885cddf530331158d',
 'db6cdf132ebb4a48b03636a12100e1a9',
 'cd98f870acc24704afa2f7898bec78ed',
 'eac83b46888d4a64aa7f14ef7466bfc9',
 '56b786ada2d84a8ab908243cd1fc7d5c',
 '381f54c373364f7d95d17c58c77e1a6d',
 '092df5bb627c475cb2b0166e3b001414',
 'a4a2ffaaebef489facb71859882b0bb2',
 '795295670e10450798b45b11b147619c',
 '0a7951edefa64df9a6fb8228e88a4123',
 '041032427be4438a8ae651d7a80f06f1',
 'f567174d3a7c411980f1cb4da5519599',
 '42623c5d99a74d9d9aec9a4b4c7607c3',
 '392ed94c47054a9783de2ff89844fad2',
 '7b70c844f3c0479fb296b084e4c2a762',
 'e13d6ee43e3a4ce0aff1b20d79781e20',
 

TASK 2: RAG Prototype

Setting up the Retriever

In [13]:
retriever = qdrant_vector_store.as_retriever()

retrieved_documents = retriever.invoke("How can I cause no harm with AI?")

for doc in retrieved_documents:
  print(doc)

page_content='prevent future occurrences. Conduct post-mortem analyses of incidents with 
relevant AI Actors, to understand the root causes and implement preventive 
measures. 
Human-AI Conﬁguration; 
Dangerous, Violent, or Hateful 
Content 
MG-4.2-003 Use visualizations or other methods to represent GAI model behavior to ease 
non-technical stakeholders understanding of GAI system functionality.' metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 48, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': "D:20240805141702-04'00'", 'modDate': "D:20240805143048-04'00'", 'trapped': '', '_id': 'c102b0961c10

Creating the RAG Chain

In [14]:
from langchain import hub

retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

print(retrieval_qa_prompt.messages[0].prompt.template)


Answer any use questions based solely on the context below:

<context>
{context}
</context>


In [15]:
from langchain.prompts import ChatPromptTemplate

template = """
You are a helpful assistant. Act as an AI ethics expert and answer the question in a succinct way.
If you cannot answer the question based on the context - you must say "I don't know".

Question:
{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_template(template)

In [16]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

retrieval_augmented_qa_chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question")
    }
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {
        "response": prompt | primary_qa_llm,
        "context": itemgetter("context")
    }
)

In [17]:
# Example query
query = "How do cause no harm with AI?"
response = retrieval_augmented_qa_chain.invoke({"question": query})
print(response["response"].content)

To cause no harm with AI, it is essential to manage harmful biases, ensure privacy, and maintain safety, security, and resilience. Implementing policies and procedures for risk management, conducting impact assessments, and engaging in post-mortem analyses of incidents can help identify and mitigate risks associated with AI systems. Additionally, fostering transparency and understanding of AI functionalities among stakeholders is crucial.


In [18]:
#Example query 2

query = "What is a rule of thumb for data governance?"
response = retrieval_augmented_qa_chain.invoke({"question": query})
print(response["response"].content)

A rule of thumb for data governance is to ensure data quality and maintain transparency regarding the origin and history of data, especially in sensitive domains, to avoid adverse consequences from flawed decision-making.


Deploying the RAG Prototype to Hugging Face - DONE


TASK 3: Creating Test Data

In [None]:
eval_documents = PyMuPDFLoader(file_path="https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf").load() + \
                 PyMuPDFLoader(file_path="https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf").load()

text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

eval_documents = text_splitter_eval.split_documents(eval_documents)

In [None]:
len(eval_documents)

910

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

num_qa_pairs = 40 # You can reduce the number of QA pairs if you're experiencing rate-limiting issues

testset = generator.generate_with_langchain_docs(eval_documents, num_qa_pairs, distributions)

testset.to_pandas()


embedding nodes:   0%|          | 0/1820 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/40 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How does the lack of notice or explanation reg...,[ever being notified that data was being colle...,The lack of notice or explanation regarding da...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
1,What are some TEVV considerations that need to...,[Information Integrity \nAI Actor Tasks: End U...,Scientific integrity and TEVV considerations t...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
2,How can Generative AI facilitate the spread of...,[Disinformation and misinformation – both of w...,Generative AI can facilitate the spread of dis...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
3,How should data from sensitive domains like cr...,[in the spreading and scaling of harms. Data f...,Data from sensitive domains like criminal just...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
4,How does the Blueprint for an AI Bill of Right...,[- \nUSING THIS TECHNICAL COMPANION\nThe Bl...,The Blueprint for an AI Bill of Rights aims to...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
5,How do generative models like LLMs generate ou...,[Confabulations can occur across GAI outputs a...,Generative models like LLMs generate outputs b...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
6,How should consent practices ensure use-specif...,[control over their data \nUse-specific consen...,Consent practices should ensure use-specific c...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
7,How can digital content transparency solutions...,[Human-AI Conﬁguration \nMS-2.8-003 \nUse digi...,Digital content transparency solutions can be ...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
8,How should risks be re-evaluated when adapting...,[and Component Integration \nMP-4.1-007 Re-eva...,Risks should be re-evaluated when adapting GAI...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
9,What principles have been proposed for the eth...,"[mated systems, and researchers developing inn...","Advocates, researchers, and government organiz...",simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True


In [None]:
testset.test_data[0]

DataRow(question='How does the lack of notice or explanation regarding data collection affect parental knowledge in child maltreatment risk assessments?', contexts=['ever being notified that data was being collected and used as part of an algorithmic child maltreatment\nrisk assessment.84 The lack of notice or an explanation makes it harder for those performing child\nmaltreatment assessments to validate the risk assessment and denies parents knowledge that could help them\ncontest a decision.\n41'], ground_truth='The lack of notice or explanation regarding data collection in child maltreatment risk assessments makes it harder for those performing the assessments to validate the risk assessment. It also denies parents knowledge that could help them contest a decision.', evolution_type='simple', metadata=[{'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint

In [None]:
testset_df = testset.to_pandas()
testset_df.to_csv("synthetic_data.csv")

In [None]:
import pandas as pd

test_df = pd.read_csv("synthetic_data.csv")

In [None]:
testset_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How does the lack of notice or explanation reg...,[ever being notified that data was being colle...,The lack of notice or explanation regarding da...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
1,What are some TEVV considerations that need to...,[Information Integrity \nAI Actor Tasks: End U...,Scientific integrity and TEVV considerations t...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
2,How can Generative AI facilitate the spread of...,[Disinformation and misinformation – both of w...,Generative AI can facilitate the spread of dis...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
3,How should data from sensitive domains like cr...,[in the spreading and scaling of harms. Data f...,Data from sensitive domains like criminal just...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
4,How does the Blueprint for an AI Bill of Right...,[- \nUSING THIS TECHNICAL COMPANION\nThe Bl...,The Blueprint for an AI Bill of Rights aims to...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
5,How do generative models like LLMs generate ou...,[Confabulations can occur across GAI outputs a...,Generative models like LLMs generate outputs b...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
6,How should consent practices ensure use-specif...,[control over their data \nUse-specific consen...,Consent practices should ensure use-specific c...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
7,How can digital content transparency solutions...,[Human-AI Conﬁguration \nMS-2.8-003 \nUse digi...,Digital content transparency solutions can be ...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
8,How should risks be re-evaluated when adapting...,[and Component Integration \nMP-4.1-007 Re-eva...,Risks should be re-evaluated when adapting GAI...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
9,What principles have been proposed for the eth...,"[mated systems, and researchers developing inn...","Advocates, researchers, and government organiz...",simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True


In [None]:
test_questions = testset_df["question"].values.tolist()
test_groundtruths = testset_df["ground_truth"].values.tolist()


In [None]:
answers = []
contexts = []

for question in test_questions:
    response = retrieval_augmented_qa_chain.invoke({"question": question})
    answers.append(response["response"].content)
    contexts.append([context.page_content for context in response["context"]])



In [None]:
from datasets import Dataset

min_length = min(len(test_questions[:10]), len(answers[:10]), len(contexts[:10]), len(test_groundtruths[:10]))

response_dataset = Dataset.from_dict({
    "question": test_questions[:min_length],
    "answer": answers[:min_length],
    "contexts": contexts[:min_length],
    "ground_truth": test_groundtruths[:min_length]
})

In [None]:
response_dataset[0]

{'question': 'How does the lack of notice or explanation regarding data collection affect parental knowledge in child maltreatment risk assessments?',
 'answer': 'The lack of notice or explanation regarding data collection in child maltreatment risk assessments significantly affects parental knowledge by making it difficult for parents to understand and contest the decisions made based on these assessments. Without being informed about what data is collected and how it is used, parents are left without the necessary information to validate the risk assessment or challenge any conclusions drawn from it.',
 'contexts': ['ever being notified that data was being collected and used as part of an algorithmic child maltreatment\nrisk assessment.84 The lack of notice or an explanation makes it harder for those performing child\nmaltreatment assessments to validate the risk assessment and denies parents knowledge that could help them\ncontest a decision.\n41',
  'practices. In a court hearing, 

Assess your pipeline using the RAGAS framework including key metrics faithfulness, answer relevancy, context precision, and context recall.

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [None]:
# Evaluate the first dataset
results1 = evaluate(response_dataset, metrics)

# Print the results
print("Results for dataset:")
print(results1)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Results for dataset:
{'faithfulness': 0.8651, 'answer_relevancy': 0.9615, 'context_recall': 0.8333, 'context_precision': 0.9556, 'answer_correctness': 0.7518}


Here are some conclusions about the performance and effectiveness of your RAG pipeline:
Faithfulness:
Dataset 1: 0.7120
Dataset 2: 0.7655
Both scores are decent, indicating that the model's responses are somewhat faithful to the given context, but there's room for improvement.
Answer Relevancy:
Dataset 1: 0.9760
Dataset 2: 0.7788
The model performs really well on Dataset 1, with good relevancy. Dataset 2 shows moderate relevancy, suggesting inconsistent performance across datasets.
Context Recall:
Dataset 1: 0.7111
Dataset 2: 0.1111
There's a significant discrepancy here. The model recalls context well for Dataset 1 but poorly for Dataset 2.
Context Precision:
Dataset 1: 0.7963
Dataset 2: 0.1389
Similar to recall, precision is moderate for Dataset 1 but very low for Dataset 2.
5. Answer Correctness:
Dataset 1: 0.8100
Dataset 2: 0.2831
The model performs reasonably well on Dataset 1 but poorly on Dataset 2.

Conclusions:
1. Inconsistent performance: The model performs significantly better on Dataset 1 across all metrics.
Context retrieval issues: Low recall and precision for Dataset 2 suggest problems with retrieving relevant context.
Dataset-specific challenges: Dataset 2 might be more complex or less well-represented in the model's training data.
Room for improvement: Even for Dataset 1, scores indicate potential for enhancement, especially in faithfulness and context precision.
Potential overfitting: The stark difference in performance might indicate overfitting to data similar to Dataset 1.

TASK 4: Fine-tuning the Embeddings

Generate synthetic fine-tuning data and complete fine-tuning of the open-source embedding model

Step 1: Create Training, Validation, and Test Datasets
We'll split the documents into training, validation, and test sets for both datasets.

In [23]:
documents = documents1 + documents2

In [24]:
len(documents)

1162

In [25]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)

In [29]:
training_documents = text_splitter.split_documents(documents)

In [30]:
len(training_documents)

1162

In [31]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [32]:
training_split_documents = training_documents[:300]
val_split_documents = training_documents[300:350]
test_split_documents = training_documents[350:400]

In [33]:
import json
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Initialize the ChatOpenAI model
qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

# Create the custom question generation prompt
qa_prompt = """\
You are an AI ethics expert. Generate {n_questions} questions that could be answered based on the following context. The questions should be related to AI ethics and should be answerable in a succinct way.

If a question cannot be directly answered based solely on the given context, do not include it.

Provide the questions in the following format:
1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

# Create the question generation chain
question_generation_chain = qa_prompt_template | qa_chat_model

# Custom prompt for the RAG chain
rag_template = """
You are a helpful assistant. Act as an AI ethics expert and answer the question in a succinct way.
If you cannot answer the question based on the context - you must say "I don't know".

Question:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_template)


In [34]:
def create_questions(documents, n_questions):
    questions = {}
    relevant_docs = {}

    for doc in tqdm(documents):
        doc_id = doc.metadata["id"]
        context = doc.page_content

        # Generate questions using the question generation chain
        generated_questions = question_generation_chain.invoke({
            "context": context,
            "n_questions": n_questions
        }).content

        # Parse the generated questions
        for i, question in enumerate(generated_questions.split('\n')):
            if question.strip():
                question_id = f"{doc_id}_{i+1}"
                questions[question_id] = question.split('. ', 1)[1]  # Remove the number prefix
                relevant_docs[question_id] = [doc_id]

    return questions, relevant_docs


In [35]:
training_questions, training_relevant_contexts = create_questions(training_split_documents, n_questions=2)

100%|██████████| 300/300 [05:29<00:00,  1.10s/it]


In [36]:
val_questions, val_relevant_contexts = create_questions(val_split_documents, n_questions=2)

100%|██████████| 50/50 [00:48<00:00,  1.03it/s]


In [37]:
test_questions, test_relevant_contexts = create_questions(test_split_documents, n_questions=2)

100%|██████████| 50/50 [00:49<00:00,  1.01it/s]


In [38]:
def generate_and_save_dataset(documents, n_questions, filename):
    questions, relevant_contexts = create_questions(documents, n_questions)
    corpus = {doc.metadata["id"]: doc.page_content for doc in documents}

    dataset = {
        "questions": questions,
        "relevant_contexts": relevant_contexts,
        "corpus": corpus,
        "rag_prompt": rag_template  # Include the RAG prompt in the dataset
    }

    with open(filename, "w") as f:
        json.dump(dataset, f)

    print(f"Dataset saved to {filename}")


In [39]:
# Generate and save training dataset
generate_and_save_dataset(training_split_documents, n_questions=2, filename="training_dataset.jsonl")

# Generate and save validation dataset
generate_and_save_dataset(val_split_documents, n_questions=2, filename="val_dataset.jsonl")

# Generate and save test dataset
generate_and_save_dataset(test_split_documents, n_questions=2, filename="test_dataset.jsonl")

100%|██████████| 300/300 [05:23<00:00,  1.08s/it]


Dataset saved to training_dataset.jsonl


100%|██████████| 50/50 [00:46<00:00,  1.07it/s]


Dataset saved to val_dataset.jsonl


100%|██████████| 50/50 [00:53<00:00,  1.06s/it]

Dataset saved to test_dataset.jsonl





In [40]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [41]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [42]:
test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

Step 2 - Fine tuning the Embedding Model on our Dataset

In [43]:
!pip install -qU sentence_transformers datasets pyarrow

In [44]:
from sentence_transformers import SentenceTransformer

from google.colab import userdata
userdata.get('HF_TOKEN')

model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/84.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [45]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [46]:
BATCH_SIZE = 20

In [47]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [48]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [49]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [50]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [51]:
EPOCHS = 5

In [52]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100
30,No log,No log,0.81,0.93,0.97,0.98,0.81,0.31,0.194,0.098,0.81,0.93,0.97,0.98,0.896026,0.868333,0.869677,0.81,0.93,0.97,0.98,0.81,0.31,0.194,0.098,0.81,0.93,0.97,0.98,0.896026,0.868333,0.869677
50,No log,No log,0.82,0.95,0.98,0.99,0.82,0.316667,0.196,0.099,0.82,0.95,0.98,0.99,0.911285,0.885167,0.885792,0.82,0.95,0.98,0.99,0.82,0.316667,0.196,0.099,0.82,0.95,0.98,0.99,0.911285,0.885167,0.885792
60,No log,No log,0.83,0.95,0.98,0.99,0.83,0.316667,0.196,0.099,0.83,0.95,0.98,0.99,0.914976,0.890167,0.890792,0.83,0.95,0.98,0.99,0.83,0.316667,0.196,0.099,0.83,0.95,0.98,0.99,0.914976,0.890167,0.890792
90,No log,No log,0.85,0.95,0.98,0.99,0.85,0.316667,0.196,0.099,0.85,0.95,0.98,0.99,0.924976,0.9035,0.904214,0.85,0.95,0.98,0.99,0.85,0.316667,0.196,0.099,0.85,0.95,0.98,0.99,0.924976,0.9035,0.904214
100,No log,No log,0.84,0.95,0.98,0.99,0.84,0.316667,0.196,0.099,0.84,0.95,0.98,0.99,0.921285,0.8985,0.899214,0.84,0.95,0.98,0.99,0.84,0.316667,0.196,0.099,0.84,0.95,0.98,0.99,0.921285,0.8985,0.899214
120,No log,No log,0.84,0.95,0.98,0.99,0.84,0.316667,0.196,0.099,0.84,0.95,0.98,0.99,0.921285,0.8985,0.899214,0.84,0.95,0.98,0.99,0.84,0.316667,0.196,0.099,0.84,0.95,0.98,0.99,0.921285,0.8985,0.899214
150,No log,No log,0.83,0.95,0.98,0.99,0.83,0.316667,0.196,0.099,0.83,0.95,0.98,0.99,0.917595,0.8935,0.894214,0.83,0.95,0.98,0.99,0.83,0.316667,0.196,0.099,0.83,0.95,0.98,0.99,0.917595,0.8935,0.894214


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [53]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [61]:
import pandas as pd
from langchain_core.documents import Document
from tqdm import tqdm
import os

from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

def setup_qdrant(documents):
    EMBEDDING_MODEL = "text-embedding-3-small"
    LOCATION = ":memory:"
    COLLECTION_NAME = "ai_ethics_docs"
    VECTOR_SIZE = 1536

    embeddings = OpenAIEmbeddings(
        model=EMBEDDING_MODEL,
        openai_api_key=os.environ["OPENAI_API_KEY"]
    )

    qdrant_client = QdrantClient(location=LOCATION)

    qdrant_client.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    )

    qdrant_vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=COLLECTION_NAME,
        embedding=embeddings
    )

    qdrant_vector_store.add_documents(documents)

    return qdrant_vector_store

def evaluate_qdrant(dataset, top_k=5, verbose=False):
    corpus = dataset['corpus']
    questions = dataset['questions']
    relevant_docs = dataset['relevant_contexts']

    # Prepare documents
    documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]

    # Setup Qdrant and add documents
    vectorstore = setup_qdrant(documents)

    # Create retriever
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

    eval_results = []
    for id, question in tqdm(questions.items()):
        retrieved_nodes = retriever.invoke(question)
        retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
        expected_id = relevant_docs[id][0]
        is_hit = expected_id in retrieved_ids
        eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

    return eval_results

In [62]:
# Usage example:
results = evaluate_qdrant(test_dataset)

results_df = pd.DataFrame(results)
hit_rate = results_df["is_hit"].mean()
print(f"Hit rate: {hit_rate}")

  qdrant_client.recreate_collection(
100%|██████████| 100/100 [00:26<00:00,  3.80it/s]

Hit rate: 0.92





Evaluating Snowflake Base Model

In [68]:
import pandas as pd
from langchain_core.documents import Document
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
import numpy as np

def setup_qdrant_snowflake(documents):
    LOCATION = ":memory:"
    COLLECTION_NAME = "snowflake_arctic_docs"
    VECTOR_SIZE = 768  # Snowflake Arctic base model embedding size

    # Load Snowflake Arctic base model
    model = SentenceTransformer('Snowflake/snowflake-arctic-embed-m')

    qdrant_client = QdrantClient(location=LOCATION)

    # Always create a new collection for in-memory client
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    )

    # Embed documents and add to Qdrant
    for doc in documents:
        embedding = model.encode(doc.page_content)
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=[PointStruct(id=doc.metadata["id"], vector=embedding.tolist(), payload={"content": doc.page_content})]
        )

    return qdrant_client, COLLECTION_NAME, model

def evaluate_snowflake_arctic(dataset, top_k=5, verbose=False):
    corpus = dataset['corpus']
    questions = dataset['questions']
    relevant_docs = dataset['relevant_contexts']

    # Prepare documents
    documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]

    # Setup Qdrant with Snowflake Arctic embeddings and add documents
    qdrant_client, collection_name, model = setup_qdrant_snowflake(documents)

    eval_results = []
    for id, question in tqdm(questions.items()):
        query_vector = model.encode(question).tolist()
        search_result = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=top_k
        )
        retrieved_ids = [hit.id for hit in search_result]
        expected_id = relevant_docs[id][0]
        is_hit = expected_id in retrieved_ids
        eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

    return eval_results

# Usage example:
results = evaluate_snowflake_arctic(test_dataset)

results_df = pd.DataFrame(results)
hit_rate = results_df["is_hit"].mean()
print(f"Hit rate for Snowflake Arctic base model: {hit_rate}")

100%|██████████| 100/100 [00:01<00:00, 57.82it/s]

Hit rate for Snowflake Arctic base model: 0.89





Evaluating our fine tuned model

In [69]:
import pandas as pd
from langchain_core.documents import Document
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
import numpy as np

def setup_qdrant_snowflake(documents):
    LOCATION = ":memory:"
    COLLECTION_NAME = "snowflake_arctic_docs"
    VECTOR_SIZE = 768  # Snowflake Arctic base model embedding size

    # Load Snowflake Arctic base model
    model = SentenceTransformer('finetuned_arctic')

    qdrant_client = QdrantClient(location=LOCATION)

    # Always create a new collection for in-memory client
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    )

    # Embed documents and add to Qdrant
    for doc in documents:
        embedding = model.encode(doc.page_content)
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=[PointStruct(id=doc.metadata["id"], vector=embedding.tolist(), payload={"content": doc.page_content})]
        )

    return qdrant_client, COLLECTION_NAME, model

def evaluate_snowflake_arctic(dataset, top_k=5, verbose=False):
    corpus = dataset['corpus']
    questions = dataset['questions']
    relevant_docs = dataset['relevant_contexts']

    # Prepare documents
    documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]

    # Setup Qdrant with Snowflake Arctic embeddings and add documents
    qdrant_client, collection_name, model = setup_qdrant_snowflake(documents)

    eval_results = []
    for id, question in tqdm(questions.items()):
        query_vector = model.encode(question).tolist()
        search_result = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=top_k
        )
        retrieved_ids = [hit.id for hit in search_result]
        expected_id = relevant_docs[id][0]
        is_hit = expected_id in retrieved_ids
        eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

    return eval_results

# Usage example:
results = evaluate_snowflake_arctic(test_dataset)

results_df = pd.DataFrame(results)
hit_rate = results_df["is_hit"].mean()
print(f"Hit rate for Snowflake Arctic fine tuned model: {hit_rate}")

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [00:01<00:00, 55.59it/s]

Hit rate for Snowflake Arctic fine tuned model: 0.98



