In [1]:
# Import necessary libraries
import os
import pandas as pd
from typing import List, Dict, Any
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain.schema.document import Document
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from dotenv import load_dotenv
import time
import uuid  # Import UUID module

# Load environment variables
load_dotenv()

# Configuration
COLLECTION_NAME = "founders"
VECTOR_DIM = 1536  # OpenAI embedding dimension
EMBEDDING_MODEL = "text-embedding-3-small"
BATCH_SIZE = 10  # Process profiles in smaller batches to avoid rate limits

print("Step 1: Setting up the vector database...")
# Initialize Qdrant client (in-memory)
client = QdrantClient(":memory:")

# Create collection
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
)
print("✅ Vector database initialized")

print("\nStep 2: Loading profiles from CSV...")
# Update this path to your CSV file location
csv_path = "specter-people-db--export_small.csv"  # Update this to your CSV file path
df = pd.read_csv(csv_path)
profiles = df.to_dict('records')
print(f"✅ Loaded {len(profiles)} profiles from CSV")

print("\nStep 3: Creating LangChain documents for embedding...")
# Initialize the embeddings model
embeddings_model = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Create LangChain documents with UUID4 IDs
documents = []
document_ids = []  # Store UUIDs separately

for p in profiles:
    # Generate a UUID4 for this document
    doc_id = str(uuid.uuid4())
    
    # Build a text representation including key fields
    text_parts = []
    
    # Add core identity information
    if p.get('Full Name'): text_parts.append(f"Name: {p.get('Full Name')}")
    if p.get('Current Position'): text_parts.append(f"Position: {p.get('Current Position Title')}")
    if p.get('Company'): text_parts.append(f"Company: {p.get('Current Position Company Name')}")
    if p.get('Location'): text_parts.append(f"Location: {p.get('Location')}")
    
    # Add contact and social media information
    if p.get('LinkedIn'): text_parts.append(f"LinkedIn: {p.get('LinkedIn - URL')}")
    if p.get('Twitter'): text_parts.append(f"Twitter: {p.get('Twitter - URL')}")
    if p.get('Website'): text_parts.append(f"Website: {p.get('Website - URL')}")
    if p.get('Email'): text_parts.append(f"Email: {p.get('Email')}")
    
    # Add detailed professional information
    if p.get('About'): text_parts.append(f"About: {p.get('About')}")
    if p.get('Skills'): text_parts.append(f"Skills: {p.get('Skills')}")
    if p.get('Experience'): text_parts.append(f"Experience: {p.get('Experience')}")
    if p.get('Education'): text_parts.append(f"Education: {p.get('Education')}")
    
    # Add any industry or sector information
    if p.get('Industry'): text_parts.append(f"Industry: {p.get('Industry')}")
    if p.get('Sector'): text_parts.append(f"Sector: {p.get('Sector')}")
    
    # Add any entrepreneurial information
    if p.get('Previous Startups'): text_parts.append(f"Previous Startups: {p.get('Previous Startups')}")
    if p.get('Funding History'): text_parts.append(f"Funding History: {p.get('Funding History')}")
    
    # Add any additional fields that might be in the CSV
    for key, value in p.items():
        if (key not in ['Full Name', 'Current Position', 'Company', 'Location', 
                       'LinkedIn', 'Twitter', 'Website', 'Email',
                       'About', 'Skills', 'Experience', 'Education', 
                       'Industry', 'Sector', 'Previous Startups', 'Funding History'] 
            and value and str(value).lower() != 'nan'):
            text_parts.append(f"{key}: {value}")
    
    # Join all parts with newlines for better separation
    text = "\n".join(text_parts)
    
    # Add the UUID to the metadata
    p['doc_id'] = doc_id
    
    # Create a LangChain Document with the text and metadata
    document = Document(
        page_content=text,
        metadata=p  # Store the original profile as metadata (now includes doc_id)
    )
    documents.append(document)
    document_ids.append(doc_id)

# Print a sample document with its UUID
if documents:
    print("\nSample document for embedding:")
    print("-" * 50)
    sample_doc = documents[0]
    sample_id = document_ids[0]
    print(f"Document ID (UUID4): {sample_id}")
    if len(sample_doc.page_content) > 500:
        print(sample_doc.page_content[:500] + "...")
    else:
        print(sample_doc.page_content)
    print("-" * 50)

print(f"✅ Created {len(documents)} documents with UUID4 identifiers")

print("\nStep 4: Creating LangChain vector store and retriever...")
# First, create the vector store with the existing client
vector_store = Qdrant(
    client=client,  # Use the existing client
    collection_name=COLLECTION_NAME,
    embeddings=embeddings_model
)

# Process in batches to avoid rate limits
total_documents = len(documents)
num_batches = (total_documents + BATCH_SIZE - 1) // BATCH_SIZE

# Process documents in batches
for batch_idx in range(num_batches):
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, total_documents)
    
    print(f"Processing batch {batch_idx+1}/{num_batches} (documents {start_idx+1}-{end_idx})...")
    
    batch_docs = documents[start_idx:end_idx]
    batch_ids = document_ids[start_idx:end_idx]
    
    try:
        # Extract texts and metadatas from documents
        texts = [doc.page_content for doc in batch_docs]
        metadatas = [doc.metadata for doc in batch_docs]
        
        # Add texts to the vector store with their UUIDs
        vector_store.add_texts(
            texts=texts,
            metadatas=metadatas,
            ids=batch_ids  # Use UUID4 strings as IDs
        )
        
        # Sleep to avoid rate limits
        if batch_idx < num_batches - 1:
            print("Waiting 1 second before next batch...")
            time.sleep(1)  # 1 second delay between batches
        
    except Exception as e:
        print(f"Error processing batch {batch_idx+1}/{num_batches}: {str(e)}")
        # Continue with next batch

print(f"✅ Finished processing {total_documents} documents with UUID4 identifiers")

# Create a retriever from the vector store
retriever = vector_store.as_retriever(
    search_type="similarity",  # Options: "similarity", "mmr"
    search_kwargs={"k": 3}     # Return top 3 results
)

print("✅ Created LangChain retriever")

print("\n" + "="*50)

Step 1: Setting up the vector database...
✅ Vector database initialized

Step 2: Loading profiles from CSV...
✅ Loaded 20 profiles from CSV

Step 3: Creating LangChain documents for embedding...

Sample document for embedding:
--------------------------------------------------
Document ID (UUID4): 76749484-3c7f-42cc-b615-623e55fb205c
Name: Keith Teare
Location: Palo Alto, California, United States, United States
About: I am a founder and CEO at SignalRank, a technology company that uses data intelligence to form a partner network with top-performing managers. We have built a financial instrument that captures top decile value creation that is exclusive to the best early stage companies. I have over 40 years of experience in digital technology, as a founder, CTO, or CEO, in various domains, such as internet services, keywords...
--------------------------------------------------
✅ Created 20 documents with UUID4 identifiers

Step 4: Creating LangChain vector store and retriever...
Proce

  client.recreate_collection(
  vector_store = Qdrant(


Waiting 1 second before next batch...
Processing batch 2/2 (documents 11-20)...
✅ Finished processing 20 documents with UUID4 identifiers
✅ Created LangChain retriever



In [2]:
print("EXAMPLE 1: BASIC RETRIEVAL")
print("="*50)

query = "AI experts in healthcare"  # You can change this query
print(f"Retrieving documents for: '{query}'")

# Retrieve documents
retrieved_docs = retriever.invoke(query)

# Display results
print(f"Retrieved {len(retrieved_docs)} documents")
for i, doc in enumerate(retrieved_docs):
    profile = doc.metadata
    
    print(f"\nResult #{i+1}: {profile.get('Full Name', 'Unknown')}")
    print(f"Document ID: {profile.get('doc_id')}")  # Display the UUID
    print(f"Position: {profile.get('Current Position', 'N/A')}")
    print(f"Company: {profile.get('Company', 'N/A')}")
    print(f"Location: {profile.get('Location', 'N/A')}")
    if profile.get('Skills'):
        print(f"Skills: {profile.get('Skills')}")
    
    # Print a snippet of the document content
    content = doc.page_content
    if len(content) > 200:
        print(f"Content snippet: {content[:200]}...")
    else:
        print(f"Content: {content}")

print("\n" + "="*50)

EXAMPLE 1: BASIC RETRIEVAL
Retrieving documents for: 'AI experts in healthcare'
Retrieved 3 documents

Result #1: Alexey Skutin
Document ID: ef776d21-76d3-4fd2-b9b2-36428f019e0f
Position: N/A
Company: N/A
Location: Pacifica, California, United States, United States
Skills: ["Cloud Computing","Software Development","Product Management","Start-ups","SaaS","Agile Methodologies","Entrepreneurship","Enterprise Software","Java","Mobile Applications","PaaS","Project Management","Software Project Management","Business Intelligence","IaaS","Java Enterprise Edition","Web Development","Apache","Integration","MySQL","Scrum","Cloud Security","Distributed Systems","Leadership","Management skills","Mobile Devices","Scalability","Software Engineering","Software as a Service (SaaS)","Technical expertise","Web Services","Git","PHP","Strategic Partnerships","Web Applications","Analytical Skills","B2B2C","B2C","Business-to-Business (B2B)","Cost Management","ERP Software","Problem Solving","Risk Management

In [5]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

You are a helpful assistant. Use the available context to answer the question. If you can't answer the question, say you don't know.
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [6]:
from langchain_openai import ChatOpenAI

openai_chat_model = ChatOpenAI(model="gpt-4o-mini")

In [7]:
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser

rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | rag_prompt | openai_chat_model | StrOutputParser()
)

In [10]:
rag_chain.invoke({"question" : "can you help me find founders who would be a good fit for a startup that is building a platform for AI agents to collaborate on tasks?"})

"Based on the provided context, here are a few founders who could be a good fit for a startup developing a platform for AI agents to collaborate on tasks:\n\n1. **Mark Goldenson**\n   - **Location:** Mountain View, California, United States\n   - **Tagline:** Founder & startup investor\n   - **About:** Mark has extensive experience in startups and technology. His focus includes building products that solve real-world problems and helping founders. His background in product management, especially at Google, could be advantageous for developing collaborative AI systems.\n\n   **LinkedIn:** [Mark Goldenson](https://www.linkedin.com/in/goldenson)\n\n2. **Alexey Skutin**\n   - **Location:** Pacifica, California, United States\n   - **Tagline:** Founder @ Aide AI | Co-Founder and CTO @ CultureBee | IT Expertise\n   - **About:** Alexey has 20+ years of experience in the IT industry, specializing in enterprise, security, and SaaS/PaaS software. His leadership in creating complex high-load syst

In [15]:
!pip install -qU ragas

1913.69s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
from getpass import getpass
os.environ["RAGAS_APP_TOKEN"] = getpass("Please enter your Ragas API key!")

In [16]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [18]:
!pip install rapidfuzz

1959.16s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting rapidfuzz
  Obtaining dependency information for rapidfuzz from https://files.pythonhosted.org/packages/1d/ce/f209f437c6df46ba523a6898ebd854b30196650f77dcddf203191f09bf9b/rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Using cached rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.12.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [19]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(documents, testset_size=10)

Applying SummaryExtractor:  24%|██▍       | 5/21 [00:24<01:18,  4.93s/it]   Property 'summary' already exists in node '13636c'. Skipping!
Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/113 [00:00<?, ?it/s]Property 'summary_embedding' already exists in node '13636c'. Skipping!
Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:  73%|███████▎  | 83/113 [02:46<01:39,  3.33s/it]unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-LsdVv2bXqJ9TPrSXuJ0Tazh3 on tokens per min (TPM): Limit 30000, Used 28335, Requested 3502. Please try again in 3.674s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:  83%|████████▎ | 94/113 [03:22<01:10,  3.73s/it]unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached

In [20]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Wht is the role of SignalRank Corporation?,"[Experience"",""Consumer Internet"",""Digital Mark...",SignalRank Corporation is a company where the ...,single_hop_specifc_query_synthesizer
1,What JPRS do in Japan?,"[about the IP Communications space, and partic...","In Japan, JPRS was involved in deals for names...",single_hop_specifc_query_synthesizer
2,What services does Airbnb provide?,"[Experience: [{""Company Name"":""Start-up-Chris-...","Airbnb, Inc. operates an online marketplace fo...",single_hop_specifc_query_synthesizer
3,What role does Airbnb play in Christopher Ober...,[Name: Christopher Obereder Location: Atherton...,Airbnb is one of the industry giants included ...,single_hop_specifc_query_synthesizer
4,How does Poornima Vijayashanker's experience i...,[<1-hop>\n\nName: Poornima Vijayashanker Locat...,Poornima Vijayashanker's role at Apple involve...,multi_hop_abstract_query_synthesizer
5,How does B. Pagels-Minor's advocacy for mutual...,[<1-hop>\n\nName: B. Pagels-Minor Location: Lo...,B. Pagels-Minor's advocacy for mutual accounta...,multi_hop_abstract_query_synthesizer
6,How has Mark Goldenson's experience in artific...,"[<1-hop>\n\nEducation"",""HR and Recruiting"",""He...",Mark Goldenson's extensive experience in artif...,multi_hop_abstract_query_synthesizer
7,How has Greg Badros contributed to brand trans...,"[<1-hop>\n\nExperience: [{""Company Name"":""Glur...",Greg Badros has significantly contributed to b...,multi_hop_abstract_query_synthesizer
8,How does Greg Badros' experience and expertise...,[<1-hop>\n\nName: Greg Badros Location: Los Al...,"Greg Badros, with his extensive experience in ...",multi_hop_specific_query_synthesizer
9,How has Jeff Pressman's experience with Operam...,[<1-hop>\n\nName: Jeff Pressman Location: Los ...,"Jeff Pressman's experience with Operam, where ...",multi_hop_specific_query_synthesizer


In [21]:
dataset.upload()

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/463522e1-c768-4c3d-ac15-47caad4f1e26


'https://app.ragas.io/dashboard/alignment/testset/463522e1-c768-4c3d-ac15-47caad4f1e26'

In [22]:
def retrieve(state):
  retrieved_docs = retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [23]:
from langchain.prompts import ChatPromptTemplate

RAG_PROMPT = """\
You are a helpful assistant who answers questions based on provided context. You must only use the provided context, and cannot use your own knowledge.

### Question
{question}

### Context
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [24]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [25]:
def generate(state):
  docs_content = "\n\n".join(doc.page_content for doc in state["context"])
  messages = rag_prompt.format_messages(question=state["question"], context=docs_content)
  response = llm.invoke(messages)
  return {"response" : response.content}

In [26]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class State(TypedDict):
  question: str
  context: List[Document]
  response: str

In [27]:
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [28]:
response = graph.invoke({"question" : "Which profiles are best suited for building autonomous agents startups?"})

In [29]:
response["response"]

'The profiles best suited for building autonomous agents startups based on the provided context include:\n\n1. **Christopher Obereder**: \n   - Position: Chief Executive Officer at Start-Up-Chris Ventures\n   - Experience: Over 12 years in tech, notable for his background in venture capital, entrepreneurship, and growth hacking. He has successfully led multiple startups and invested heavily in the tech industry, including major players in blockchain and software.\n   - Skills: Strong in fundraising and growth strategies, making him well-placed to drive startups focusing on innovative solutions like autonomous agents.\n\n2. **Anthony Kelani**:\n   - Position: Chief Technology Officer at Mursion\n   - Experience: Over 20 years in the tech industry, with a strong focus on artificial intelligence and real-time technology. He is actively developing AI-powered platforms that simulate human interactions, which align closely with the fundamentals of creating autonomous agents.\n   - Skills: Ex

In [30]:
for test_row in dataset:
  response = graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [31]:
dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,Wht is the role of SignalRank Corporation?,"[Name: Keith Teare\nLocation: Palo Alto, Calif...","[Experience"",""Consumer Internet"",""Digital Mark...","The role of SignalRank Corporation, as describ...",SignalRank Corporation is a company where the ...,single_hop_specifc_query_synthesizer
1,What JPRS do in Japan?,[Name: coinstats.app\nLocation: 42629\nAbout: ...,"[about the IP Communications space, and partic...",The context provided does not contain any spec...,"In Japan, JPRS was involved in deals for names...",single_hop_specifc_query_synthesizer
2,What services does Airbnb provide?,"[Name: Allen Narcisse\nLocation: Los Angeles, ...","[Experience: [{""Company Name"":""Start-up-Chris-...",The provided context does not mention Airbnb o...,"Airbnb, Inc. operates an online marketplace fo...",single_hop_specifc_query_synthesizer
3,What role does Airbnb play in Christopher Ober...,[Name: Christopher Obereder\nLocation: Atherto...,[Name: Christopher Obereder Location: Atherton...,Airbnb plays a significant role in Christopher...,Airbnb is one of the industry giants included ...,single_hop_specifc_query_synthesizer
4,How does Poornima Vijayashanker's experience i...,[Name: Poornima Vijayashanker\nLocation: Palo ...,[<1-hop>\n\nName: Poornima Vijayashanker Locat...,Poornima Vijayashanker's experience in cross-f...,Poornima Vijayashanker's role at Apple involve...,multi_hop_abstract_query_synthesizer
5,How does B. Pagels-Minor's advocacy for mutual...,"[Name: B. Pagels-Minor\nLocation: Los Angeles,...",[<1-hop>\n\nName: B. Pagels-Minor Location: Lo...,B. Pagels-Minor's advocacy for mutual accounta...,B. Pagels-Minor's advocacy for mutual accounta...,multi_hop_abstract_query_synthesizer
6,How has Mark Goldenson's experience in artific...,[Name: Mark Goldenson\nLocation: Mountain View...,"[<1-hop>\n\nEducation"",""HR and Recruiting"",""He...",Mark Goldenson's extensive experience in artif...,Mark Goldenson's extensive experience in artif...,multi_hop_abstract_query_synthesizer
7,How has Greg Badros contributed to brand trans...,"[Name: Greg Badros\nLocation: Los Altos, Calif...","[<1-hop>\n\nExperience: [{""Company Name"":""Glur...",Greg Badros has made significant contributions...,Greg Badros has significantly contributed to b...,multi_hop_abstract_query_synthesizer
8,How does Greg Badros' experience and expertise...,"[Name: Greg Badros\nLocation: Los Altos, Calif...",[<1-hop>\n\nName: Greg Badros Location: Los Al...,Greg Badros' experience and expertise signific...,"Greg Badros, with his extensive experience in ...",multi_hop_specific_query_synthesizer
9,How has Jeff Pressman's experience with Operam...,"[Name: Jeff Pressman\nLocation: Los Angeles, C...",[<1-hop>\n\nName: Jeff Pressman Location: Los ...,Jeff Pressman's experience with Operam and OPE...,"Jeff Pressman's experience with Operam, where ...",multi_hop_specific_query_synthesizer


In [32]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [33]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

In [34]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

Evaluating:  24%|██▎       | 17/72 [05:21<26:20, 28.74s/it]Exception raised in Job[0]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[13]: TimeoutError()
Evaluating:  25%|██▌       | 18/72 [06:00<28:38, 31.82s/it]Exception raised in Job[1]: TimeoutError()
Exception raised in Job[16]: TimeoutError()
Evaluating:  32%|███▏      | 23/72 [06:02<08:34, 10.51s/it]Exception raised in Job[18]: TimeoutError()
Evaluating:  33%|███▎      | 24/72 [06:03<07:07,  8.91s/it]Exception raised in Job[19]: TimeoutError()
Evaluating:  38%|███▊      | 27/72 [06:11<04:30,  6.00s/it]Exception raised in Job[22]: TimeoutError()
Evaluating:  40%|████      | 29/72 [06:41<06:42,  9.36s/it]Exception raised in Job[23]: TimeoutError()
Evaluating:  42%|████▏     | 30/72 [06:47<05:56,  8.50s/it]Exception raised in Job[24]: TimeoutError()
Evaluating:  44%|████▍     | 32/72 [06:57<04:43,  7.09s/it]Exception raised in Job[25]: TimeoutError()
Eva

{'context_recall': 0.3333, 'faithfulness': 0.5122, 'factual_correctness': 0.4292, 'answer_relevancy': 0.7233, 'context_entity_recall': 0.4103, 'noise_sensitivity_relevant': 0.1962}

## Results of RAGAS Eval Prior to Fine-tuning:
{'context_recall': 0.3333, 'faithfulness': 0.5122, 'factual_correctness': 0.4292, 'answer_relevancy': 0.7233, 'context_entity_recall': 0.4103, 'noise_sensitivity_relevant': 0.1962}



In [38]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)
training_documents = text_splitter.split_documents(documents)

In [39]:
id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [40]:
len(training_documents)

513

In [46]:
# For a dataset of 513 documents
# Maintaining approximately the same proportions (76.5% / 11.8% / 11.8%)

# Calculate the number of documents for validation and test sets
val_size = int(513 * 0.118)  # ~60 documents
test_size = int(513 * 0.118)  # ~60 documents
train_size = 513 - val_size - test_size  # ~393 documents

# Create the splits
training_split_documents = training_documents[:train_size]
val_split_documents = training_documents[train_size:train_size + val_size]
test_split_documents = training_documents[train_size + val_size:]

# Print the sizes to verify
print(f"Training set size: {len(training_split_documents)}")
print(f"Validation set size: {len(val_split_documents)}")
print(f"Test set size: {len(test_split_documents)}")
print(f"Total: {len(training_split_documents) + len(val_split_documents) + len(test_split_documents)}")

Training set size: 393
Validation set size: 60
Test set size: 60
Total: 513


In [48]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [49]:
import nest_asyncio

nest_asyncio.apply()

In [50]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [51]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [52]:
import asyncio
import uuid
from tqdm import tqdm

async def process_document(document, n_questions):
    questions_generated = await question_generation_chain.ainvoke({"context": document.page_content, "n_questions": n_questions})
    
    doc_questions = {}
    doc_relevant_docs = {}
    
    for question in questions_generated.content.split("\n"):
        question_id = str(uuid.uuid4())
        doc_questions[question_id] = "".join(question.split(".")[1:]).strip()
        doc_relevant_docs[question_id] = [document.metadata["id"]]
    
    return doc_questions, doc_relevant_docs

async def create_questions(documents, n_questions):
    tasks = [process_document(doc, n_questions) for doc in documents]
    
    questions = {}
    relevant_docs = {}
    
    for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc="Processing documents"):
        doc_questions, doc_relevant_docs = await task
        questions.update(doc_questions)
        relevant_docs.update(doc_relevant_docs)
    return questions, relevant_docs

In [53]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing documents: 100%|██████████| 393/393 [00:13<00:00, 29.88it/s] 


In [54]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing documents: 100%|██████████| 60/60 [00:06<00:00,  9.04it/s]


In [55]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 100%|██████████| 60/60 [00:03<00:00, 17.98it/s]


In [56]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [57]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [58]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [59]:
!pip install -qU sentence_transformers datasets pyarrow

6998.55s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [60]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [61]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [62]:
BATCH_SIZE = 10

In [63]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [64]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [65]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [66]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [67]:
EPOCHS = 10

In [69]:
!pip install wandb
import wandb
wandb.init(mode="disabled")

7323.92s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting wandb
  Obtaining dependency information for wandb from https://files.pythonhosted.org/packages/7d/53/361846bf44dcf3bc5a4be0e0cb662b42c7e6996d71c903c936e191657e0d/wandb-0.19.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading wandb-0.19.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Obtaining dependency information for docker-pycreds>=0.4.0 from https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Obtaining dependency information for gitpython!=3.1.29,>=1.0.0 from https://files.pythonhosted.org/packages/1d/9a/4114a9057db2f1462d5c8f8390ab7383925fe1ac012eaa42402ad65c2963/GitPython-3.1.44-py3-none-any.whl.metadata
  Downloading GitPython-3.1.44-py3

In [77]:
!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml 

7863.63s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [75]:
!pip install accelerate>=0.26.0

7805.20s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [84]:
!pip install --upgrade sentence-transformers

8136.05s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [87]:
!pip install -qU transformers[torch]>=4.48.3

235141.37s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [89]:
!pip install -qU accelerate>=1.3.0

235202.65s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [90]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
    )

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [91]:
!pip show transformers accelerate

235341.52s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Name: transformers
Version: 4.49.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /home/ngiometti/aie5/code/aie5/Midterm/.venv/lib/python3.11/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers
---
Name: accelerate
Version: 1.4.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /home/ngiometti/aie5/code/aie5/Midterm/.venv/lib/python3.11/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [92]:
!pip show torch

235483.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Name: torch
Version: 2.6.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /home/ngiometti/aie5/code/aie5/Midterm/.venv/lib/python3.11/site-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-cusparselt-cu12, nvidia-nccl-cu12, nvidia-nvjitlink-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, sentence-transformers
