In [1]:
import os
import pandas as pd
from typing import List, Dict, Any
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain.schema.document import Document
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from dotenv import load_dotenv
import time
import uuid  # Import UUID module

# Load environment variables
load_dotenv()

# Configuration
COLLECTION_NAME = "founders"
VECTOR_DIM = 1536  # OpenAI embedding dimension
EMBEDDING_MODEL = "text-embedding-3-small"
BATCH_SIZE = 10  # Process profiles in smaller batches to avoid rate limits

print("Step 1: Setting up the vector database...")
# Initialize Qdrant client (in-memory)
client = QdrantClient(":memory:")

# Create collection
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
)
print("✅ Vector database initialized")

print("\nStep 2: Loading profiles from CSV...")
# Update this path to your CSV file location
csv_path = "specter-people-db--export_small.csv"  # Update this to your CSV file path
df = pd.read_csv(csv_path)
profiles = df.to_dict('records')
print(f"✅ Loaded {len(profiles)} profiles from CSV")

print("\nStep 3: Creating LangChain documents for embedding...")
# Initialize the embeddings model
embeddings_model = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Create LangChain documents with UUID4 IDs
documents = []
document_ids = []  # Store UUIDs separately

for p in profiles:
    # Generate a UUID4 for this document
    doc_id = str(uuid.uuid4())
    
    # Build a text representation including key fields
    text_parts = []
    
    # Add core identity information
    if p.get('Full Name'): text_parts.append(f"Name: {p.get('Full Name')}")
    if p.get('Current Position'): text_parts.append(f"Position: {p.get('Current Position Title')}")
    if p.get('Company'): text_parts.append(f"Company: {p.get('Current Position Company Name')}")
    if p.get('Location'): text_parts.append(f"Location: {p.get('Location')}")
    
    # Add contact and social media information
    if p.get('LinkedIn'): text_parts.append(f"LinkedIn: {p.get('LinkedIn - URL')}")
    if p.get('Twitter'): text_parts.append(f"Twitter: {p.get('Twitter - URL')}")
    if p.get('Website'): text_parts.append(f"Website: {p.get('Website - URL')}")
    if p.get('Email'): text_parts.append(f"Email: {p.get('Email')}")
    
    # Add detailed professional information
    if p.get('About'): text_parts.append(f"About: {p.get('About')}")
    if p.get('Skills'): text_parts.append(f"Skills: {p.get('Skills')}")
    if p.get('Experience'): text_parts.append(f"Experience: {p.get('Experience')}")
    if p.get('Education'): text_parts.append(f"Education: {p.get('Education')}")
    
    # Add any industry or sector information
    if p.get('Industry'): text_parts.append(f"Industry: {p.get('Industry')}")
    if p.get('Sector'): text_parts.append(f"Sector: {p.get('Sector')}")
    
    # Add any entrepreneurial information
    if p.get('Previous Startups'): text_parts.append(f"Previous Startups: {p.get('Previous Startups')}")
    if p.get('Funding History'): text_parts.append(f"Funding History: {p.get('Funding History')}")
    
    # Add any additional fields that might be in the CSV
    for key, value in p.items():
        if (key not in ['Full Name', 'Current Position', 'Company', 'Location', 
                       'LinkedIn', 'Twitter', 'Website', 'Email',
                       'About', 'Skills', 'Experience', 'Education', 
                       'Industry', 'Sector', 'Previous Startups', 'Funding History'] 
            and value and str(value).lower() != 'nan'):
            text_parts.append(f"{key}: {value}")
    
    # Join all parts with newlines for better separation
    text = "\n".join(text_parts)
    
    # Add the UUID to the metadata
    p['doc_id'] = doc_id
    
    # Create a LangChain Document with the text and metadata
    document = Document(
        page_content=text,
        metadata=p  # Store the original profile as metadata (now includes doc_id)
    )
    documents.append(document)
    document_ids.append(doc_id)

# Print a sample document with its UUID
if documents:
    print("\nSample document for embedding:")
    print("-" * 50)
    sample_doc = documents[0]
    sample_id = document_ids[0]
    print(f"Document ID (UUID4): {sample_id}")
    if len(sample_doc.page_content) > 500:
        print(sample_doc.page_content[:500] + "...")
    else:
        print(sample_doc.page_content)
    print("-" * 50)

print(f"✅ Created {len(documents)} documents with UUID4 identifiers")

print("\nStep 4: Creating LangChain vector store and retriever...")
# First, create the vector store with the existing client
vector_store = Qdrant(
    client=client,  # Use the existing client
    collection_name=COLLECTION_NAME,
    embeddings=embeddings_model
)

# Process in batches to avoid rate limits
total_documents = len(documents)
num_batches = (total_documents + BATCH_SIZE - 1) // BATCH_SIZE

# Process documents in batches
for batch_idx in range(num_batches):
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, total_documents)
    
    print(f"Processing batch {batch_idx+1}/{num_batches} (documents {start_idx+1}-{end_idx})...")
    
    batch_docs = documents[start_idx:end_idx]
    batch_ids = document_ids[start_idx:end_idx]
    
    try:
        # Extract texts and metadatas from documents
        texts = [doc.page_content for doc in batch_docs]
        metadatas = [doc.metadata for doc in batch_docs]
        
        # Add texts to the vector store with their UUIDs
        vector_store.add_texts(
            texts=texts,
            metadatas=metadatas,
            ids=batch_ids  # Use UUID4 strings as IDs
        )
        
        # Sleep to avoid rate limits
        if batch_idx < num_batches - 1:
            print("Waiting 1 second before next batch...")
            time.sleep(1)  # 1 second delay between batches
        
    except Exception as e:
        print(f"Error processing batch {batch_idx+1}/{num_batches}: {str(e)}")
        # Continue with next batch

print(f"✅ Finished processing {total_documents} documents with UUID4 identifiers")

# Create a retriever from the vector store
retriever = vector_store.as_retriever(
    search_type="similarity",  # Options: "similarity", "mmr"
    search_kwargs={"k": 3}     # Return top 3 results
)

print("✅ Created LangChain retriever")

print("\n" + "="*50)

Step 1: Setting up the vector database...
✅ Vector database initialized

Step 2: Loading profiles from CSV...
✅ Loaded 20 profiles from CSV

Step 3: Creating LangChain documents for embedding...

Sample document for embedding:
--------------------------------------------------
Document ID (UUID4): 691a9329-8d0e-4eaa-839d-58bf312baca4
Name: Keith Teare
Location: Palo Alto, California, United States, United States
About: I am a founder and CEO at SignalRank, a technology company that uses data intelligence to form a partner network with top-performing managers. We have built a financial instrument that captures top decile value creation that is exclusive to the best early stage companies. I have over 40 years of experience in digital technology, as a founder, CTO, or CEO, in various domains, such as internet services, keywords...
--------------------------------------------------
✅ Created 20 documents with UUID4 identifiers

Step 4: Creating LangChain vector store and retriever...
Proce

  client.recreate_collection(
  vector_store = Qdrant(


Waiting 1 second before next batch...
Processing batch 2/2 (documents 11-20)...
✅ Finished processing 20 documents with UUID4 identifiers
✅ Created LangChain retriever



In [2]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

You are a helpful assistant. Use the available context to answer the question. If you can't answer the question, say you don't know.
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [3]:
from langchain_openai import ChatOpenAI

openai_chat_model = ChatOpenAI(model="gpt-4o-mini")

In [5]:
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser

rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | rag_prompt | openai_chat_model | StrOutputParser()
)

In [6]:
rag_chain.invoke({"question" : "can you help me find founders who would be a good fit for a startup that is building a platform for AI agents to collaborate on tasks?"})

"Based on the profiles available, here are two founders who may be a good fit for a startup that is building a platform for AI agents to collaborate on tasks:\n\n1. **Mark Goldenson**\n   - **Current Position**: Investor & Advisor at Regrello\n   - **Experience**: Mark has extensive experience in building products and helping founders, particularly in AI. His background includes roles at Google where he led product management and built AI-powered solutions.\n   - **About**: He focuses on technology and entrepreneurship, making him a valuable asset in the development of AI products.\n   - **LinkedIn**: [Mark Goldenson](https://www.linkedin.com/in/goldenson)\n\n2. **Alexey Skutin**\n   - **Current Position**: Founder at AideAI\n   - **Experience**: With a Ph.D. and over 20 years in the IT industry, Alexey specializes in enterprise, security, and SaaS/PaaS software. His work at AideAI involves creating AI solutions, which aligns closely with the startup's mission of AI agent collaboration

In [8]:
from getpass import getpass
os.environ["RAGAS_APP_TOKEN"] = getpass("Please enter your Ragas API key!")

In [9]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

  for match in re.finditer('{0}\s*'.format(re.escape(sent)), self.original_text):
  txt = re.sub('(?<={0})\.'.format(am), '∯', txt)
  txt = re.sub('(?<={0})\.'.format(am), '∯', txt)


In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)
training_documents = text_splitter.split_documents(documents)

In [11]:
id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [12]:
len(training_documents)

513

In [13]:
# For a dataset of 513 documents
# Maintaining approximately the same proportions (76.5% / 11.8% / 11.8%)

# Calculate the number of documents for validation and test sets
val_size = int(513 * 0.118)  # ~60 documents
test_size = int(513 * 0.118)  # ~60 documents
train_size = 513 - val_size - test_size  # ~393 documents

# Create the splits
training_split_documents = training_documents[:train_size]
val_split_documents = training_documents[train_size:train_size + val_size]
test_split_documents = training_documents[train_size + val_size:]

# Print the sizes to verify
print(f"Training set size: {len(training_split_documents)}")
print(f"Validation set size: {len(val_split_documents)}")
print(f"Test set size: {len(test_split_documents)}")
print(f"Total: {len(training_split_documents) + len(val_split_documents) + len(test_split_documents)}")

Training set size: 393
Validation set size: 60
Test set size: 60
Total: 513


In [14]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [15]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [16]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [17]:
import asyncio
from tqdm import tqdm

async def process_document(document, n_questions):
    questions_generated = await question_generation_chain.ainvoke({"context": document.page_content, "n_questions": n_questions})
    
    doc_questions = {}
    doc_relevant_docs = {}
    
    for question in questions_generated.content.split("\n"):
        question_id = str(uuid.uuid4())
        doc_questions[question_id] = "".join(question.split(".")[1:]).strip()
        doc_relevant_docs[question_id] = [document.metadata["id"]]
    
    return doc_questions, doc_relevant_docs

async def create_questions(documents, n_questions):
    tasks = [process_document(doc, n_questions) for doc in documents]
    
    questions = {}
    relevant_docs = {}
    
    for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc="Processing documents"):
        doc_questions, doc_relevant_docs = await task
        questions.update(doc_questions)
        relevant_docs.update(doc_relevant_docs)
    return questions, relevant_docs

In [18]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing documents: 100%|██████████| 393/393 [00:25<00:00, 15.45it/s] 


In [19]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing documents: 100%|██████████| 60/60 [00:07<00:00,  8.50it/s]


In [20]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 100%|██████████| 60/60 [00:04<00:00, 14.14it/s]


In [21]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [22]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [23]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [24]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [25]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [26]:
BATCH_SIZE = 10

In [27]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [28]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [29]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [30]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [31]:
EPOCHS = 10

In [32]:
import wandb
wandb.init(mode="disabled")

In [33]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,No log,0.783333,0.95,0.95,0.975,0.783333,0.316667,0.19,0.0975,0.783333,0.95,0.95,0.975,0.891719,0.863968,0.865364
79,No log,No log,0.808333,0.958333,0.975,0.983333,0.808333,0.319444,0.195,0.098333,0.808333,0.958333,0.975,0.983333,0.908024,0.8825,0.883472
100,No log,No log,0.85,0.958333,0.966667,0.991667,0.85,0.319444,0.193333,0.099167,0.85,0.958333,0.966667,0.991667,0.926481,0.904861,0.905075
150,No log,No log,0.808333,0.975,0.975,0.975,0.808333,0.325,0.195,0.0975,0.808333,0.975,0.975,0.975,0.909124,0.886111,0.887604
158,No log,No log,0.816667,0.975,0.975,0.975,0.816667,0.325,0.195,0.0975,0.816667,0.975,0.975,0.975,0.910017,0.8875,0.889003
200,No log,No log,0.841667,0.975,0.975,0.975,0.841667,0.325,0.195,0.0975,0.841667,0.975,0.975,0.975,0.921426,0.902778,0.904089
237,No log,No log,0.816667,0.958333,0.975,0.975,0.816667,0.319444,0.195,0.0975,0.816667,0.958333,0.975,0.975,0.911044,0.888889,0.890263
250,No log,No log,0.833333,0.958333,0.975,0.975,0.833333,0.319444,0.195,0.0975,0.833333,0.958333,0.975,0.975,0.916104,0.895833,0.897044
300,No log,No log,0.825,0.958333,0.975,0.975,0.825,0.319444,0.195,0.0975,0.825,0.958333,0.975,0.975,0.910847,0.888889,0.890002
316,No log,No log,0.833333,0.966667,0.975,0.975,0.833333,0.322222,0.195,0.0975,0.833333,0.966667,0.975,0.975,0.9145,0.89375,0.895011


In [34]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
hf_username = "ngiometti"

In [37]:
model.push_to_hub(f"{hf_username}/legal-ft-3")

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/ngiometti/legal-ft-3/commit/7b3153ca1e1cbfe487564581ed2a67b41701f3d8'

In [38]:
from langchain_huggingface import HuggingFaceEmbeddings
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic_ft")

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic_ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=finetune_embeddings)
dataset = generator.generate_with_langchain_docs(documents, testset_size=10)

Applying HeadlinesExtractor:   0%|          | 0/20 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/20 [00:00<?, ?it/s]

Applying SummaryExtractor:   0%|          | 0/21 [00:00<?, ?it/s]

Property 'summary' already exists in node '183070'. Skipping!


Applying CustomNodeFilter:   0%|          | 0/50 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/111 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '183070'. Skipping!
unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-LsdVv2bXqJ9TPrSXuJ0Tazh3 on tokens per min (TPM): Limit 30000, Used 29748, Requested 3047. Please try again in 5.59s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-LsdVv2bXqJ9TPrSXuJ0Tazh3 on tokens per min (TPM): Limit 30000, Used 29116, Requested 3775. Please try again in 5.782s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-LsdVv2bXqJ9TPrSXuJ0Tazh3 on tokens per min (TPM): 

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

In [43]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What role does the University of Kent in Ameri...,"[Experience"",""Consumer Internet"",""Digital Mark...",The individual serves as a volunteer director ...,single_hop_specifc_query_synthesizer
1,What Keith Teare do in United States?,"[Name: Keith Teare Location: Palo Alto, Califo...",Keith Teare is a founder and CEO at SignalRank...,single_hop_specifc_query_synthesizer
2,how taiwan fit into the strategy for realnames...,"[about the IP Communications space, and partic...","RealNames made deals across 12 countries, incl...",single_hop_specifc_query_synthesizer
3,what u know bout University of Kent and what k...,"[Education"",""Hardware"",""Internet Services"",""Te...",Keith Teare was presented with an honorary doc...,single_hop_specifc_query_synthesizer
4,As an International Finance and Strategy Exper...,"[Experience: [{""Company Name"":""Start-up-Chris-...",Polkadot is a blockspace ecosystem designed fo...,single_hop_specifc_query_synthesizer
5,How can an aspiring tech entrepreneur leverage...,[<1-hop>\n\nName: Anthony Kelani Location: Los...,An aspiring tech entrepreneur can learn from A...,multi_hop_specific_query_synthesizer
6,How has being featured in Forbes and other maj...,[<1-hop>\n\nWANT TO GET IN TOUCH? Shoot me a D...,Being featured in major publications such as F...,multi_hop_specific_query_synthesizer
7,What be the connection between OPEAR and Opera...,"[<1-hop>\n\nHighlights: [""fortune_500_experien...",The connection between OPEAR and Operam in ter...,multi_hop_specific_query_synthesizer
8,How did Poornima Vijayashanker and Jeff Pressm...,"[<1-hop>\n\nSkills: [""Software Development"",""P...","Poornima Vijayashanker, as the Co-Founder and ...",multi_hop_specific_query_synthesizer
9,What role did Oliver Walsh play in the growth ...,[<1-hop>\n\nName: Oliver Walsh Location: Los A...,Oliver Walsh served as the CMO and Board Direc...,multi_hop_specific_query_synthesizer


In [44]:
dataset.upload()

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/fe27a246-a89e-4779-8f69-435d3c6c2a83


'https://app.ragas.io/dashboard/alignment/testset/fe27a246-a89e-4779-8f69-435d3c6c2a83'

In [45]:
def retrieve(state):
  retrieved_docs = retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [46]:
from langchain.prompts import ChatPromptTemplate

RAG_PROMPT = """\
You are a helpful assistant who answers questions based on provided context. You must only use the provided context, and cannot use your own knowledge.

### Question
{question}

### Context
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [47]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [48]:
def generate(state):
  docs_content = "\n\n".join(doc.page_content for doc in state["context"])
  messages = rag_prompt.format_messages(question=state["question"], context=docs_content)
  response = llm.invoke(messages)
  return {"response" : response.content}

In [49]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class State(TypedDict):
  question: str
  context: List[Document]
  response: str

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [50]:
response = graph.invoke({"question" : "Who has the best profile suited to founding an AI startup?"})

In [51]:
response["response"]

'Based on the provided context, both Christopher Obereder and Mark Goldenson have strong profiles suited to founding an AI startup, but they each bring different strengths:\n\n### Christopher Obereder\n- **Experience**: Over 12 years in tech, with successful exits of four companies and a deep involvement in venture capital, notably through his firm, Start-Up-Chris Ventures.\n- **Recognition**: Featured in Forbes 30 Under 30, indicating significant achievement in a short period.\n- **Skills**: Expertise in growth hacking and fundraising, with a proven track record of raising over $360 million for startups.\n- **Network**: An extensive investment portfolio with notable companies like Coinbase, Airbnb, and more, showing strong connections in the industry.\n- **Vision**: Driven by a passion for transformative solutions, indicating a strong entrepreneurial spirit.\n\n### Mark Goldenson\n- **Experience**: 28 years in technology and startups, with a focus on artificial intelligence, health ca

In [52]:
for test_row in dataset:
  response = graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [53]:
dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,What role does the University of Kent in Ameri...,"[Name: Christian Busch\nLocation: Los Angeles,...","[Experience"",""Consumer Internet"",""Digital Mark...",The University of Kent in America plays a role...,The individual serves as a volunteer director ...,single_hop_specifc_query_synthesizer
1,What Keith Teare do in United States?,"[Name: Keith Teare\nLocation: Palo Alto, Calif...","[Name: Keith Teare Location: Palo Alto, Califo...","Keith Teare is based in Palo Alto, California,...",Keith Teare is a founder and CEO at SignalRank...,single_hop_specifc_query_synthesizer
2,how taiwan fit into the strategy for realnames...,"[Name: Alan Chiu\nLocation: Palo Alto, Califor...","[about the IP Communications space, and partic...",Taiwan fit into the strategy for RealNames dur...,"RealNames made deals across 12 countries, incl...",single_hop_specifc_query_synthesizer
3,what u know bout University of Kent and what k...,"[Name: Keith Teare\nLocation: Palo Alto, Calif...","[Education"",""Hardware"",""Internet Services"",""Te...",Keith Teare has an honorary doctorate from the...,Keith Teare was presented with an honorary doc...,single_hop_specifc_query_synthesizer
4,As an International Finance and Strategy Exper...,"[Name: Keith Teare\nLocation: Palo Alto, Calif...","[Experience: [{""Company Name"":""Start-up-Chris-...",Polkadot significantly contributes to the adva...,Polkadot is a blockspace ecosystem designed fo...,single_hop_specifc_query_synthesizer
5,How can an aspiring tech entrepreneur leverage...,[Name: Poornima Vijayashanker\nLocation: Palo ...,[<1-hop>\n\nName: Anthony Kelani Location: Los...,An aspiring tech entrepreneur can leverage the...,An aspiring tech entrepreneur can learn from A...,multi_hop_specific_query_synthesizer
6,How has being featured in Forbes and other maj...,"[Name: James Creech\nLocation: Los Angeles, Ca...",[<1-hop>\n\nWANT TO GET IN TOUCH? Shoot me a D...,Being featured in prestigious publications lik...,Being featured in major publications such as F...,multi_hop_specific_query_synthesizer
7,What be the connection between OPEAR and Opera...,[Name: Poornima Vijayashanker\nLocation: Palo ...,"[<1-hop>\n\nHighlights: [""fortune_500_experien...",The connection between OPEAR and Operam in ter...,The connection between OPEAR and Operam in ter...,multi_hop_specific_query_synthesizer
8,How did Poornima Vijayashanker and Jeff Pressm...,"[Name: Jeff Pressman\nLocation: Los Angeles, C...","[<1-hop>\n\nSkills: [""Software Development"",""P...",Poornima Vijayashanker and Jeff Pressman made ...,"Poornima Vijayashanker, as the Co-Founder and ...",multi_hop_specific_query_synthesizer
9,What role did Oliver Walsh play in the growth ...,"[Name: Oliver Walsh\nLocation: Los Angeles, Ca...",[<1-hop>\n\nName: Oliver Walsh Location: Los A...,Oliver Walsh played a crucial role in the grow...,Oliver Walsh served as the CMO and Board Direc...,multi_hop_specific_query_synthesizer


In [54]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [55]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

In [56]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[1]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[7]: TimeoutError()
Exception raised in Job[10]: TimeoutError()
Exception raised in Job[13]: TimeoutError()
Exception raised in Job[11]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[16]: TimeoutError()
Exception raised in Job[18]: TimeoutError()
Exception raised in Job[19]: TimeoutError()
Exception raised in Job[22]: TimeoutError()
Exception raised in Job[24]: TimeoutError()
Exception raised in Job[25]: TimeoutError()
Exception raised in Job[28]: TimeoutError()
Exception raised in Job[29]: TimeoutError()
Exception raised in Job[30]: TimeoutError()
Exception raised in Job[34]: TimeoutError()
Exception raised in Job[31]: TimeoutError()
Exception raised in Job[36]: TimeoutError()
Exception raised in Job[37]: TimeoutError()
Exception raised in Job[35]: TimeoutError()
Exception raised in Job[40]: Timeout

{'context_recall': 1.0000, 'faithfulness': 0.7812, 'factual_correctness': 0.4720, 'answer_relevancy': 0.9383, 'context_entity_recall': 0.4722, 'noise_sensitivity_relevant': 0.4856}