In [1]:
%pip install dspy-ai
%pip install langchain-experimental

Collecting joblib~=1.3.2 (from dspy-ai)
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pydantic~=2.0 (from dspy-ai)
  Using cached pydantic-2.7.4-py3-none-any.whl.metadata (109 kB)
Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Using cached pydantic-2.7.4-py3-none-any.whl (409 kB)
Installing collected packages: joblib, pydantic
  Attempting uninstall: joblib
    Found existing installation: joblib 1.4.2
    Uninstalling joblib-1.4.2:
      Successfully uninstalled joblib-1.4.2
  Attempting uninstall: pydantic
    Found existing installation: pydantic 1.10.17
    Uninstalling pydantic-1.10.17:
      Successfully uninstalled pydantic-1.10.17
Successfully installed joblib-1.3.2 pydantic-2.7.4
Note: you may need to restart the kernel to use updated packages.


In [10]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
from langchain_community.document_loaders import WikipediaLoader

query = "Elon Musk"
raw_documents = WikipediaLoader(query=query).load()



  lis = BeautifulSoup(html).find_all('li')


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Split
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512, chunk_overlap=100
)
all_splits = text_splitter.split_documents(raw_documents)

In [5]:
from langchain_openai import OpenAIEmbeddings
# import chromadb.utils.embedding_functions as embedding_functions

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

CHROMA_COLLECTION_NAME = "dspy-rag-chroma"
CHROMADB_DIR = "dspy_rag_chroma/"

# Index
vectorstore = Chroma.from_documents(
    documents=all_splits,
    collection_name=CHROMA_COLLECTION_NAME,
    embedding=embeddings,
    persist_directory=CHROMADB_DIR,
)
retriever = vectorstore.as_retriever()

In [6]:
import os

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dspy.retrieve.chromadb_rm import ChromadbRM

embedding_function = OpenAIEmbeddingFunction(
    api_key=os.environ.get("OPENAI_API_KEY"), model_name="text-embedding-ada-002"
)

rm = ChromadbRM(CHROMA_COLLECTION_NAME, CHROMADB_DIR, embedding_function, k=3)

In [8]:
# connect to our neo4j database
from langchain_community.graphs import Neo4jGraph

NEO4J_URI = os.getenv("LOCAL_NEO4J_URI")
NEO4J_USERNAME = os.getenv("LOCAL_NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("LOCAL_NEO4J_PASSWORD")

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

In [11]:
# DiffbotGraphTransformer calls Diffbot Natural Language API to extract entities and relationships in the article
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
import os

DIFFBOT_API_KEY = os.getenv("DIFFBOT_API_KEY")

diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=DIFFBOT_API_KEY)

# Diffbot's Natural Language API converts unstructured text data into knowlegde graphs
graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)

# add knowledge graph data to the neo4j database
graph.add_graph_documents(graph_documents)

### Vanilla DSPy RAG Pipeline

In [12]:
import dspy

class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

class vanilla_dspy_rag(dspy.Module):

    # we set num_passages=1 to avoid the same passage being repeatedly retrieved for multiple times
    def __init__(self, num_passages=1):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)
    
vanilla_dspy_rag_lm = dspy.OpenAI(model="gpt-3.5-turbo-instruct")
dspy.settings.configure(lm=vanilla_dspy_rag_lm, rm=rm)

### Question 1 : What industry or industries is Neuralink in?

A. Vanilla DSPy RAG

In [16]:
question_neuralink_industry = 'What industry or industries is Neuralink in?'
vanilla_dspy_rag = vanilla_dspy_rag()
vanilla_dspy_rag_q1 = vanilla_dspy_rag.forward(question_neuralink_industry)
vanilla_dspy_rag_q1

Prediction(
    context=["In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and The Boring Company, a tunnel construction company. In 2018, the U.S. Securities and Exchange Commission (SEC) sued Musk, alleging that he had falsely announced that he had secured funding for a private takeover of Tesla. To settle the case, Musk stepped down as the chairman of Tesla and paid a $20 million fine. In 2022, he acquired Twitter for $44 billion. He subsequently 

In [17]:
vanilla_dspy_rag_lm.inspect_history()




Answer questions with short factoid answers.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: often between 1 and 5 words

---

Context:
«In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and The Boring Company, a tunnel construction company. In 2018, the U.S. Securities and Exchange Commission (SEC) sued Musk, alleg

"\n\n\nAnswer questions with short factoid answers.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: often between 1 and 5 words\n\n---\n\nContext:\n«In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and The Boring Company, a tunnel construction company. In 2018, the U.S. Securities and Exchange Commission (

### initial_context_from_knowledge_graph : fetch structured and grounded facts in knowlegde graph

In [18]:
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI

# function to query neo4j graph database
initial_context_from_knowledge_graph = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-3.5-turbo-instruct", temperature=0), graph=graph, verbose=True
)

### Complete Pipeline of DSPy RAG with Knowledge Graph

In [19]:
# Set up the LM
lm = dspy.OpenAI(model='gpt-3.5-turbo-instruct')
dspy.settings.configure(lm=lm, rm=rm)

class EnrichQueryWithKG(dspy.Signature):
    """Refines and enriches questions using knowledge graph context."""
    question = dspy.InputField()
    kg_context = dspy.InputField(type=str, desc="Knowledge graph metadata") #changed from dict to str
    enriched_query = dspy.OutputField(desc="Query enriched with KG context")

class EnhanceContextWithKG(dspy.Signature):
    """Consolidates and summarizes information from enriched queries and passages."""
    query = dspy.InputField()
    passages = dspy.InputField(desc="Passages retrieved based on the enriched query")
    enhanced_context = dspy.OutputField(desc="Context enhanced with KG data")

class KGEnrichedQueryModule(dspy.Module):
    def __init__(self, temperature: int = 0):
        super().__init__()
        self.temperature = temperature
        self.enrich_query_module = dspy.ChainOfThought(EnrichQueryWithKG)
        self.enhance_context_module = dspy.ChainOfThought(EnhanceContextWithKG)
        self.retrieve = rm

    def forward(self, question: str, kg_context: str) -> str: #change from dict to str
        enriched_query = self.enrich_query_module(question=question, kg_context=kg_context).enriched_query
        passages = self.retrieve(enriched_query)
        final_answer = self.enhance_context_module(query=enriched_query, passages=passages).enhanced_context
        return final_answer

### Fact-check with Our Knowledge Graph

In [24]:
# Reminder that our question is: question_neuralink_industry = 'What industry or industries is Neuralink in?'
# kg_context = initial_context_from_knowledge_graph.run(question_neuralink_industry)

### DSPy RAG with Knowledge Graphs

In [20]:
dspy_w_kg = KGEnrichedQueryModule(temperature=0)

In [25]:
kg_context = "Neuralink is in the computer interfaces and neurotechnology industries."

In [30]:
answer = dspy_w_kg.forward(question=question_neuralink_industry, kg_context = kg_context)
print(answer)

Elon Musk is a businessman and investor known for his key roles in space company SpaceX and automotive company Tesla, Inc. Other involvements include ownership of X Corp., formerly Twitter, and his role in the founding of The Boring Company, xAI, Neuralink and OpenAI. He is one of the wealthiest people in the world; as of June 2024, Forbes


In [28]:
lm.inspect_history()




Consolidates and summarizes information from enriched queries and passages.

---

Follow the following format.

Query: ${query}

Passages: Passages retrieved based on the enriched query

Reasoning: Let's think step by step in order to ${produce the enhanced_context}. We ...

Enhanced Context: Context enhanced with KG data

---

Query: What industry or industries is Neuralink in that are related to computer interfaces and neurotechnology?

Passages:
[1] «{'id': 'b679fa5d-932c-429b-a374-c4cf7215a964', 'score': 0.4613931435920016, 'long_text': "In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artif

'\n\n\nConsolidates and summarizes information from enriched queries and passages.\n\n---\n\nFollow the following format.\n\nQuery: ${query}\n\nPassages: Passages retrieved based on the enriched query\n\nReasoning: Let\'s think step by step in order to ${produce the enhanced_context}. We ...\n\nEnhanced Context: Context enhanced with KG data\n\n---\n\nQuery: What industry or industries is Neuralink in that are related to computer interfaces and neurotechnology?\n\nPassages:\n[1] «{\'id\': \'b679fa5d-932c-429b-a374-c4cf7215a964\', \'score\': 0.4613931435920016, \'long_text\': "In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company\'s chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-

### Question 2 : List out the companies that Elon Musk co-founded with other founders.

In [32]:
question_org_elon_cofounded = "List out the companies that Elon Musk co-founded with other founders."
# vanilla_dspy_rag = vanilla_dspy_rag()
vanilla_dspy_rag_q2 = vanilla_dspy_rag.forward(question=question_org_elon_cofounded)
vanilla_dspy_rag_q2

Prediction(
    context=["In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and The Boring Company, a tunnel construction company. In 2018, the U.S. Securities and Exchange Commission (SEC) sued Musk, alleging that he had falsely announced that he had secured funding for a private takeover of Tesla. To settle the case, Musk stepped down as the chairman of Tesla and paid a $20 million fine. In 2022, he acquired Twitter for $44 billion. He subsequently 

In [33]:
vanilla_dspy_rag_lm.inspect_history(15)




Answer questions with short factoid answers.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: often between 1 and 5 words

---

Context:
«In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and The Boring Company, a tunnel construction company. In 2018, the U.S. Securities and Exchange Commission (SEC) sued Musk, alleg

"\n\n\nAnswer questions with short factoid answers.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let's think step by step in order to ${produce the answer}. We ...\n\nAnswer: often between 1 and 5 words\n\n---\n\nContext:\n«In 2004, Musk was an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and The Boring Company, a tunnel construction company. In 2018, the U.S. Securities and Exchange Commission (

In [38]:
# turn our dictionary-type kg data into string-type to fit in the DSPy pipeline
kg_context = "These are the companies that Elon has co-founded with other people:" + str(kg_data['intermediate_steps'][1]['context'])
kg_context

NameError: name 'kg_data' is not defined