In [1]:
import os
# Turn on LangChain tracing for this kernel and point it at the hosted endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The API keys are secrets and must already be exported in the environment that
# started the kernel. Copying them back with os.environ[k] = os.getenv(k) is a
# no-op when the key is set and raises "TypeError: str expected, not NoneType"
# when it is not, so check for them explicitly and report which are absent.
_missing_keys = [
    key for key in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY') if not os.environ.get(key)
]
if _missing_keys:
    import warnings

    warnings.warn(
        "Missing environment variable(s): " + ", ".join(_missing_keys)
        + ". Export them before starting the kernel; cells that call the "
        "corresponding service will fail until they are set."
    )

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import ir_datasets

# Load the "cranfield" collection through ir_datasets and wrap every record in
# a LangChain Document: the record text becomes the page content, while its
# title and author are kept as metadata.
cranfield = ir_datasets.load("cranfield")
data = [
    Document(
        page_content=record.text,
        metadata={"title": record.title, "author": record.author},
    )
    for record in cranfield.docs_iter()
]

# Break the documents into overlapping chunks (chunk_size 1000, overlap 200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(data)

In [15]:
# SIMPLE

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

# Embed every chunk in `splits` with OpenAI embeddings and index the result in
# a Chroma vector store.
# NOTE(review): no persist directory or ids are passed, so each run re-embeds
# the whole corpus; re-running this cell in the same kernel may also add the
# same chunks to the collection again -- confirm against the installed Chroma.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

# Retriever view over the store, created with its default search settings.
retriever = vectorstore.as_retriever()

# PROMPT

# Raw template text with two placeholders, {context} and {question}; both are
# supplied by the chain at invocation time.
prompt_text = """
    You are a chatbot answering questions on The Cranfield dataset corpus. Using these retrieved documents as context: {context}

    Answer this question: {question}
"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Chat model that writes the final answer; temperature is set to 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line (``"\\n\\n"``);
        an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Map the incoming question onto both prompt variables: the retriever output is
# piped through format_docs to fill {context}, and RunnablePassthrough supplies
# the question itself for {question}.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# inputs -> prompt -> chat model -> plain string
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Run a sample question; as the last expression, the answer is shown as the
# cell output.
sample_question = "What's a gyroscopic wave system?"
rag_chain.invoke(sample_question)

'A gyroscopic wave system is a system that involves gyroscopes and waves, typically referring to the oscillations and movements of a vehicle or object in a planetary atmosphere or other similar environment.'