In [1]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader


## Load data

In [2]:
docs="/teamspace/studios/this_studio/retrieval-augmented-generation/data/"
loader = DirectoryLoader(docs, glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

## Split document into nodes

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [4]:
len(texts)

83

In [7]:
texts[3]

Document(page_content="get to the next round of funding Beyond just that product Market fit so product Market fit is just one step just one\nstep along the way so the the challenge is something we call the The Product Company Gap and I\nfigured I'd you know since I I actually started a company I figured I'd tell you about a company that actually failed to get across that Gap um and that might be\nuseful I started a company called padient um it was a mobile payments company QR code payments before it was\nhappening in China and Korea like we actually had to write we had to code our own QR Code Reader we hired a satellite\nimagery engineer to build our own QR Reader from scratch the idea was hey we\ncould use QR codes instead of credit cards to pay for stuff um we knew like we had we started\ncompanies before and we knew that it was going to be hard to build a company a direct a consumer company how are you", metadata={'source': '/teamspace/studios/this_studio/retrieval-augmented-generat

In [None]:
# Start Here - New Code

In [6]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain.chains import RetrievalQA
from langchain import OpenAI
from langchain_openai import ChatOpenAI
from cleantext import clean
import os
import nltk
import pinecone
import openai

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
split_texts = text_splitter.split_documents(texts)
split_texts

[Document(page_content="what we're going to talk today about is how to build a product that scales into\na company it's a you know it's it's super it's super common that when you start a company you\nstart with the product so for those of you starting companies did you have a bolt of lightning where I've got a\nproduct idea and that was the Genesis for starting the conversation ordered or did you spend time deep in an industry", metadata={'source': '/teamspace/studios/this_studio/retrieval-augmented-generation/data/part1.txt'}),
 Document(page_content="where you understood a problem and decided to explore different products who who started their thinking with a\nproduct as opposed to as opposed to a market by the way most\npeople do that's not just I'm not trying to I'm not trying to imply that that's not the right way to do it it's the\nright way to do it um but it doesn't always scale into a company and so what we want to talk to", metadata={'source': '/teamspace/studios/this_studio/

In [11]:
from pinecone import Pinecone

# configure client
pc = Pinecone()

In [12]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [13]:
import time
index_name = "mynotebookindex"

if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# we create a new index
pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric='dotproduct',
        spec=spec
    )

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [17]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 209}},
 'total_vector_count': 209}

In [15]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)


In [16]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
        split_texts,
        index_name=index_name,
        embedding=embeddings
    )

In [20]:
query = "What is minimum viable product?"
vectorstore.similarity_search(query)

[Document(page_content="today so the problem you've heard the term minimum viable\nproduct you've certainly heard the the term product Market fit um you know but it often doesn't get\nenough momentum to build a lasting company product Market fit so in the VC business one of the things we're seed\ninvestors at underscore and one of the markers we look for when a company is getting ready to a question number one", metadata={'source': '/teamspace/studios/this_studio/retrieval-augmented-generation/data/part1.txt'}),
 Document(page_content="of grow into it so when we talk about bringing an MVP um to Market\nminimum viable product you've built and I'll I'll continue to do software examples just since that's my word and I apologize if you're You\nKnow You're Building you're building something else or a piece of Hardware but you build something small typically you're building a fraction of your total\nvision and you and you've got an idea of the people that you want to sell it to", metadata={'

In [21]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
# completion llm
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
qa.run(query)

  warn_deprecated(


'Minimum Viable Product (MVP) is a version of a product with just enough features to satisfy early customers and provide feedback for future product development. It is a strategy used by startups to quickly and cost-effectively test their product ideas in the market before fully investing in the development of a complete product.'

In [22]:
initial_prompt = "You are a researcher who is going to search the web links, summarize them and share insights as asked"

In [23]:
query = "What is the primary goal of minimum viable product?"
result = qa({"query": query, "prompt": initial_prompt})

  warn_deprecated(


In [24]:
print(result['result'])

The primary goal of a minimum viable product (MVP) is to validate the value proposition of a product before investing significant resources into its development. It helps in testing the product idea with real users and gathering feedback to iterate and improve the product.
