In [None]:
import os
import nltk
import pinecone
import openai
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.docstore.document import Document
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.vectorstores.pinecone import Pinecone
from langchain.chains import RetrievalQA
from langchain import OpenAI

## Load documents

In [None]:
# Directory containing the source .txt files. The RAG_DATA_DIR environment
# variable overrides the location; when it is unset or empty the original
# studio path is used, so existing runs behave exactly as before.
docs = os.environ.get('RAG_DATA_DIR') or "/teamspace/studios/this_studio/retrieval-augmented-generation/data/"

# Load the *.txt files matched by the glob, reading each with TextLoader.
loader = DirectoryLoader(docs, glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

## Split documents

In [None]:
# Split the loaded documents into chunks of at most 500 units with an overlap
# of 20 between neighbouring chunks (units are whatever the splitter's default
# length function measures — characters, unless configured otherwise).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
split_texts = text_splitter.split_documents(documents)

# Report the chunk count and preview only the first few chunks; echoing the
# whole list floods the notebook output and bloats the saved file.
print(f"{len(split_texts)} chunks")
split_texts[:3]

## Initialize Pinecone

In [None]:
from pinecone import Pinecone, ServerlessSpec

# The client is constructed without arguments, so its credentials are not set
# here — presumably read from the PINECONE_API_KEY environment variable; confirm.
pc = Pinecone()

# Serverless placement: an environment variable wins when it is set to a
# non-empty value, otherwise fall back to aws / us-east-1.
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(region=region, cloud=cloud)

## Create or recreate index

In [None]:
import time

index_name = "test001"

# Start from a clean slate: an index with this name, if present, is dropped
# first. NOTE: this is destructive — any vectors already stored in it are lost.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create the index afresh. The dimension must match the length of the vectors
# produced by the embedding model used later — presumably 1536 for the default
# OpenAIEmbeddings model; confirm if the model is changed.
pc.create_index(
    name=index_name,
    dimension=1536,
    metric='dotproduct',
    spec=spec,
)

# Poll once per second until the service reports the index as ready.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

## Display index stats (newly created index)

In [None]:
# Obtain a handle to the index by name; ending the cell on the stats call
# renders its result as the cell output.
index = pc.Index(name=index_name)
index.describe_index_stats()

## Initialize vectorstore

In [None]:
# Imported in this cell because no earlier cell brings PineconeVectorStore
# into scope; without it a fresh-kernel run stops here with a NameError.
from langchain_pinecone import PineconeVectorStore

# Embedding model used both for indexing the chunks and for embedding queries.
embeddings = OpenAIEmbeddings()

# Vector store bound to the existing Pinecone index named by index_name.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

## Load the document chunks into the index

In [None]:
# Embed the split chunks with `embeddings` and store them in the index named
# by index_name; the returned store is kept as vectorstore_from_docs.
vectorstore_from_docs = PineconeVectorStore.from_documents(
    documents=split_texts,
    embedding=embeddings,
    index_name=index_name,
)

## Display index stats after loading

In [48]:
# Re-acquire the index handle and show its stats again, now that the document
# chunks have been loaded.
index = pc.Index(name=index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 302}},
 'total_vector_count': 302}

## Simple similarity search

In [49]:
# Free-text question to look up; the search call is the cell's last
# expression, so the matching documents are rendered as the cell output.
query = "What is minimum viable product?"
vectorstore.similarity_search(query=query)

[Document(page_content="you understand whether you're building something that's valuable and that may be even more valuable as you as you kind\nof grow into it so when we talk about bringing an MVP um to Market\nminimum viable product you've built and I'll I'll continue to do software examples just since that's my word and I apologize if you're You\nKnow You're Building you're building something else or a piece of Hardware but you build something small typically you're building a fraction of your total", metadata={'source': '/teamspace/studios/this_studio/retrieval-augmented-generation/data/part2.txt'}),
 Document(page_content="you understand whether you're building something that's valuable and that may be even more valuable as you as you kind\nof grow into it so when we talk about bringing an MVP um to Market\nminimum viable product you've built and I'll I'll continue to do software examples just since that's my word and I apologize if you're You\nKnow You're Building you're building