In [24]:
# Loading the data
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader('data', glob="*.txt")
docs = loader.load()

len(docs)
docs

[Document(page_content="A bunch of information about Paolo Ligsay from his website: https://paolomartinligsay.netlify.app/\n\nHello, I am Paolo Ligsay, a Front-end Engineer and I specialize in crafting code for visually stunning digital experiences, using my expertise in front end development.\n\nHi! I'm Paolo Ligsay 👋 a passionate and dedicated Front End developer with around 4 years of experience in front end development 👨\u200d💻 I have had the opportunity to work with various technologies and frameworks, allowing me to adapt quickly ⚡️. Recently, I have learned to give importance to accessibility (a11y) which I consider now as a core principle that guides my work. I strongly believe that digital experiences should be inclusive and accessible to all users, regardless of their abilities. I priorities implementing accessible design principles and following WCAG guidelines to ensure that my projects are usable and enjoyable for everyone. 🤝🧑\u200d🦯\n\nExperience five: Software Engineer @

In [25]:
# Transforming the data
from langchain_text_splitters import RecursiveCharacterTextSplitter

# each docs will have 1000 characters: chunksize
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 250, chunk_overlap = 100)
chunk_docs = text_splitter.split_documents(docs)

len(chunk_docs)

45

In [26]:
# embedding the chunks of data above
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

embeddings = OpenAIEmbeddings(api_key=openai_api_key)

In [79]:
# Index Creation
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)
index_name = "pinecone-chat"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        # Notes[PAO] Dimension and metric is suggested in Pinecone > indexes > setup by model
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [38]:
# Create embeddings
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore.from_documents(chunk_docs, embeddings, index_name="chatbot")
# vector_store = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name="chatbot")
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x158fbd350>

In [37]:
query = "What are the tech stack of Project Next?"
retriever = vector_store.as_retriever(search_kwargs = { "k":3 })
retriever.get_relevant_documents(query)

[Document(page_content='Featured Project: Three: Project: Next, A comprehensive demonstration of a typical project workflow and the essential dependencies commonly employed in web development. This Project focuses more on the articles, documentation, and project set up.', metadata={'source': 'data/info.txt'}),
 Document(page_content='Featured Project: Three: Project: Next, A comprehensive demonstration of a typical project workflow and the essential dependencies commonly employed in web development. This Project focuses more on the articles, documentation, and project set up.', metadata={'source': 'data/info.txt'}),
 Document(page_content='in web development. This Project focuses more on the articles, documentation, and project set up. Please see its Github Repo for more information Using the following techt stack: NextJS, Vercel, TypeScript, Storybook, Tailwind Featured Project:', metadata={'source': 'data/info.txt'})]