### Importando as bibliotecas 

In [93]:
import os
from dotenv import load_dotenv, find_dotenv

from langchain_cohere import CohereEmbeddings #faz o embedd
from langchain_community.document_loaders import TextLoader # permite carregar arquivos de texto
from langchain_text_splitters import CharacterTextSplitter #etapa de fatiamento dos textos em partes menores

from langchain_pinecone import PineconeVectorStore #permite armazenar os vetores no pinecone
from langchain.memory import ConversationBufferMemory #buffer para uma memoria

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

from langchain_groq import ChatGroq
from langchain import hub





### Carregar as variáveis de ambiente

In [94]:
load_dotenv (find_dotenv())

COHERE_API_KEY = os.getenv("COHERE_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")


### Instanciar o banco de dados do PINECONE

OBS: primeito devemos criar o index no pinecone cloud

In [95]:
index_name = "rag-demo"
embeddings = CohereEmbeddings (

    model="embed-english-v3.0",
    cohere_api_key=COHERE_API_KEY
)

### Criando a memoria

In [96]:
memory = ConversationBufferMemory (
    memory_key= "chat_history",
    return_messages=True
)

###  Carregamento/Ingestão do documento

In [97]:
print("🔥Carregando os documentos...\n\n")

PATH_FILE = "C:\\Users\\Maria Raquel\\RAG_LLM_INTRODUCTION\\data\\mediumblog1.txt"
loader = TextLoader(PATH_FILE)
document = loader.load()

🔥Carregando os documentos...




In [98]:
document

[Document(metadata={'source': 'C:\\Users\\Maria Raquel\\RAG_LLM_INTRODUCTION\\data\\mediumblog1.txt'}, page_content='Title: Vector Database: What is it and Why You Should Know It?\n\nAuthor: Ejiro Onose\nDate: December 22, 2023\n\nIf 2021 was the year of graph databases, 2023 is the year of vector databases â€” Chip Huen.\n\nGenerative AI and Large Language Models (LLMs) have become popular, and a vector database is one of the best tools to handle LLM data. Vector databases provide the ideal infrastructure for managing the complex, high-dimensional data that LLMs produce and rely upon.\n\nIn this article, Iâ€™ll explain what vector databases are, how they work, and introduce some top vector database tools.\n\n What is a Vector?\nIn machine learning (ML), a vector is a collection of numerical values that represents the features of multi-dimensional objects, such as words or images. For example, a vector representing an image might contain values related to pixel intensities and color ch

### Fatiamento/ Splinting dos documentos

In [99]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(document)
print(f"Total de partes criadas: {len(docs)}")


Total de partes criadas: 5


### INSTANCIAR BANCO DE DADOS

In [100]:
vectorstore_from_docs = PineconeVectorStore.from_documents(
    docs, 
    index_name=index_name, 
    embedding=embeddings
)

In [101]:
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings
)

In [102]:
query = "what is Vector Store?"
vectorstore.similarity_search(query)

[Document(id='b1558104-bb40-48a9-8675-22e2e0d6a124', metadata={'source': 'C:\\Users\\Maria Raquel\\RAG_LLM_INTRODUCTION\\data\\mediumblog1.txt'}, page_content='Title: Vector Database: What is it and Why You Should Know It?\n\nAuthor: Ejiro Onose\nDate: December 22, 2023\n\nIf 2021 was the year of graph databases, 2023 is the year of vector databases â€” Chip Huen.\n\nGenerative AI and Large Language Models (LLMs) have become popular, and a vector database is one of the best tools to handle LLM data. Vector databases provide the ideal infrastructure for managing the complex, high-dimensional data that LLMs produce and rely upon.\n\nIn this article, Iâ€™ll explain what vector databases are, how they work, and introduce some top vector database tools.\n\n What is a Vector?\nIn machine learning (ML), a vector is a collection of numerical values that represents the features of multi-dimensional objects, such as words or images. For example, a vector representing an image might contain value

### Criar a LLM

In [103]:
llm = ChatGroq(
    model="Gemma2-9b-It", #modelo de llm utilizado
    groq_api_key=GROQ_API_KEY, #chave de api do groq
    temperature=0.1 #temperatura do llm
)

### Carregar um prompt "padrão" via langchain hub  e criar o retriever

In [104]:
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
combine_docs_chain   = create_stuff_documents_chain(
    llm, retrieval_qa_chat_prompt
)



### criar a chain com base no prompt e retriever

In [105]:
qa = create_retrieval_chain(
    vectorstore.as_retriever(), 
    combine_docs_chain
    )

In [106]:
res = qa.invoke({"input": "What is vector store in 3 sentences?"})
print(res['answer'])

A vector store is a specialized database designed to efficiently store and retrieve high-dimensional vectors. 

These vectors represent data points like words, images, or audio, capturing their semantic meaning or characteristics. 

Vector stores are crucial for applications powered by generative AI and large language models, enabling fast and accurate similarity searches. 



