# HiBlu: RAG - Vector To Database

## Import Library

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
from langchain.vectorstores import MongoDBAtlasVectorSearch
import os

## The Process

The text extracted from the PDF file is split into chunks of at most `1000` characters each. Each chunk is then embedded using the OpenAI embeddings API, and the resulting embedding vectors are stored in a `MongoDB` database.

In [None]:
# Load the PDF from the working directory; loader.load() returns the parsed
# documents that later cells split into chunks and embed.
loader = PyPDFLoader("answers.pdf")
data = loader.load()
# Bare expression so the notebook renders the loaded documents as cell output.
data

In [None]:
# Show the data type of the loaded object.
type(data)

In [None]:
# Print the text content of the single entry at index 31 of the loaded data
# (zero-based; presumably one entry per PDF page - confirm against the loader).
print(data[31].page_content)

In [None]:
# Text splitter.
# Splits on progressively finer boundaries: paragraphs, lines, sentence ends,
# then single spaces. Chunk size is measured in characters (length_function=len).
# The sentence separator is a regex lookbehind, so is_separator_regex=True is
# required; without it the pattern is escaped and matched as literal text, which
# never occurs in normal prose. The raw string avoids the invalid "\." escape.
text_spliter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " "],
    is_separator_regex=True,
    length_function=len,
)

In [None]:
# Split the loaded documents into chunks using the configured text splitter.
text_chunk = text_spliter.split_documents(data)

In [None]:
# Print the text of a single chunk (index 101) as a spot check of the split.
print(text_chunk[101].page_content) 

In [None]:
# Preview five consecutive chunks (slice 100:105) together with the character
# count of each; the printed counter restarts at 0 for the slice.
divider = '-'*50
for position, piece in enumerate(text_chunk[100:105]):
    body = piece.page_content
    print(f'no {position} chunk : \n{body}')
    print(f'\ncharacter length in chunk {len(body)} ')
    print(divider)

In [None]:
# Pull variables from the local .env file into the process environment, then
# read the OpenAI API key from it (None when the variable is not set).
load_dotenv()
KEY = os.environ.get("OPEN_AI_MONGO")

In [None]:
# Create the OpenAI embedding client, authenticated with the key read from the environment.
embedding = OpenAIEmbeddings(openai_api_key=KEY)

In [None]:
# Embed a sample sentence and show the length of the returned vector.
len(embedding.embed_query('my name is danu'))

In [None]:
# Embed a second sample sentence and preview the first 10 values of its vector.
test_embed = embedding.embed_query('saya adalah danu')
# Bare expression so the notebook renders the slice as cell output.
test_embed[:10]

In [None]:
# MongoDB Python client.
# The Atlas connection string contains a username and password, so it is read
# from the environment (for example the .env file loaded by load_dotenv)
# instead of being committed in the notebook.
# NOTE(review): credentials that were previously hard-coded here should be
# rotated, since they have been exposed in source.
MONGODB_URI = os.getenv("MONGODB_URI")
if not MONGODB_URI:
    raise RuntimeError(
        "MONGODB_URI is not set; add the MongoDB Atlas connection string "
        "to the environment or the .env file."
    )
client = MongoClient(MONGODB_URI)

In [None]:
# Handle to the 'Maverick_DB' collection inside the 'Maverick' database.
collection = client.get_database('Maverick').get_collection('Maverick_DB')

In [None]:
# Embed every chunk and store it in the Atlas collection; the returned vector
# store is configured to use the "vector_index" search index.
docsearch = MongoDBAtlasVectorSearch.from_documents(
    documents=text_chunk,
    embedding=embedding,
    collection=collection,
    index_name="vector_index",
)