In [3]:
# Vector store integration
"""
This file contains functions for interacting with the
MongoDB Atlas  vector store, which is essential for our
RAG system's retrieval capabilities.
"""

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredHTMLLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from typing import List
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv, find_dotenv
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from fastapi import HTTPException

import os
import logging
import sys
sys.path.append("/home/phucuy2025/RAG-Chatbot/api")
sys.path.append("/home/phucuy2025/RAG-Chatbot")


from api.scraper import WebScraper
# sys.path.append('/home/phucuy2025/RAG-Chatbot/api')

# force reload the .env file
load_dotenv(find_dotenv(), override=True)

# set up logging
logging.basicConfig(filename="rag_chatbot_app.log", level=logging.INFO)


local_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
text_splitters = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

gemini_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=os.getenv("GOOGLE_API_KEY")
)
# create embeddings using Gemini embeddings

# step 4: Setting Up the vector store for RAG system, we gonna use MongoDBAtlas

#$ initialize the MongoDB python client
MONGODB_ATLAS_CLUSTER_URI = os.getenv("MONGODB_ATLAS_CLUSTER_URI")
client = MongoClient(
    MONGODB_ATLAS_CLUSTER_URI
)
DB_NAME = "RAG-Chatbot-Cluster"
COLLECTION_NAME = "RAG-Chatbot-Collection-Test"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "RAG-Chatbot-Index-Test"

MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

vector_store = MongoDBAtlasVectorSearch(
    collection=MONGODB_COLLECTION,
    embedding=gemini_embeddings,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    relevance_score_fn="cosine"
)


In [4]:
"""Initialize the MongoDB collection and verify the vector search index."""

# Verify MongoDB connection
client.server_info()  # Raises an exception if connection fails
print("MongoDB connection established successfully")

# Check if collection exists
if COLLECTION_NAME not in client[DB_NAME].list_collection_names():
    client[DB_NAME].create_collection(COLLECTION_NAME)
    print(f"Created collection {COLLECTION_NAME}")
else:
    print(f"Collection {COLLECTION_NAME} already exists")

# Note: Vector search index must be created in MongoDB Atlas UI or via API
print(f"Ensure vector search index '{ATLAS_VECTOR_SEARCH_INDEX_NAME}' is configured in MongoDB Atlas for collection {COLLECTION_NAME}")


MongoDB connection established successfully
Collection RAG-Chatbot-Collection-Test already exists
Ensure vector search index 'RAG-Chatbot-Index-Test' is configured in MongoDB Atlas for collection RAG-Chatbot-Collection-Test


In [None]:
vector_store.create_vector_search_index(
    dimensions=768
)

In [5]:

# Test vector store by adding a dummy document
dummy_doc = Document(page_content="Test document", metadata={"file_id": 0})
vector_store.add_documents([dummy_doc, dummy_doc])
print("Added test document to vector store")


Added test document to vector store


In [39]:
print(dir(vector_store._collection))

['__annotations__', '__bool__', '__call__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_aggregate', '_aggregate_one_result', '_codec_options', '_command', '_conn_for_writes', '_count_cmd', '_create', '_create_helper', '_create_indexes', '_create_search_indexes', '_database', '_delete', '_delete_retryable', '_drop_index', '_find_and_modify', '_full_name', '_insert_one', '_list_indexes', '_name', '_read_concern', '_read_preference', '_read_preference_for', '_retryable_non_cursor_read', '_timeout', '_update', '_update_retryable', '_write_concern', '_write_concern_for'

In [6]:
# Log the inserted document to inspect its structure
inserted_doc = vector_store._collection.find_one({"file_id": 0})
if inserted_doc:
    print(f"Inserted test document: {inserted_doc}") # inserted_doc is a dict with keys: _id, text, embedding, file_id
else:
    print("Test document not found after insertion")


Inserted test document: {'_id': ObjectId('6805fdc9101024e2fd3a72e2'), 'text': 'Cẩm nang du lịch một ngày khám phá địa đạo Củ Chi Du lịchĐiểm đến Thứ hai, 1442025, 0700 GMT7 Một ngày khám phá địa đạo Củ Chi TP HCMChui hầm địa đạo, thăm khu vực tái hiện vùng giải phóng, thưởng thức món ăn dân dã là những trải nghiệm giúp du khách hiểu hơn về cuộc sống thời chiến của quân dân vùng đất thép. Cách trung tâm TP HCM khoảng 70 km về hướng Tây Bắc, địa đạo Củ Chi với hệ thống đường hầm dài gần 250 km, là cứ địa vững chắc của Khu ủy Quân khu, Bộ tư lệnh Sài Gòn - Gia Định, góp phần không nhỏ vào công cuộc thống nhất đất nước. Hiện nay, di tích địa đạo Củ Chi được bảo tồn tại hai khu vực chính là Bến Dược, xã Phú Mỹ Hưng và Bến Đình, xã Nhuận Đức, trở thành điểm đến thu hút du khách khi đến TP HCM. Theo đại diện Khu di tích lịch sử địa đạo Củ Chi, lượng khách tham quan trong tháng 4 tăng 30 so với ngày thường, nhờ hiệu ứng từ chuỗi sự kiện kỷ niệm 50 năm thống nhất đất nước. Đơn vị dự báo lượng k

In [50]:
idd = inserted_doc.get('_id')
idd

ObjectId('6805c00259381735d0ddcf83')

In [41]:
vector_store._collection.delete_one({"_id": idd})

DeleteResult({'n': 1, 'electionId': ObjectId('7fffffff000000000000014b'), 'opTime': {'ts': Timestamp(1745207151, 43), 't': 331}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1745207151, 43), 'signature': {'hash': b'$\xe9&\xcc\x886\xda\xea\n(\x00\x0c\xc2"B\xb5\x93\xef\xd7\x8f', 'keyId': 7457926348309266455}}, 'operationTime': Timestamp(1745207151, 43)}, acknowledged=True)

In [19]:
docs = vector_store._collection.find({"file_id": 0})
# print(dir(docs))
for i in ( docs.to_list()):
    print(i)

{'_id': ObjectId('6805c00259381735d0ddcf83'), 'text': 'Test document', 'embedding': [0.02245466038584709, 0.012345736846327782, -0.04655120521783829, -0.0017819341737776995, 0.031153041869401932, 0.0038923893589526415, -0.0031458723824471235, 0.041887763887643814, -0.026203611865639687, 0.03639741614460945, 7.221212399599608e-06, 0.011879394762217999, 0.06041010096669197, 0.011899629607796669, -0.0028977717738598585, -0.032094914466142654, 0.03987731412053108, 0.038080934435129166, -0.09837115556001663, 0.011226614937186241, 0.020411163568496704, -0.03096204623579979, 0.04700252041220665, -0.012685094028711319, -0.006810220889747143, -0.02934683859348297, 0.017130544409155846, -0.000422801764216274, 0.003477184334769845, -0.014720271341502666, 0.06298795342445374, 0.0556221604347229, 0.0050380295142531395, -0.06103367730975151, 0.026296168565750122, 0.03546476364135742, 0.007996564731001854, 0.007996597327291965, 0.0346197634935379, -0.03967507183551788, -0.07154643535614014, 0.0088919

In [20]:
result = vector_store._collection.delete_many({'file_id': 0})
result

DeleteResult({'n': 5, 'electionId': ObjectId('7fffffff000000000000014b'), 'opTime': {'ts': Timestamp(1745208608, 16), 't': 331}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1745208608, 16), 'signature': {'hash': b'\x16ZO\x0c\xb8\xbc\xa6\xa3\xb1\xf4eT\xf0\xc7\xec\xb62l\xdb\x83', 'keyId': 7457926348309266455}}, 'operationTime': Timestamp(1745208608, 16)}, acknowledged=True)

In [42]:

# Delete the test document
result = vector_store._collection.delete_one({"file_id": 0})
if result.deleted_count > 0:
    print("Successfully deleted test document")
else:
    print("No test document was deleted; check document structure or query")

# Verify deletion
remaining_doc = vector_store._collection.find_one({"file_id": 0})
if remaining_doc:
    print(f"Test document still exists after deletion attempt: {remaining_doc}")
else:
    print(f"Confirmed test document was deleted")


No test document was deleted; check document structure or query
Confirmed test document was deleted


In [2]:
try:
    client[DB_NAME].drop_collection(COLLECTION_NAME)
    print(f"Successfully deleted collection {COLLECTION_NAME}")
except Exception as e:
    print(f"Error deleting collection {COLLECTION_NAME}: {str(e)}")
    

Successfully deleted collection RAG-Chatbot-Collection
