# Creating a vector store

In [1]:
# Directory in which the vector database is created.
VECTOR_DB_PATH = "/home/vpa/RAGollama3/data/database"

# CSV file used to build the vector store.
# The CSV is expected to have a single column; each row is treated as a separate document.
DATA_SOURCE = "/home/vpa/RAGollama3/data/source.csv"

# sentence-transformers model used to create the embeddings.
EMBEDDING_MODEL = "l3cube-pune/bengali-sentence-similarity-sbert"

## Vector Store Creation

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Define the CustomEmbeddings class
class CustomEmbeddings:
    """Thin adapter around a sentence-transformers model.

    Exposes ``embed_documents`` and ``embed_query``, each returning plain
    Python lists of floats produced by ``SentenceTransformer.encode``.
    """

    def __init__(self, model_name: str, device: str = None):
        """Load the model on the requested device.

        Args:
            model_name: Name or path handed to ``SentenceTransformer``.
            device: Device string such as ``'cuda'`` or ``'cpu'``. When omitted
                (or falsy), ``'cuda'`` is used if torch reports it as
                available, otherwise ``'cpu'``.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        # Pass the device to the constructor instead of calling .to() afterwards:
        # the library then records the target device itself, so encode() cannot
        # move the model back to a different default device
        # (NOTE(review): observed with older sentence-transformers releases).
        self.model = SentenceTransformer(model_name, device=self.device)

    def embed_documents(self, texts):
        """Embed a list of texts.

        Args:
            texts: List of strings to embed.

        Returns:
            One embedding (list of floats) per input text, in input order.
        """
        return self.model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        """Embed a single query string.

        Args:
            text: The query text.

        Returns:
            The embedding of ``text`` as a flat list of floats.
        """
        # Encode as a one-element batch and unwrap the single resulting row.
        return self.model.encode([text], convert_to_numpy=True).tolist()[0]


In [None]:
#---------------------------------
# imports
#---------------------------------
import os
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from tqdm import tqdm
#--------------------------------------
# global
#--------------------------------------
# Shared embedder instance; the helper functions in this notebook read it as a global.
embedding_model = CustomEmbeddings(model_name=EMBEDDING_MODEL)

#--------------------------------------
# helper functions
#--------------------------------------
def create_vector_store(df: pd.DataFrame, 
                        vector_db_path: str,
                        batch_size: int = 256) -> None:
    """
    Create a vector store from the first column of a dataframe, where each entry
    is treated as a separate document.

    Each entry is split into overlapping chunks (1500 characters, 200 overlap),
    embedded with the notebook-level ``embedding_model`` and written to a Chroma
    store under ``vector_db_path``. Missing and blank entries are skipped.

    Args:
        df (pd.DataFrame): DataFrame whose first column holds the text data.
        vector_db_path (str): Path to the directory where the vector store will be persisted.
        batch_size (int): Maximum number of chunks sent to the store per insert call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Informational only: reports whether torch sees CUDA. The value is not
    # passed on; the embedding model was placed on its device when it was built.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Create data directory structure
    os.makedirs(vector_db_path, exist_ok=True)

    # Initialize text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
        length_function=len
    )

    # Open (or create) the store backed by the shared embedding model
    vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embedding_model)

    # Empty CSV cells are read by pandas as NaN, which is not valid document
    # text; drop them and make sure every remaining entry is a string.
    texts = df.iloc[:, 0].dropna().astype(str)

    # Chunks waiting to be inserted. Inserting in batches avoids one
    # embed-and-insert round trip per row while keeping memory use bounded.
    pending = []
    for text in tqdm(texts):
        # Whitespace-only rows carry no content to index.
        if not text.strip():
            continue

        pending.extend(text_splitter.split_documents([Document(page_content=text)]))

        while len(pending) >= batch_size:
            vectorstore.add_documents(pending[:batch_size])
            del pending[:batch_size]

    # Flush the final partial batch; never call add_documents with an empty list.
    if pending:
        vectorstore.add_documents(pending)

    # Persist the vector store
    # NOTE(review): newer Chroma releases persist automatically and deprecate
    # this call; confirm against the installed version.
    vectorstore.persist()
    print("Vector store created and persisted at:", vector_db_path)


In [None]:
df = pd.read_csv(DATA_SOURCE)
# Preview only the first rows so the saved notebook does not carry the whole corpus as output.
df.head()

In [None]:
create_vector_store(df=df, vector_db_path=VECTOR_DB_PATH)

# Checking the validity of the vector store

In [6]:
from langchain.vectorstores import Chroma
from langchain.schema import Document
import torch

def query_vector_store(vector_db_path: str, query_text: str, top_k: int = 5) -> None:
    """
    Print the documents of a persisted vector store that best match a query.

    The store is opened with the notebook-level ``embedding_model``; nothing is
    returned, the matching documents are written to standard output.

    Args:
        vector_db_path (str): Directory in which the vector store is persisted.
        query_text (str): Text to search for.
        top_k (int): Number of most similar documents to retrieve.
    """
    # Open the persisted store using the shared embedder.
    store = Chroma(embedding_function=embedding_model, persist_directory=vector_db_path)

    # Informational only: report whether torch can see a CUDA device.
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print(f"Using device: {device}")

    # Run the similarity search.
    matches = store.similarity_search(query_text, k=top_k)

    # Show each hit, numbered from 1, followed by a blank line.
    # NOTE(review): no similarity score is printed; presumably
    # `similarity_search_with_score` is needed for that - confirm.
    for rank, match in enumerate(matches, start=1):
        print(f"Document {rank}:")
        print(match.page_content)
        print()



In [None]:
# Example: retrieve the stored documents closest to a sample query.
query_vector_store(
    vector_db_path=VECTOR_DB_PATH,
    query_text='ভোক্তা অধিদপ্তরে অভিযোগ দিবো',
)