In [1]:
import os

In [2]:
os.chdir("../")
%pwd

'f:\\ProjectAI\\ChatSystem'

In [3]:
# from dotenv import load_dotenv
# load_dotenv()

In [4]:
# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
# os.environ["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY")

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class EmbeddingsConfig:
    root_dir: Path
    embeddings_file: Path
    model_name: str
    folder_data: str
    local_data_file: Path


In [6]:
from ChatBoxSystem.constants import *
from ChatBoxSystem.utils.helper import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.embeddings.root_dir])

    def get_embeddings_config(self) -> EmbeddingsConfig:
        create_directories([self.config.embeddings.root_dir])
        embeddings_config = EmbeddingsConfig(
            root_dir=Path(self.config.embeddings.root_dir),
            embeddings_file=Path(self.config.embeddings.embeddings_file),
            model_name=self.config.embeddings.model_name,
            folder_data=self.config.data_ingestion.root_dir,
            local_data_file=Path(self.config.embeddings.local_data_file),
        )
        return embeddings_config

In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from ChatBoxSystem import logger

  from .autonotebook import tqdm as notebook_tqdm


[2025-10-02 10:44:27,260: INFO: loader]: Loading faiss with AVX512 support.
[2025-10-02 10:44:27,267: INFO: loader]: Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
[2025-10-02 10:44:27,267: INFO: loader]: Loading faiss with AVX2 support.
[2025-10-02 10:44:27,396: INFO: loader]: Successfully loaded faiss with AVX2 support.


In [9]:
class Embeddings:
    def __init__(self, config: EmbeddingsConfig):
        self.config = config
        self.model = SentenceTransformer(self.config.model_name)

    def generate_embeddings(self):
        logger.info("Loading data...")
        df = pd.read_csv(self.config.local_data_file)
        texts = df['Answer'].tolist()

        logger.info("Generating embeddings...")
        embeddings = self.model.encode(texts, convert_to_tensor=True, show_progress_bar=True)

        self.save_embeddings(embeddings, texts)

    def save_embeddings(self, embeddings, texts):
        logger.info("Saving embeddings...")
        dim = embeddings[0].shape[0]
        index = faiss.IndexFlatL2(dim)

        index.add(np.array(embeddings))
        logger.info(f"Total embeddings indexed: {index.ntotal}")

        faiss.write_index(index, str(self.config.embeddings_file))
        np.save(self.config.embeddings_file.with_suffix('.npy'), np.array(texts))
        logger.info(f"Embeddings saved to {self.config.embeddings_file}")

In [10]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.storage.storage_context import StorageContext

In [11]:
class Embeddings:
    def __init__(self, config: EmbeddingsConfig):
        self.config = config
        self.model = HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5')

    def generate_embeddings(self):
        logger.info("Loading data...")
        df = pd.read_csv(self.config.local_data_file)

        # Tạo danh sách Document từ từng row
        documents = []
        for _, row in df.iterrows():
            q, a = row["Question"], row["Answer"]
            text = f"Question: {q}\nAnswer: {a}"
            documents.append(Document(text=text))

        logger.info("Generating embeddings via VectorStoreIndex...")
        index = VectorStoreIndex.from_documents(documents, embed_model=self.model)

        self.save_embeddings(index)

    def save_embeddings(self, index: VectorStoreIndex):
        logger.info("Saving VectorStoreIndex...")
        
        storage_context = index.storage_context
        storage_context.persist()
        logger.info(f"Embeddings saved to {self.config.embeddings_file}")

In [12]:
# try:
#     config = ConfigurationManager()
#     embeddings_config = config.get_embeddings_config()

#     embeddings = Embeddings(config=embeddings_config)
#     embeddings.generate_embeddings()
#     logger.info("Embeddings generation completed successfully.")
# except Exception as e:
#     logger.exception(f"Error occurred: {e}")
#     raise

In [13]:
try:
    config = ConfigurationManager()
    embeddings_config = config.get_embeddings_config()

    embeddings = Embeddings(config=embeddings_config)
    embeddings.generate_embeddings()
    logger.info("Embeddings generation completed successfully.")
except Exception as e:
    logger.exception(f"Error occurred: {e}")
    raise  

[2025-10-02 10:44:28,754: INFO: helper]: YAML file config\config.yaml loaded successfully.
[2025-10-02 10:44:28,757: INFO: helper]: YAML file params.yaml loaded successfully.
[2025-10-02 10:44:28,758: INFO: helper]: Directory created at: artifacts/embeddings
[2025-10-02 10:44:28,759: INFO: helper]: Directory created at: artifacts/embeddings
[2025-10-02 10:44:28,763: INFO: SentenceTransformer]: Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
[2025-10-02 10:44:32,997: INFO: SentenceTransformer]: 1 prompt is loaded, with the key: query
[2025-10-02 10:44:32,997: INFO: 2974174681]: Loading data...
[2025-10-02 10:44:33,031: INFO: 2974174681]: Generating embeddings via VectorStoreIndex...
[2025-10-02 10:44:37,083: INFO: 2974174681]: Saving VectorStoreIndex...
[2025-10-02 10:44:38,116: INFO: 2974174681]: Embeddings saved to artifacts\embeddings\embeddings.index
[2025-10-02 10:44:38,117: INFO: 3935594119]: Embeddings generation completed successfully.
