In [1]:
import os

In [2]:
os.chdir("../")
%pwd

'f:\\ProjectAI\\ChatSystem'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class EmbeddingsConfig:
    root_dir: Path
    embeddings_file: Path
    model_name: str
    local_data_file: Path


In [4]:
from ChatBoxSystem.constants import *
from ChatBoxSystem.utils.helper import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.embeddings.root_dir])

    def get_embeddings_config(self) -> EmbeddingsConfig:
        create_directories([self.config.embeddings.root_dir])
        embeddings_config = EmbeddingsConfig(
            root_dir=Path(self.config.embeddings.root_dir),
            embeddings_file=Path(self.config.embeddings.embeddings_file),
            model_name=self.config.embeddings.model_name,
            local_data_file=Path(self.config.embeddings.local_data_file),
        )
        return embeddings_config

In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from ChatBoxSystem import logger

  from .autonotebook import tqdm as notebook_tqdm


[2025-10-01 20:23:48,796: INFO: loader]: Loading faiss with AVX512 support.
[2025-10-01 20:23:48,797: INFO: loader]: Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
[2025-10-01 20:23:48,798: INFO: loader]: Loading faiss with AVX2 support.
[2025-10-01 20:23:48,954: INFO: loader]: Successfully loaded faiss with AVX2 support.


In [7]:
class Embeddings:
    def __init__(self, config: EmbeddingsConfig):
        self.config = config
        self.model = SentenceTransformer(self.config.model_name)

    def generate_embeddings(self):
        logger.info("Loading data...")
        df = pd.read_csv(self.config.local_data_file)
        texts = df['Answer'].tolist()

        logger.info("Generating embeddings...")
        embeddings = self.model.encode(texts, convert_to_tensor=True, show_progress_bar=True)

        self.save_embeddings(embeddings, texts)

    def save_embeddings(self, embeddings, texts):
        logger.info("Saving embeddings...")
        dim = embeddings[0].shape[0]
        index = faiss.IndexFlatL2(dim)

        index.add(np.array(embeddings))
        logger.info(f"Total embeddings indexed: {index.ntotal}")

        faiss.write_index(index, str(self.config.embeddings_file))
        np.save(self.config.embeddings_file.with_suffix('.npy'), np.array(texts))
        logger.info(f"Embeddings saved to {self.config.embeddings_file}")

In [10]:
try:
    config = ConfigurationManager()
    embeddings_config = config.get_embeddings_config()

    embeddings = Embeddings(config=embeddings_config)
    embeddings.generate_embeddings()
    logger.info("Embeddings generation completed successfully.")
except Exception as e:
    logger.exception(f"Error occurred: {e}")
    raise

[2025-10-01 20:25:53,980: INFO: helper]: YAML file config\config.yaml loaded successfully.
[2025-10-01 20:25:53,982: INFO: helper]: YAML file params.yaml loaded successfully.
[2025-10-01 20:25:53,983: INFO: helper]: Directory created at: artifacts/embeddings
[2025-10-01 20:25:53,984: INFO: helper]: Directory created at: artifacts/embeddings
[2025-10-01 20:25:53,986: INFO: SentenceTransformer]: Use pytorch device_name: cpu
[2025-10-01 20:25:53,987: INFO: SentenceTransformer]: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
[2025-10-01 20:25:57,784: INFO: 3818667061]: Loading data...
[2025-10-01 20:25:57,784: INFO: 3818667061]: Generating embeddings...


Batches: 100%|██████████| 10/10 [00:00<00:00, 15.22it/s]

[2025-10-01 20:25:58,448: INFO: 3818667061]: Saving embeddings...
[2025-10-01 20:25:58,450: INFO: 3818667061]: Total embeddings indexed: 300
[2025-10-01 20:25:58,450: INFO: 3818667061]: Embeddings saved to artifacts\embeddings\embeddings
[2025-10-01 20:25:58,450: INFO: 29449560]: Embeddings generation completed successfully.



  index.add(np.array(embeddings))
