In [1]:
import os

In [2]:
os.chdir("../")
%pwd

'f:\\ProjectAI\\ChatSystem'

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class EmbeddingsConfig:
    root_dir: Path
    embeddings_file: Path
    model_name: str
    local_data_file: Path


In [4]:
from ChatBoxSystem.constants import *
from ChatBoxSystem.utils.helper import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.embeddings.root_dir])

    def get_embeddings_config(self) -> EmbeddingsConfig:
        create_directories([self.config.embeddings.root_dir])
        embeddings_config = EmbeddingsConfig(
            root_dir=Path(self.config.embeddings.root_dir),
            embeddings_file=Path(self.config.embeddings.embeddings_file),
            model_name=self.config.embeddings.model_name,
            local_data_file=Path(self.config.embeddings.local_data_file),
        )
        return embeddings_config

In [18]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from ChatBoxSystem import logger

In [None]:
class Embeddings:
    def __init__(self, config: EmbeddingsConfig):
        self.config = config
        self.model = SentenceTransformer(self.config.model_name)

    def generate_embeddings(self):
        logger.info("Loading data...")
        df = pd.read_csv(self.config.local_data_file)
        texts = df['Answer'].tolist()

        logger.info("Generating embeddings...")
        embeddings = self.model.encode(texts, convert_to_tensor=True, show_progress_bar=True)

        self.save_embeddings(embeddings, texts)

    def save_embeddings(self, embeddings, texts):
        logger.info("Saving embeddings...")
        dim = embeddings[0].shape[0]
        index = faiss.IndexFlatL2(dim)

        index.add(np.array(embeddings))
        logger.info(f"Total embeddings indexed: {index.ntotal}")

        faiss.write_index(index, str(self.config.embeddings_file))
        np.save(self.config.embeddings_file.with_suffix('.npy'), np.array(texts))
        logger.info(f"Embeddings saved to {self.config.embeddings_file}")

In [20]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()

    embeddings = Embeddings(config=data_ingestion_config)
    embeddings.generate_embeddings()
    logger.info("Embeddings generation completed successfully.")
except Exception as e:
    logger.exception(f"Error occurred: {e}")
    raise

[2025-10-01 19:04:15,647: INFO: helper]: YAML file config\config.yaml loaded successfully.
[2025-10-01 19:04:15,648: INFO: helper]: YAML file params.yaml loaded successfully.
[2025-10-01 19:04:15,649: INFO: helper]: Directory created at: artifacts/embeddings
[2025-10-01 19:04:15,650: INFO: helper]: Directory created at: artifacts/embeddings
[2025-10-01 19:04:15,652: INFO: SentenceTransformer]: Use pytorch device_name: cpu
[2025-10-01 19:04:15,654: INFO: SentenceTransformer]: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


[2025-10-01 19:04:26,753: INFO: 2983522759]: Loading data...
[2025-10-01 19:04:26,753: INFO: 2983522759]: Generating embeddings...


Batches: 100%|██████████| 10/10 [00:00<00:00, 14.48it/s]

[2025-10-01 19:04:27,467: INFO: 2983522759]: Saving embeddings...
[2025-10-01 19:04:27,467: INFO: 2983522759]: Total embeddings indexed: 300
[2025-10-01 19:04:27,467: INFO: 2983522759]: Embeddings saved to artifacts\embeddings\embeddings.pkl
[2025-10-01 19:04:27,474: INFO: 1568427099]: Embeddings generation completed successfully.



  index.add(np.array(embeddings))
