### Data Ingestion Module

In [2]:
import os
# IPython magic: display the current working directory of the kernel.
%pwd

'c:\\Users\\nico_\\Desktop\\MLOPS\\TextSummarizer\\research'

In [3]:
os.chdir("../")
%pwd

'c:\\Users\\nico_\\Desktop\\MLOPS\\TextSummarizer'

### Basic Configuration

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Typed container for the settings used by the data-ingestion step.

    Attributes:
        root_dir: Directory belonging to the ingestion step.
        dataset_name: Name of the dataset to load.
        save_path: Location where the loaded dataset is written.
    """

    root_dir: Path
    dataset_name: str
    save_path: Path




`DataIngestionConfig` est une structure de données (dataclass) qui contient les paramètres essentiels de l'ingestion :

- `root_dir` : dossier racine où tu veux tout stocker
- `dataset_name` : nom du dataset Hugging Face (ex. : `"knkarthick/samsum"`)
- `save_path` : chemin où tu veux enregistrer le dataset téléchargé

In [6]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories

## Configuration Updates 

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-step configuration objects."""

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_path: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_filepath)
        # Backward-compatible alias: this attribute was originally misspelled
        # `paramss`; keep the old name so any existing reader still works.
        self.paramss = self.params

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the loaded configuration.

        Creates the ingestion step's root directory as a side effect.

        Returns:
            A DataIngestionConfig populated from the `data_ingestion` section.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            dataset_name=section.dataset_name,
            # The YAML key is `save_dir`; the dataclass field is `save_path`.
            save_path=Path(section.save_dir),
        )
        

Charge les fichiers `config.yaml` et `params.yaml` pour lire la configuration globale et les paramètres.

Crée les dossiers nécessaires à l’exécution (`artifacts/`, puis `artifacts/data_ingestion/`).

Extrait seulement la section `data_ingestion:` du YAML et retourne une instance de `DataIngestionConfig` avec les valeurs correspondantes.

In [16]:
import os
import urllib.request as request
from src.textSummarizer.logging import logger
from datasets import load_dataset

### Components

In [14]:
class DataIngestion:
    """Loads the configured dataset and stores a copy of it on disk."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for later use."""
        self.config = config

    def load_and_save_dataset(self):
        """Load the dataset named in the config, save it to `save_path`, and log it."""
        name = self.config.dataset_name
        target = self.config.save_path
        load_dataset(name).save_to_disk(target)
        logger.info(f"Dataset '{self.config.dataset_name}' saved to {self.config.save_path}")


    


Cette méthode :

- utilise `load_dataset()` pour télécharger le dataset depuis Hugging Face ;
- puis `save_to_disk()` pour le sauvegarder localement (très utile pour les gros pipelines ou le débogage).

In [15]:
# Build the configuration manager using its default YAML file paths.
config = ConfigurationManager()
# Extract the data-ingestion settings as a DataIngestionConfig.
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)

# Load the dataset and save it to the configured location.
data_ingestion.load_and_save_dataset()


[2025-07-18 19:27:51,688: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-18 19:27:51,697: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-18 19:27:51,699: INFO: common: created directory at: artifacts]
[2025-07-18 19:27:51,701: INFO: common: created directory at: artifacts/data_ingestion]


Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 489332.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 142309.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 142159.20 examples/s]

[2025-07-18 19:27:53,490: INFO: 3345693479: Dataset 'knkarthick/samsum' saved to artifacts\data_ingestion\samsum]



