### Data Ingestion Module

In [1]:
import os 
# Show the directory the kernel is currently working in (before any chdir).
%pwd

'/home/eman/Desktop/NLP_huggingface/notebook'

In [2]:
# Step up from the notebook folder to the project root so that the `src`
# package can be imported by the cells below.
# The guard keeps this cell idempotent: re-running it must not climb one more
# level up each time.
if os.path.basename(os.getcwd()) == 'notebook':
    os.chdir('../')

In [3]:
# Confirm the working directory after the chdir above.
%pwd

'/home/eman/Desktop/NLP_huggingface'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion step.

    Attributes:
        root_dir: Directory that holds the data-ingestion artifacts.
        source_url: Address the archive is downloaded from. This is a URL
            string, not a filesystem path (wrapping a URL in ``Path``
            would collapse the ``//`` after the scheme).
        local_data_file: Location where the downloaded archive is stored.
        unzip_dir: Directory the archive is extracted into.
    """

    root_dir: Path
    source_url: str
    local_data_file: Path
    unzip_dir: Path

## Basic Configuration

In [5]:
from src.TextSummarizer.constants import *
from src.TextSummarizer.utils.common_utils import read_yaml, create_dir
from src.TextSummarizer.exceptions.customexception import TextSummarizerException
from src.TextSummarizer.logger.logger import logger
from pathlib import Path
import os, sys


In [6]:
## defining the configuration manager

class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(self, config_path:Path = CONFIG_FILE_PATH, params_path:Path = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifact root directory.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.

        Raises:
            TextSummarizerException: Wraps any exception raised while the
                files are read or the artifact root directory is created.
        """
        try:
            self.config = read_yaml(config_path)
            self.params = read_yaml(params_path)

            artifact_root = self.config.artifacts_root

            ## creating the artifact root directory
            create_dir([artifact_root])
            logger.info(f"successfully created the artifact root directory of path: {artifact_root}")

        except Exception as e:
            # Include the underlying error in the log, as the other method does.
            logger.error(f'Unable to initiate the Configuration Manager {e}')
            # `from e` keeps the original traceback attached as __cause__.
            raise TextSummarizerException(e, sys) from e

    def get_data_ingestion_config(self)-> DataIngestionConfig:
        """Create the data-ingestion directory and return its settings.

        Returns:
            A ``DataIngestionConfig`` built from the ``data_ingestion``
            section of the loaded configuration. Filesystem entries are
            converted to ``Path`` so they match the dataclass annotations;
            the source URL is passed through unchanged.

        Raises:
            TextSummarizerException: Wraps any exception raised while the
                section is read or the directory is created.
        """
        try:
            section = self.config.data_ingestion

            ## creating the data ingestion directories
            create_dir([section.root_dir])
            logger.info(f"successfully created the data ingestion directory with path: {section.root_dir}")

            return DataIngestionConfig(
                root_dir=Path(section.root_dir),
                # A URL must stay a plain string; Path() would mangle "//".
                source_url=section.source_url,
                local_data_file=Path(section.local_data_file),
                unzip_dir=Path(section.unzip_dir),
            )

        except Exception as e:
            logger.error(f'Cannot get the data ingestion configuration {e}')
            raise TextSummarizerException(e, sys) from e



## Components

In [7]:
import urllib.request as request
import zipfile
import os

In [None]:
## creating our dataingestion class

class DataIngestion:
    """Downloads the source archive and unpacks it, driven by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Store the settings used by the download and extraction steps."""
        self.config=config

    def download_file(self):
        """Download the archive to ``config.local_data_file`` if it is not there yet.

        The data is first written to a sibling ``.part`` file and only moved
        to the final name once the transfer has finished. An interrupted or
        failed download therefore never leaves a truncated file that a later
        run would mistake for a complete one.

        Returns:
            None

        Raises:
            TextSummarizerException: Wraps any exception raised during the
                download.
        """
        try:
            target = self.config.local_data_file

            if os.path.exists(target):
                logger.info('File already exists')
                return

            # Make sure the destination folder exists before writing into it.
            parent_dir = os.path.dirname(target)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            partial = f"{target}.part"
            try:
                request.urlretrieve(
                    url=self.config.source_url,
                    filename=partial
                )
                # Publish the file under its final name only when complete.
                os.replace(partial, target)
            finally:
                # After a successful replace the partial file is gone; this
                # only removes leftovers from a failed or interrupted transfer.
                if os.path.exists(partial):
                    os.remove(partial)

            logger.info(f"File has been downloaded to: {self.config.local_data_file}")

        except Exception as e:
            logger.error(f'Error downloading file {e}')
            # `from e` keeps the original traceback attached as __cause__.
            raise TextSummarizerException(e,sys) from e

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created when it does not exist.

        Returns:
            None

        Raises:
            TextSummarizerException: Wraps any exception raised while the
                directory is created or the archive is read.
        """
        try:
            unzip_path = self.config.unzip_dir
            os.makedirs(unzip_path,exist_ok=True)
            logger.info(f'unzip directory created successfully with path: {unzip_path}')

            with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
                zip_ref.extractall(unzip_path)
                logger.info(f"file successfully extracted to {unzip_path}")

        except Exception as e:
            logger.error(f"Error unzipping the file {e}")
            raise TextSummarizerException(e,sys) from e

        

In [14]:
## Run the data-ingestion stage end to end

# Reads the YAML files and creates the artifact root directory.
config= ConfigurationManager()
# Creates the data-ingestion directory and returns its settings.
ingestion_config= config.get_data_ingestion_config()
data_ingestion = DataIngestion(ingestion_config)
# Download is skipped when the local file already exists; extraction always runs.
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[ 2025-08-13 11:37:30,626 ]: textsummarizer: INFO: common_utils: 31: logger file loaded successfully from: {path_to_yaml}
[ 2025-08-13 11:37:30,633 ]: textsummarizer: INFO: common_utils: 31: logger file loaded successfully from: {path_to_yaml}
[ 2025-08-13 11:37:30,637 ]: textsummarizer: INFO: common_utils: 61: created directory at: artifacts
[ 2025-08-13 11:37:30,638 ]: textsummarizer: INFO: 2448609134: 14: successfully created the artifact root directory of path: artifacts
[ 2025-08-13 11:37:30,644 ]: textsummarizer: INFO: common_utils: 61: created directory at: artifacts/data_ingestion
[ 2025-08-13 11:37:30,647 ]: textsummarizer: INFO: 2448609134: 26: successfully created the data ingestion directory with path: artifacts/data_ingestion
[ 2025-08-13 11:37:30,649 ]: textsummarizer: INFO: 3388520733: 16: File already exists
[ 2025-08-13 11:37:30,652 ]: textsummarizer: INFO: 3388520733: 33: unzip directory created successfully with path: artifacts/data_ingestion/text_summarizer
[ 2025-0