# Data Ingestion component workflow

In [1]:
import os

In [2]:
# current working directory
%pwd

'c:\\Users\\anjik\\Desktop\\MLOPs_projects\\Chest_Disease_Image_Classification\\research'

In [3]:
# move to root directory
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\anjik\\Desktop\\MLOPs_projects\\Chest_Disease_Image_Classification'

In [5]:
# create dataclass for DataIngestionConfig (inputs from config.yaml)

from dataclasses import dataclass
from pathlib import Path

# similar to config_entity
@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable bundle of settings consumed by the Data Ingestion component.

    Instances are frozen: assigning to a field after construction raises
    dataclasses.FrozenInstanceError. The field order below is also the
    positional order of the generated constructor.
    """
    # working directory of the data ingestion stage
    root_dir: Path
    # link the dataset archive is downloaded from
    source_URL: str
    # location the downloaded zip archive is written to
    local_data_file: Path
    # directory the archive is extracted into
    unzip_dir: Path

In [6]:
# read config, params files from constant

from cnnClassifier.constants import *
print(CONFIG_FILE_PATH)
print(PARAMS_FILE_PATH)


config\config.yaml
params.yaml


In [7]:
# common helper functions (read_yaml, create_directories) from cnnClassifier.utils.common
from cnnClassifier.utils.common import read_yaml, create_directories

In [8]:
# write configuration manager
class ConfigurationManager:
    """
    Loads the project's YAML settings and hands out per-component
    configuration objects.

    On construction both YAML files are read and the artifacts root
    directory named in the config file is created.
    """

    def __init__(self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Params:
            config_filepath: location of config.yaml (defaults to CONFIG_FILE_PATH)
            params_filepath: location of params.yaml (defaults to PARAMS_FILE_PATH)
        """
        # parsed YAML content; fields are read below with attribute access
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # make sure the top-level artifacts directory is in place
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config (self) -> DataIngestionConfig:
        """
        Method: get_data_ingestion_config
        Purpose: create the data ingestion root directory and package the
                 `data_ingestion` section of the config file as an entity
        Returns: DataIngestionConfig for the Data Ingestion component
        """
        logger.info("Entering get_data_ingestion_config method of ConfigurationManager")

        # the `data_ingestion` section of config.yaml
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        # copy the four settings into the frozen entity
        ingestion_entity = DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

        logger.info("Then, exiting get_data_ingestion_config method of ConfigurationManager")
        return ingestion_entity
        

In [9]:
from cnnClassifier import logger
import gdown
import os
import zipfile
from cnnClassifier.utils.common import get_size

In [10]:
# write DataIngestion Component
class DataIngestion:
    """
    Data Ingestion component: downloads the dataset archive with gdown and
    extracts it locally.

    Every location is taken from the configuration object given to the
    constructor:
        source_URL      - Google Drive share link of the archive
        local_data_file - path the downloaded zip is written to
        unzip_dir       - directory the zip is extracted into
    """

    # Direct-download endpoint handed to gdown; the Drive file id is appended.
    _DRIVE_DOWNLOAD_PREFIX = "https://drive.google.com/uc?export=download&id="

    def __init__(self, data_ingestion_config: "DataIngestionConfig"):
        self.data_ingestion_config = data_ingestion_config

    @staticmethod
    def _extract_file_id(dataset_url: str) -> str:
        """
        Pull the Drive file id out of a share link.

        Links shaped like ``.../file/d/<id>/view?...`` or ``.../file/d/<id>``
        yield ``<id>``. Any other shape falls back to the second-to-last
        "/"-separated segment, which is what this component always used.

        Raises: ValueError when the resulting id is empty.
        """
        segments = dataset_url.split("/")
        if "d" in segments[:-1]:
            # the id is the path segment right after ".../d/"; drop any query string
            file_id = segments[segments.index("d") + 1].split("?")[0]
        else:
            file_id = segments[-2]
        if not file_id:
            raise ValueError(f"Could not find a Google Drive file id in: {dataset_url}")
        return file_id

    def download_file(self) -> str:
        """
        Method: download_file
        Purpose: fetch the dataset archive from source_URL into local_data_file
        Returns: str: path of the downloaded zip file
        Raises: RuntimeError when gdown reports no downloaded file; any
                exception raised by gdown or the filesystem propagates unchanged
        """
        logger.info("Entering download_file method of DataIngestion component")

        dataset_url = self.data_ingestion_config.source_URL
        # plain string path, whether the config holds a str or a Path
        zip_download_dir = os.fspath(self.data_ingestion_config.local_data_file)

        # create the directory the zip will live in (derived from the config,
        # not hard-coded); a bare file name has no parent to create
        parent_dir = os.path.dirname(zip_download_dir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        logger.info(f"Downloading data from {dataset_url} into {zip_download_dir}")

        file_id = self._extract_file_id(dataset_url)
        # gdown.download('from_where', 'to_where')
        downloaded = gdown.download(self._DRIVE_DOWNLOAD_PREFIX + file_id, zip_download_dir)
        # NOTE(review): some gdown releases signal failure by returning None
        # instead of raising; stop here rather than fail later during unzip
        if not downloaded:
            raise RuntimeError(f"Download failed for {dataset_url}")

        logger.info(f"Downloaded data from {dataset_url} into {zip_download_dir}")
        logger.info("Then, exiting download_file method of DataIngestion component")
        return zip_download_dir

    def extract_zip_data(self) -> None:
        """
        Method: extract_zip_data
        Purpose: extract local_data_file (zip) into unzip_dir, creating
                 unzip_dir when it does not exist
        Returns: None
        Raises: exceptions from zipfile / the filesystem propagate unchanged
        """
        logger.info("Entering extract_zip_data method of DataIngestion component")

        # destination directory for the extracted files
        unzip_path = self.data_ingestion_config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        logger.info(f"Extracting zip file into {unzip_path}")

        zip_file = self.data_ingestion_config.local_data_file
        # the context manager closes the archive even if extraction fails
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(unzip_path)

        logger.info(f"Extracted zip data from {zip_file} into {unzip_path}")
        logger.info("Then, exiting extract_zip_data method of DataIngestion component")

# Training pipeline for data ingestion component

In [11]:
# Run the Data Ingestion stage end to end: load configuration, download the
# dataset archive, then extract it.
try:
    logger.info("Data Ingestion component started")
    logger.info("Loading of Data Ingestion configuration started")
    config = ConfigurationManager() # reads the YAML files and creates the artifacts root
    data_ingestion_config = config.get_data_ingestion_config() # returns DataIngestionConfig
    logger.info("All configuration directories, files needed for Data Ingestion component are ready")

    logger.info("Data Ingestion steps started")
    data_ingestion = DataIngestion(data_ingestion_config = data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_data()
    logger.info("OK! Data Ingestion component completed")
except Exception:
    # record the failure with its traceback in the log, then re-raise the
    # same exception so the cell still stops with the original error
    logger.exception("Data Ingestion component failed")
    raise

[2024-03-16 23:30:39,707: INFO: 2322006326: Data Ingestion component started]
[2024-03-16 23:30:39,709: INFO: 2322006326: Loading of Data Ingestion configuration started]
[2024-03-16 23:30:39,713: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-16 23:30:39,717: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-16 23:30:39,719: INFO: common: created directory at: artifacts]
[2024-03-16 23:30:39,721: INFO: 3479315878: Entering get_data_ingestion_config method of ConfigurationManager]
[2024-03-16 23:30:39,722: INFO: common: created directory at: artifacts/data_ingestion]
[2024-03-16 23:30:39,723: INFO: 3479315878: Then, exiting get_data_ingestion_config method of ConfigurationManager]
[2024-03-16 23:30:39,724: INFO: 2322006326: All configuration directories, files needed for Data Ingestion component are ready]
[2024-03-16 23:30:39,725: INFO: 2322006326: Data Ingestion steps started]
[2024-03-16 23:30:39,726: INFO: 3070473835: Entering download_file method of DataIngestion component]

Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1z0mreUtRmR-P-magILsDR3T7M6IkGXtY&confirm=t&uuid=62da1ff9-2fa9-45c5-ac6d-e24f00c4e3c7
To: c:\Users\anjik\Desktop\MLOPs_projects\Chest_Disease_Image_Classification\artifacts\data_ingestion\data.zip
100%|██████████| 49.0M/49.0M [00:28<00:00, 1.74MB/s]

[2024-03-16 23:31:09,690: INFO: 3070473835: Downloaded data from https://drive.google.com/file/d/1z0mreUtRmR-P-magILsDR3T7M6IkGXtY/view?usp=drive_link into artifacts/data_ingestion/data.zip]
[2024-03-16 23:31:09,691: INFO: 3070473835: Then, exiting download_file method of DataIngestion component]
[2024-03-16 23:31:09,691: INFO: 3070473835: Entering extract_zip_data method of DataIngestion component]
[2024-03-16 23:31:09,692: INFO: 3070473835: Extracting zip file into artifacts/data_ingestion]





[2024-03-16 23:31:10,260: INFO: 3070473835: Extracted zip data from artifacts/data_ingestion/data.zip into artifacts/data_ingestion]
[2024-03-16 23:31:10,261: INFO: 3070473835: Then, exiting extract_zip_data method of DataIngestion component]
[2024-03-16 23:31:10,262: INFO: 2322006326: OK! Data Ingestion component completed]


: 