In [1]:
import os
from collections import namedtuple

# Move the working directory one level up from wherever the kernel currently is.
# The path is relative, so re-running this cell climbs one more level each time.
os.chdir("../")

In [2]:
from pydantic import BaseModel , FilePath , FileUrl , DirectoryPath , AnyUrl
from pydantic.dataclasses import dataclass

@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion stage (pydantic dataclass)."""
    # Working directory of the stage; the Kaggle download is pointed at it.
    root_dir : DirectoryPath
    # Dataset identifier handed to `kaggle datasets download -d`.
    source_url : str
    # Path of the downloaded zip archive; its existence decides whether to download.
    local_file_name : str
    # Filled from `config.unzip_dir`; presumably the extraction target -- confirm with callers.
    unzip_dir : DirectoryPath

In [3]:
# NOTE(review): this cell re-declares DataIngestionConfig with the same four
# fields as the previous cell and depends on that cell's imports of
# `dataclass` and `DirectoryPath`; the later definition is the one in effect.
@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion stage (pydantic dataclass)."""
    # Working directory of the stage; the Kaggle download is pointed at it.
    root_dir : DirectoryPath
    # Dataset identifier handed to `kaggle datasets download -d`.
    source_url : str
    # Path of the downloaded zip archive; its existence decides whether to download.
    local_file_name : str
    # Filled from `config.unzip_dir`; presumably the extraction target -- confirm with callers.
    unzip_dir : DirectoryPath


In [None]:
# Manual download of the gpiosenka/100-bird-species dataset through the Kaggle CLI;
# `-p` points the download at artifacts/data_ingestion/.
!kaggle datasets download -d gpiosenka/100-bird-species -p artifacts/data_ingestion/

In [None]:
import os 
# Create the ingestion directory (and any missing parents). `exist_ok=True`
# keeps the cell re-runnable: without it a second run, or a run after the
# directory was created elsewhere, raises FileExistsError.
os.makedirs("artifacts/data_ingestion", exist_ok=True)

In [None]:
# The archive is downloaded into artifacts/data_ingestion/ (see the `-p` option of
# the kaggle command), so it must be read from there rather than from the cwd.
!unzip artifacts/data_ingestion/100-bird-species.zip -d artifacts/data_ingestion/

In [None]:
# Delete the downloaded archive from the directory the kaggle command saved it to.
!rm artifacts/data_ingestion/100-bird-species.zip

In [6]:
from BirdClassifier.constants import *
from BirdClassifier.utils import read_yaml , create_directories  
from BirdClassifier.logger import logger

In [7]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(self ,
                 config_file_path = CONFIG_FILE_PATH ,
                 param_file_path = PARAMS_FILE_PATH) -> None:
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_file_path: Location of the main configuration YAML.
            param_file_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(param_file_path)

        artifacts_root = self.config.artifacts_root
        logger.info(f"Artifacts dir : {artifacts_root}")
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion root directory and return the stage settings.

        Returns:
            A DataIngestionConfig populated from the ``data_ingestion`` section
            of the loaded configuration.
        """
        ingestion_section = self.config.data_ingestion
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_url=ingestion_section.source_url,
            local_file_name=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )

In [8]:
# Build the manager with its default config/params paths; the constructor reads
# both YAML files and creates the artifacts root directory.
config = ConfigurationManager()

2022-09-29 19:05:15.870 | INFO     | BirdClassifier.utils.common:read_yaml:29 - yaml file: configs/config.yaml loaded successfully
2022-09-29 19:05:15.871 | INFO     | BirdClassifier.utils.common:read_yaml:29 - yaml file: params.yaml loaded successfully
2022-09-29 19:05:15.871 | INFO     | __main__:__init__:7 - Artifacts dir : artifacts
2022-09-29 19:05:15.872 | INFO     | BirdClassifier.utils.common:create_directories:47 - created directory at: artifacts


In [9]:
# Create the ingestion root directory and build the DataIngestionConfig; as the
# cell's last expression, the resulting object is rendered as the cell output.
config.get_data_ingestion_config()

2022-09-29 19:05:18.590 | INFO     | BirdClassifier.utils.common:create_directories:47 - created directory at: artifacts/data_ingestion


DataIngestionConfig(root_dir=PosixPath('artifacts/data_ingestion'), source_url='gpiosenka/100-bird-species', local_file_name='artifacts/data_ingestion/100-bird-species.zip', unzip_dir=PosixPath('artifacts/data_ingestion'))

In [8]:
import os 
import urllib.request as request 
from zipfile import ZipFile
import kaggle
import subprocess
from BirdClassifier.logger import logger

In [9]:
class DataIngestion:
    """Stage-one data ingestion: download the Kaggle archive and extract it."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the stage settings and log that the stage has started.

        Args:
            config: Settings providing ``source_url``, ``root_dir``,
                ``local_file_name`` and ``unzip_dir``.
        """
        self.config = config
        logger.info(f'{"#"*10} STAGE ONE DATA INEGESTION STARTED {"#"*10}')

    def download_file(self):
        """Download the dataset archive with the Kaggle CLI if it is not on disk yet.

        Nothing happens when ``config.local_file_name`` already exists.

        Raises:
            subprocess.CalledProcessError: If the ``kaggle`` command exits with
                a non-zero status (previously such failures went unnoticed and
                only surfaced later as a missing archive).
        """
        if os.path.exists(self.config.local_file_name):
            return

        logger.info(f" downloading data from kaggle {self.config.source_url}")
        command = [
            'kaggle', 'datasets', 'download',
            '-d', self.config.source_url,
            # str() so a pathlib.Path root_dir is passed as a plain argument.
            '-p', str(self.config.root_dir),
        ]
        # check=True turns a failed download into an immediate, explicit error.
        subprocess.run(command, check=True)

    def unzip_and_clean(self):
        """Extract every member of the downloaded archive into ``config.unzip_dir``.

        The list of archive members is logged before extraction. Despite the
        method name, no filtering or clean-up of the extracted files is done.
        """
        with ZipFile(file=self.config.local_file_name, mode="r") as zf:
            list_of_files = zf.namelist()
            logger.info(f'folder and data description file : {list_of_files}')
            # Use the configured extraction directory instead of root_dir, so
            # the `unzip_dir` setting actually takes effect.
            zf.extractall(path=self.config.unzip_dir)
            

In [None]:
# Run the ingestion stage end to end: build the config, download, then extract.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.unzip_and_clean()
except Exception as e:
    # Record the failure with its traceback in the project log, then re-raise
    # with a bare `raise` so the original traceback is propagated as-is.
    logger.exception(e)
    raise