In [1]:
import os
from ensure import ensure_annotations

In [2]:
# Move the working directory up one level so relative paths resolve from the parent folder.
# NOTE: not idempotent - re-running this cell moves up one more level each time.
os.chdir("../")

In [3]:
# Print the current working directory to confirm the chdir above took effect.
!pwd

/Users/ngkuissi/Dev/Image_Search_Engine


In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataCleaningConfig:
    '''Immutable settings describing what the data-cleaning step deletes.'''
    # Directory tree that is removed in full (handed to shutil.rmtree).
    remove_folder_dir: Path
    # Directory whose immediate sub-folders are scanned for files to delete.
    remove_train_file_dir: Path
    # Extension of the files to delete, written without the leading dot
    # (the cleaning code prepends '.' before matching).
    remove_file_extention: str
    # Path of the single file that is deleted with os.remove.
    remove_zip_dir: Path


In [5]:
from imageSearchEngine.constants import *
from imageSearchEngine.utils.file_helpers import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    '''Loads the YAML configuration and builds typed config objects from it.'''

    @ensure_annotations
    def __init__(
        self,
        config_filepath:Path = CONFIG_FILE_PATH,
        params_filepath:Path = PARAMS_FILE_PATH):
        '''
        Read the configuration file and hand its artifacts root to
        create_directories.

        Args:
            config_filepath: location of the YAML configuration file.
            params_filepath: location of the params YAML file. Accepted so the
                signature stays stable, but it is currently not read.
        '''
        self.config = read_yaml(config_filepath)
        # Params are not loaded yet; kept here for when they are needed.
        #self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    @ensure_annotations
    def get_clean_data_config(self) -> DataCleaningConfig:
        '''
        Build the data-cleaning settings from the `data_cleaning` section of
        the loaded configuration.

        The three path entries are converted to `Path` so the returned object
        actually matches the field types declared on DataCleaningConfig
        (the raw configuration values are passed through as-is otherwise).

        Returns:
            DataCleaningConfig: frozen settings for the cleaning step.
        '''
        section = self.config.data_cleaning

        return DataCleaningConfig(
            remove_folder_dir=Path(section.remove_folder_dir),
            remove_train_file_dir=Path(section.remove_train_file_dir),
            remove_file_extention=section.remove_file_extention,
            remove_zip_dir=Path(section.remove_zip_dir)
        )

In [7]:
import os
import shutil
from imageSearchEngine.logging.logger import log
from imageSearchEngine.exception import CustomException

In [8]:
class DataCleaning:
    '''Deletes the folder, zip file and train files named in a DataCleaningConfig.'''

    @ensure_annotations
    def __init__(self, config: DataCleaningConfig):
        '''
        Args:
            config: paths and file extension describing what to delete.
        '''
        self.config = config

    def clean(self):
        '''
        Delete the configured folder, the configured zip file, and every file
        with the configured extension found directly inside the sub-folders of
        the train directory.

        The folder and the zip file are deleted by this method, so when they
        are already absent the step is skipped and logged instead of failing.
        This lets the method be re-run after a partial failure. The train
        directory itself is expected to exist; a missing one is still an error.

        Raises:
            CustomException: wraps any exception raised while deleting.
        '''
        try:
            folder_dir = self.config.remove_folder_dir
            # lexists: anything present at the path (even a broken symlink) is
            # still handed to rmtree, so unexpected entries keep raising.
            if os.path.lexists(folder_dir):
                shutil.rmtree(folder_dir)
                log.info(f'Removed {folder_dir}')
            else:
                log.info(f'Skipped {folder_dir}: not found, nothing to remove')

            zip_file = self.config.remove_zip_dir
            if os.path.lexists(zip_file):
                os.remove(zip_file)
                log.info(f'Removed {zip_file}')
            else:
                log.info(f'Skipped {zip_file}: not found, nothing to remove')

            train_dir = self.config.remove_train_file_dir
            suffix = f'.{self.config.remove_file_extention}'
            for folder in os.listdir(train_dir):
                current_dir = os.path.join(train_dir, folder)
                # Stray files next to the sub-folders (e.g. .DS_Store) cannot
                # be listed and would abort the whole run.
                if not os.path.isdir(current_dir):
                    continue
                for item in os.listdir(current_dir):
                    item_path = os.path.join(current_dir, item)
                    # Only regular files are removed; a directory whose name
                    # happens to end with the suffix is left alone.
                    if item.endswith(suffix) and os.path.isfile(item_path):
                        os.remove(item_path)
            log.info(f'remove all the files ending with {self.config.remove_file_extention} on the directory {self.config.remove_train_file_dir}')
        except Exception as e:
            raise CustomException(e) from e

        

In [9]:
# Run the cleaning step end to end: load the configuration, build the
# cleaning settings, and delete the configured folder, zip file and files.
try:
    config = ConfigurationManager()
    # NOTE(review): despite its name, this variable holds the data *cleaning* config.
    data_ingestion_config = config.get_clean_data_config()
    data_cleaning = DataCleaning(config=data_ingestion_config)
    data_cleaning.clean()
except Exception as e:
    # NOTE(review): clean() already raises CustomException, so a failure there
    # gets wrapped in a second CustomException here.
    raise CustomException(e)

[2023-09-11 16:43:06,107: INFO: file_helpers: yaml file: config/config.yaml loaded successfully]
[2023-09-11 16:43:06,108: INFO: file_helpers: created directory at: artifacts]
[2023-09-11 16:43:06,788: INFO: 659246889: Removed artifacts/data_ingestion/tiny-imagenet-200/test]
[2023-09-11 16:43:06,789: INFO: 659246889: Removed artifacts/data_ingestion/tinyimagenettorch.zip]
[2023-09-11 16:43:06,914: INFO: 659246889: remove all the files ending with txt on the directory artifacts/data_ingestion/tiny-imagenet-200/train]
