In [10]:
import os
from pathlib import Path
os.chdir(Path("E:\FSDS_NOV\ML-WaferFault-Detection"))

In [11]:
from collections import namedtuple
from waferFaultDetection.constants import *
from waferFaultDetection.utils import read_yaml,create_directories
from waferFaultDetection import logger

In [18]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    '''
    Define the resources required for the data ingestion
    such as URL, dataset file name, directory details
    '''
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path
    
# DataIngestionConfig = namedtuple("DataIngestionConfig",[
#     "root_dir",
#     "source_URL",
#     "local_data_file",
#     "unzip_dir"
# ])

# @dataclass(frozen=True)
# class DataValidationConfig:
#     root_dir: Path

In [19]:
from waferFaultDetection import logger
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
        ):

        logger.info("reading yaml files for configs and parameters")
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        logger.info("creating directory for artifacts")
        create_directories([self.config.artifacts_root])
        logger.info("artifacts directory created")

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        logger.info("creating data ingestion root directory")
        create_directories([config.root_dir])

        logger.info('creating data ingestion configuration')
        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir
        )
        return data_ingestion_config

    # def get_data_validation_config(self)->DataValidationConfig:
    #     config = self.config.data_validation
    #     logger.info('creating data validation configuration')
    #     data_validation_config = DataValidationConfig(
    #         root_dir=config.root_dir
    #     )
    #     return data_validation_config

In [20]:
import os
import json
from waferFaultDetection import logger
from tqdm import tqdm
import re
import shutil
from pathlib import Path
import pandas as pd


class DataValidation:
    def __init__(
        self,
        config:DataIngestionConfig
        ):
        self.config = config

    def _create_schema_train(self):
        root_dir = self.config.root_dir
        schema_train_dict = {
            "SampleFileName":"wafer_31122020_000000.csv",
            "LengthOfDateStamp":8,
            "LengthOfTimeStamp":6,
            "NumOfCols":592,
            "Columns":{}
        }
        numCols = schema_train_dict['NumOfCols']
        for i in tqdm(range(numCols)):
            if i == 0:
                schema_train_dict['Columns']['wafer'] = "str"
            elif i == numCols - 1:
                schema_train_dict['Columns']['Output'] = "int"
            else:
                col = "Sensor - "+str(i)
                schema_train_dict['Columns'][col] = "float"
        json_out = os.path.join(root_dir,"schema_train.json")
        # with open(json_out,"w") as outfile:
        #     json.dump(schema_train_dict,outfile)
        return schema_train_dict
        

    def _validate_file_names(self):
        root_dir = self.config.root_dir
        # dirpath,dirnames,list_of_files = os.walk(root_dir)
        logger.info("Finding the training files directory")
        dirs = [f for f in os.listdir(root_dir) if 'train' in f.lower()]
        if len(dirs) == 1:
            train_dir = os.path.join(root_dir,dirs[0])
        else:
            raise ValueError(print(dirs))
        logger.info("Finding the training data with .csv extension")
        list_of_files = [f for f in os.listdir(train_dir) if f.endswith('.csv')]
        good_files_dir = Path(os.path.join(train_dir,"good_files_dir"))
        bad_files_dir = Path(os.path.join(train_dir,"bad_files_dir"))
        logger.info("Creating good and bad files directories")
        if not os.path.exists(good_files_dir):
            os.makedirs(good_files_dir)
        if not os.path.exists(bad_files_dir):
            os.makedirs(bad_files_dir)
        logger.info("Compare file names and move them to good or bad files directory")
        regex = 'wafer_'+"\d{8}"+"_"+"\d{6}"+".csv"
        for file in tqdm(list_of_files):
            # dir,fname = os.path.split(file)
            if re.match(regex,file.lower()):
                shutil.move(os.path.join(train_dir,file),os.path.join(good_files_dir,file))
            else:
                shutil.move(os.path.join(train_dir,file),os.path.join(bad_files_dir,file))

        self.train_dir = train_dir
        self.good_files_dir = good_files_dir
        self.bad_files_dir = bad_files_dir

    def _validate_columns(self):
        good_files_dir = self.good_files_dir
        bad_files_dir = self.bad_files_dir
        logger.info("create train data schema")
        schema_train_dict = self._create_schema_train()
        logger.info("checking the number of columns in each file and moving to bad_files_dir if not met")
        for file in tqdm(os.listdir(good_files_dir)):
            # logger.info(f"pandas reading file: {file} in {good_files_dir}")
            df = pd.read_csv(os.path.join(good_files_dir,file))
            if not len(df.columns) == schema_train_dict['NumOfCols']:
                logger.info(f"file: {file} moving into the bad_files_dir")
                shutil.move(os.path.join(good_files_dir,file),os.path.join(bad_files_dir,file))
            else:
                pass




In [21]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_ingestion_config() 
    data_validation = DataValidation(config=data_validation_config)
    data_validation._validate_file_names()
    data_validation._validate_columns()
except Exception as e:
    raise e

[2022-12-21 08:59:25,653: INFO: 2872640451]: reading yaml files for configs and parameters
[2022-12-21 08:59:25,657: INFO: common]: yaml file: configs\config.yaml loaded successfully
[2022-12-21 08:59:25,659: INFO: common]: yaml file: params.yaml loaded successfully
[2022-12-21 08:59:25,660: INFO: 2872640451]: creating directory for artifacts
[2022-12-21 08:59:25,662: INFO: common]: created directory at: artifacts
[2022-12-21 08:59:25,664: INFO: 2872640451]: artifacts directory created
[2022-12-21 08:59:25,664: INFO: 2872640451]: creating data ingestion root directory
[2022-12-21 08:59:25,666: INFO: common]: created directory at: artifacts/data_ingestion
[2022-12-21 08:59:25,667: INFO: 2872640451]: creating data ingestion configuration
[2022-12-21 08:59:25,668: INFO: 4797932]: Finding the training files directory
[2022-12-21 08:59:25,668: INFO: 4797932]: Finding the training data with .csv extension
[2022-12-21 08:59:25,670: INFO: 4797932]: Creating good and bad files directories
[2022

0it [00:00, ?it/s]

[2022-12-21 08:59:25,674: INFO: 4797932]: create train data schema



100%|██████████| 592/592 [00:00<00:00, 435237.15it/s]

[2022-12-21 08:59:25,677: INFO: 4797932]: checking the number of columns in each file and moving to bad_files_dir if not met



100%|██████████| 9/9 [00:00<00:00, 27.14it/s]
