In [1]:
import os, sys
from pathlib import Path
import urllib
from textSummarizer.logging import logger
from zipfile import ZipFile

In [2]:
# Show the kernel's current working directory (plain Python, no IPython automagic needed).
os.getcwd()

'C:\\Users\\papu_\\OneDrive\\Desktop\\University Courses\\3rd Semester\\AWS ML Speciality\\Text-Summarizer-AWS-Deployment\\research'

In [3]:
# Step up to the project root only while the kernel is still inside the
# notebook folder ("research"), so re-running this cell cannot climb above
# the project directory.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the change (plain Python, no IPython automagic needed).
os.getcwd()

'C:\\Users\\papu_\\OneDrive\\Desktop\\University Courses\\3rd Semester\\AWS ML Speciality\\Text-Summarizer-AWS-Deployment'

In [5]:
# Entity: frozen dataclass holding the data-ingestion settings
from dataclasses import dataclass
import numpy as np

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of settings for the data-ingestion step.

    The dataclass is frozen, so attributes cannot be reassigned once an
    instance exists. The field order below is also the positional order of
    the generated constructor. Annotations are not enforced at runtime.
    """

    # Directory that ConfigurationManager creates for this stage.
    root_dir: Path
    # Passed as the ``url`` argument when DataIngestion downloads the data.
    source_dir: str
    # Location of the downloaded archive on disk.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path


In [6]:
# Quick sanity check: build a config from placeholder values and read one field back.
ad = DataIngestionConfig(root_dir='a', source_dir='b', local_data_file='c', unzip_dir='d')
ad.unzip_dir

'd'

In [7]:
# Configuration manager
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories, get_size

@dataclass
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Attributes:
        config_file_path: YAML file with the stage settings; defaults to
            ``CONFIG_FILE_PATH`` from ``textSummarizer.constants``.
        params_file_path: YAML file with the parameters; defaults to
            ``PARAMS_FILE_PATH`` from ``textSummarizer.constants``.

    After construction, ``self.config`` and ``self.params`` hold the parsed
    files, and the artifacts and data-ingestion root directories have been
    passed to ``create_directories``.
    """

    config_file_path: Path = CONFIG_FILE_PATH
    params_file_path: Path = PARAMS_FILE_PATH

    def __post_init__(self):
        """Parse both YAML files and create the base directories.

        ``__post_init__`` runs at the end of the dataclass-generated
        ``__init__``, so the generated constructor does not have to be
        overridden.
        """
        self.config = read_yaml(self.config_file_path)
        self.params = read_yaml(self.params_file_path)

        create_directories([self.config.artifacts_path, self.config.data_ingestion.root_dir])

    def getDataIngestionConfig(self) -> DataIngestionConfig:
        """Return the ``data_ingestion`` section as a ``DataIngestionConfig``.

        Filesystem entries are converted to ``Path`` and the source to ``str``
        so the stored values match the types the entity declares.
        """
        section = self.config.data_ingestion

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_dir=str(section.source_dir),
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )
    


In [12]:
# Components creation

class DataIngestion:
    """Download the dataset archive and unpack it.

    Reads three values from the supplied config: ``source_dir`` (used as the
    download URL), ``local_data_file`` (where the archive is stored) and
    ``unzip_dir`` (where the archive is extracted).
    """

    # The annotation is quoted so defining this class does not require the
    # config class to be bound at definition time.
    def __init__(self, dic: "DataIngestionConfig"):
        self.dic = dic

    def download_data(self):
        """Fetch the archive unless ``local_data_file`` already exists.

        When the file is already present nothing is downloaded and only its
        size is logged.
        """
        # A bare ``import urllib`` does not load the ``request`` submodule,
        # so it is imported explicitly here.
        from urllib import request as url_request

        target = self.dic.local_data_file
        if os.path.exists(target):
            size = get_size(Path(target))
            logger.info('Data file {} already exists, with {}'.format(target, size))
            return

        # Make sure the destination folder exists before writing into it.
        Path(target).parent.mkdir(parents=True, exist_ok=True)
        filename, header = url_request.urlretrieve(url=self.dic.source_dir, filename=target)
        logger.info('Data file {} downloaded, with return header {}'.format(filename, header))

    def extract_zip(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``."""
        with ZipFile(self.dic.local_data_file, 'r') as archive:
            archive.extractall(path=self.dic.unzip_dir)
        logger.info('Zip file extracted at {}'.format(self.dic.unzip_dir))
        
        

In [13]:
# Run the ingestion stage end to end: load the settings, download the archive, unpack it.
# Any failure propagates as-is, so the traceback points at the call that failed.
config = ConfigurationManager()
dic = config.getDataIngestionConfig()
di = DataIngestion(dic)

di.download_data()
di.extract_zip()
    

[2023-08-16 15:57:33,438] @ [INFO] : common : Path to YAML config\config.yaml loaded correctly
[2023-08-16 15:57:33,440] @ [INFO] : common : Path to YAML params.yaml loaded correctly
[2023-08-16 15:57:33,441] @ [INFO] : common : Directory artifacts created correctly
[2023-08-16 15:57:33,443] @ [INFO] : common : Directory artifacts/data_ingestion created correctly
[2023-08-16 15:57:33,444] @ [INFO] : <ipython-input-12-532bbc926797> : Data file artifacts/data_ingestion/summarizer-data.zip already exists, with File size: 7718.353515625 kB
[2023-08-16 15:57:33,604] @ [INFO] : <ipython-input-12-532bbc926797> : Zip file extracted at artifacts/data_ingestion


In [60]:
from dataclasses import dataclass
from ensure import ensure_annotations

@dataclass(frozen=True)
class DataValidationConfig:
    """Read-only bundle of settings for the data-validation step.

    The dataclass is frozen, so attributes cannot be reassigned once an
    instance exists. The field order below is also the positional order of
    the generated constructor. Annotations are not enforced at runtime.
    """

    # Directory that ConfigurationManager creates for this stage.
    root_dir: Path
    # Parent directory of the 'samsum_dataset' folder that gets inspected.
    data_dir: Path
    # Despite the name, DataValidation opens this path as a file and writes the status into it.
    status_dir: str
    # File names that DataValidation looks for in the dataset folder.
    required_files: list


In [61]:
# Configuration manager
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories, get_size
from pathlib import Path

@dataclass
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Attributes:
        config_file_path: YAML file with the stage settings; defaults to
            ``CONFIG_FILE_PATH`` from ``textSummarizer.constants``.
        params_file_path: YAML file with the parameters; defaults to
            ``PARAMS_FILE_PATH`` from ``textSummarizer.constants``.

    This definition replaces any earlier ``ConfigurationManager`` in the
    kernel, so it offers the ingestion getter as well as the validation one;
    cells that call either keep working whichever definition ran last.
    """

    config_file_path: Path = CONFIG_FILE_PATH
    params_file_path: Path = PARAMS_FILE_PATH

    def __post_init__(self):
        """Parse both YAML files and create the base directories.

        ``__post_init__`` runs at the end of the dataclass-generated
        ``__init__``, so the generated constructor does not have to be
        overridden.
        """
        self.config = read_yaml(self.config_file_path)
        self.params = read_yaml(self.params_file_path)

        create_directories([self.config.artifacts_path, self.config.data_ingestion.root_dir])

    # The return annotation is quoted so this class can still be defined in a
    # kernel where the ingestion entity has not been created.
    def getDataIngestionConfig(self) -> "DataIngestionConfig":
        """Return the ``data_ingestion`` section as a ``DataIngestionConfig``."""
        section = self.config.data_ingestion

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_dir=str(section.source_dir),
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

    def getDataValidationConfig(self) -> DataValidationConfig:
        """Return the ``data_validation`` section as a ``DataValidationConfig``.

        The stage's root directory is created as a side effect.
        """
        section = self.config.data_validation
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=Path(section.root_dir),
            data_dir=Path(section.data_dir),
            status_dir=str(section.status_dir),
            required_files=section.required_files,
        )

In [68]:

class DataValidation:
    """Check that the extracted dataset contains every required file.

    Reads three values from the supplied config: ``data_dir`` (parent of the
    ``samsum_dataset`` folder), ``required_files`` (names to look for) and
    ``status_dir`` (path of the text file the result is written to).
    """

    # The annotation is quoted so defining this class does not require the
    # config class to be bound at definition time.
    def __init__(self, dic: "DataValidationConfig"):
        self.dic = dic

    def validate_data(self):
        """Verify the required files and record the outcome.

        The status is ``True`` only when every name in ``required_files`` is
        present in ``<data_dir>/samsum_dataset``. The status file is written
        once, after all names have been checked.

        Further validation could cover column-level checks, missing values
        and row-level error fixing.

        Returns:
            bool: the validation status.

        Raises:
            Exception: any error (for example a missing dataset folder) is
                logged and re-raised.
        """
        try:
            dataset_dir = os.path.join(self.dic.data_dir, 'samsum_dataset')
            present = set(os.listdir(dataset_dir))

            # Check every required name before deciding on the status.
            missing = [name for name in self.dic.required_files if name not in present]
            data_validation_status = not missing

            for name in missing:
                logger.warning('Required file missing: {}'.format(name))

            with open(self.dic.status_dir, 'w') as f:
                f.write('Data validation status: {}'.format(data_validation_status))

            logger.info('Data Validation status: {}'.format(data_validation_status))
            return data_validation_status

        except Exception as e:
            logger.error(e)
            raise
    
    

In [69]:
# Run the validation stage: load the settings, then check the required dataset files.
# Any failure propagates as-is, so the traceback points at the call that failed.
config = ConfigurationManager()
dic = config.getDataValidationConfig()
di = DataValidation(dic)
# As the cell's last expression, the value returned here is displayed.
di.validate_data()

[2023-08-16 22:09:39,308] @ [INFO] : common : Path to YAML config\config.yaml loaded correctly
[2023-08-16 22:09:39,312] @ [INFO] : common : Path to YAML params.yaml loaded correctly
[2023-08-16 22:09:39,316] @ [INFO] : common : Directory artifacts created correctly
[2023-08-16 22:09:39,316] @ [INFO] : common : Directory artifacts/data_ingestion created correctly
[2023-08-16 22:09:39,321] @ [INFO] : common : Directory artifacts/data_validation created correctly
[2023-08-16 22:09:39,324] @ [INFO] : <ipython-input-68-a47d829e1e0e> : Data Validation status: True
