# Stage 1 -
### Steps for Data Ingestion 

In [189]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [148]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [149]:
class ConfigurationManager:
    def __init__(self,config_filepath = CONFIG_FILE_PATH,params_filepath = PARAMS_FILE_PATH,schema_filepath = SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])
        
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir 
        )
        return data_ingestion_config

In [150]:
config = ConfigurationManager()

[2024-03-08 16:25:40,824: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-08 16:25:40,827: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-08 16:25:40,828: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-08 16:25:40,830: INFO: common: created directory at: artifacts]


In [151]:
config.config

ConfigBox({'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/pankajavasthi/E2EML_Project_EC2_Docker_MLFlow/research/winequality-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}})

In [152]:
config.config.data_ingestion

ConfigBox({'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/pankajavasthi/E2EML_Project_EC2_Docker_MLFlow/research/winequality-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'})

In [153]:
config.get_data_ingestion_config()

[2024-03-08 16:25:43,421: INFO: common: created directory at: artifacts/data_ingestion]


DataIngestionConfig(root_dir='artifacts/data_ingestion', source_URL='https://github.com/pankajavasthi/E2EML_Project_EC2_Docker_MLFlow/research/winequality-data.zip', local_data_file='artifacts/data_ingestion/data.zip', unzip_dir='artifacts/data_ingestion')

In [154]:
config.config

ConfigBox({'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/pankajavasthi/E2EML_Project_EC2_Docker_MLFlow/research/winequality-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}})

In [155]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config

In [156]:
data_ingestion_config = config.get_data_ingestion_config()

[2024-03-08 16:25:46,629: INFO: common: created directory at: artifacts/data_ingestion]


In [157]:
data_ingestion = DataIngestion(config=data_ingestion_config)

In [158]:
data_ingestion.config.local_data_file

'artifacts/data_ingestion/data.zip'

In [159]:
data_ingestion.config.source_URL

'https://github.com/pankajavasthi/E2EML_Project_EC2_Docker_MLFlow/research/winequality-data.zip'

In [176]:
data_ingestion.config.unzip_dir

'artifacts/data_ingestion'

import os
import urllib.request as request
import zipfile

In [183]:
#filename, headers = request.urlretrieve(data_ingestion.config.source_URL,data_ingestion.config.local_data_file)

In [184]:
unzip_path = data_ingestion.config.unzip_dir

In [185]:
os.makedirs(unzip_path, exist_ok=True)

In [186]:
with zipfile.ZipFile(data_ingestion.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

# Data Ingestion step completed

In [5]:
pwd()

'd:\\Pankaj Avasthi\\Pankaj Study Material\\Projects\\ML\\E2EML_Project_EC2_Docker_MLFlow'

## Data Validation steps Started

In [4]:
import os
os.chdir("../")

In [9]:
import pandas as pd
data = pd.read_csv("artifacts/data_ingestion/winequality-red.csv")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict