In [2]:
import os
%pwd  # this tell us which path we are currently working , so based on the below output path we are working under the research file
os.chdir("C:\datascience End to End Projects\End-to-End-Heart-Disease-Application-")  #  but i would like to work with main ProjectML_with_MLFlow file , so for getting i step back in path inorder to enter the main project file i used this command os.chdir("../")

In [3]:
# Entity: the typed configuration object that the configuration manager returns.
from dataclasses import dataclass # here i imported the dataclass from the dataclasses
from pathlib import Path  # here i imported path from pathlib

# The entity for this stage: an immutable record of the data-ingestion settings.
# ConfigurationManager.get_data_ingestion_config builds and returns one of these.
@dataclass(frozen=True)  # frozen=True: fields cannot be reassigned after creation
class DataIngestionConfig:
    """Read-only bundle of the settings used by the data-ingestion stage.

    Being a dataclass, the constructor is generated from the annotated fields
    below, so no explicit ``__init__`` is written.
    NOTE(review): dataclasses do not enforce annotations; the values are taken
    straight from the loaded config and may be plain strings rather than Path.
    """
    # Directory created before ingestion; train.csv / test.csv are written here.
    root_dir: Path
    # URL the raw data archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is saved to (and later opened as a zip).
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
    # Declared locations of the split files (not read by the code in this notebook).
    train_csv: Path
    test_csv: Path

In [4]:
from pathlib import Path

# Locations of the project's YAML files, relative to the current working directory.
CONFIG_FILE_PATH = Path("config", "config.yaml")  # config/config.yaml
PARAMS_FILE_PATH = Path("params.yaml")  # params.yaml
SCHEMA_FILE_PATH = Path("schema.yaml")  # schema.yaml

In [5]:
from PROJECTML.constants import * # here iam importing everthing which is present in the constants->__init__.py file into inside the data_ingestion.ipynb
from PROJECTML.utils.common import read_yaml, create_directories # here iam importing the read_yaml, create_directories which are presenting inside the utils,common files into PROJECTML in which the file is data_ingestion.ipynb

In [6]:
class ConfigurationManager:
    """Loads the project's YAML files and builds the per-stage config entities."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main configuration YAML.
            params_filepath: path of the parameters YAML.
            schema_filepath: path of the schema YAML.
        """
        # Files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(config_filepath),
            read_yaml(params_filepath),
            read_yaml(schema_filepath),
        )

        # Make sure the top-level artifacts directory named in the config exists.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig entity from the ``data_ingestion`` config section.

        The stage's root directory is created before the entity is built.

        Returns:
            A DataIngestionConfig whose fields are copied from the config section.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # Every entity field has an identically named key in the config section.
        field_names = (
            "root_dir",
            "source_URL",
            "local_data_file",
            "unzip_dir",
            "train_csv",
            "test_csv",
        )
        return DataIngestionConfig(
            **{name: getattr(section, name) for name in field_names}
        )

In [7]:
# These are the libraries needed by the data-ingestion component.
import os
import urllib.request as request # so i use the request to download the data from the URL
import zipfile # here iam using the Zipfile to transform the data 
from PROJECTML import logger # here logger is used to logger the data 
from PROJECTML.utils.common import get_size # here i used the getsize is used to get to know the file size


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# The DataIngestion class takes a DataIngestionConfig, which tells it the URL to download from and the paths to read and write.
class DataIngestion:
    """Downloads the raw archive, extracts it and writes a train/test split."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for use by the methods below.

        Args:
            config: supplies source_URL, local_data_file, unzip_dir and root_dir.
        """
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless the file already exists.

        When the file is already present nothing is downloaded; its size is
        logged instead. Returns None.
        """
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = self.config.local_data_file
            )
            logger.info(f"{filename} download! with following info: \n{headers}")
        else:
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")

    def extract_zip_file(self):
        """Extract the archive at ``local_data_file`` into ``unzip_dir``.

        The target directory is created if it does not exist. Takes no
        arguments beyond ``self`` and returns None.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

    def train_test_spliting(self, data_path=None, test_size=0.20, random_state=2):
        """Split the extracted CSV into train and test sets and save both.

        Args:
            data_path: CSV file to read. Defaults to
                artifacts/data_ingestion/Heart_csv/heart.csv, the same location
                as before but assembled with ``os.path.join`` so it is valid on
                every OS and contains no invalid backslash escapes.
            test_size: fraction of rows placed in the test set (default 0.20,
                i.e. an 80/20 split).
            random_state: seed that makes the split reproducible (default 2).

        Writes ``train.csv`` and ``test.csv`` into ``config.root_dir`` without
        the index column. Returns None.
        """
        if data_path is None:
            # NOTE(review): this default is not derived from config.unzip_dir;
            # confirm whether it should follow the configured extraction directory.
            data_path = os.path.join("artifacts", "data_ingestion", "Heart_csv", "heart.csv")

        data = pd.read_csv(data_path)
        train, test = train_test_split(data, test_size=test_size, random_state=random_state)

        # Output names are fixed; config.train_csv / config.test_csv are not consulted here.
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

In [9]:
# Run the data-ingestion pipeline step by step: create the ConfigurationManager,
# ask it for the DataIngestionConfig, hand that to DataIngestion, then
# download -> extract -> split.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)  # pass the ingestion settings
    data_ingestion.download_file()  # fetch the archive (skipped if already present)
    data_ingestion.extract_zip_file()  # unpack it into the configured directory
    data_ingestion.train_test_spliting()  # write train.csv / test.csv
except Exception:
    # Bare raise propagates the active exception unchanged, with its original traceback.
    raise

[2024-02-26 18:45:34,912: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-26 18:45:34,917: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-26 18:45:34,922: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-26 18:45:34,924: INFO: common: created directory at: artifacts]
[2024-02-26 18:45:34,926: INFO: common: created directory at: artifacts/data_ingestion]
[2024-02-26 18:45:34,927: INFO: 3653406326: File already exists of size: ~ 9 KB]
