* Data Ingestion - Fetch Data from Source


In [1]:
# import typing helpers
from typing import Optional

# import pandas
import pandas as pd

# import logger and config
from helpers.config import load_config
from helpers.logger import logger



class DataIngestion:
    """Fetch the dataset from the configured source and return it as a pd.DataFrame."""

    def __init__(self, config: Optional[dict] = None):
        """Initialize DataIngestion class.

        Args:
            config (dict, optional): Configuration mapping. ``get_data`` reads
                its ``'url_link'`` entry. When omitted or falsy (``None`` or an
                empty dict), the result of ``load_config()`` is used instead.
        """
        # `or` keeps the original fallback semantics: any falsy config
        # (not only None) is replaced by the loaded configuration.
        self.config = config or load_config()

    def get_data(self) -> Optional[pd.DataFrame]:
        """Read the comma-separated data found at ``config['url_link']``.

        Returns:
            Optional[pd.DataFrame]: The parsed data, or ``None`` when the read
            fails for any reason (the failure is logged, not raised).
        """
        try:
            return pd.read_csv(self.config['url_link'], delimiter=",")
        except Exception as e:
            # Best-effort by design: every failure (missing 'url_link' key,
            # unreachable source, parse error) is logged and reported to the
            # caller as None, so callers must check the result before use.
            logger.error(f"File does not exist or response error: {e}")
            return None

- `pd.read_csv` accepts a URL as well as a local file path, so as long as the link points to CSV-formatted content, the data is still retrieved.

In [2]:
# example
obj = DataIngestion(config=load_config())
data = obj.get_data()

# get_data() logs the error and returns None when the read fails; stop here
# with a clear message instead of an AttributeError on `None.head`.
if data is None:
    raise RuntimeError("Data ingestion failed; check the logged error for details.")

data.head(10)

Unnamed: 0,seq,s3,s4,s5,s6,s7,s9,s11,s13,s14,...,uria,rtdum,bd,mi,old,vr,school,chval,dnotown,dprop
0,2,1,1,1,88,1,1120,0,5,5.0,...,3.9,0,1,1,1,0,15.0,206.7355,0,0
1,3,1,1,1,118,1,1120,0,5,5.0,...,3.2,0,1,1,0,1,18.0,238.4943,0,0
2,7,1,1,1,185,1,1120,0,5,5.0,...,3.2,0,1,1,1,0,12.0,200.1977,0,0
3,9,1,1,1,185,1,1120,0,5,5.0,...,4.3,0,0,1,0,1,12.0,193.8895,0,0
4,10,1,1,1,330,1,1120,0,5,5.0,...,3.2,0,0,1,1,0,20.0,257.0874,0,0
5,11,1,1,1,97,1,1120,0,5,5.0,...,3.9,0,1,1,1,0,16.0,237.8392,0,0
6,12,1,1,1,56,1,1120,0,5,8.0,...,3.9,0,1,1,1,0,14.0,212.4823,0,0
7,14,1,1,1,187,1,1120,0,5,5.0,...,1.8,0,1,1,0,0,16.0,197.7294,0,0
8,15,1,1,1,131,3,1120,0,5,5.0,...,3.1,0,1,1,0,0,12.0,213.4624,0,0
9,20,1,1,1,80,1,1120,0,5,5.0,...,3.9,0,0,1,1,0,16.0,213.9908,0,0


- Cleaning is the next step