In [1]:
# default_exp core

# Data Frame Loader

> An API to create training, validation and test datasets for Machine Learning models based on tabular or structured data

In [2]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [3]:
# export
import pandas as pd

In [4]:
# export

class DataFrameLoader:
    """
    Load tabular data from CSV files and split a dataframe into features and target.

    Attributes:
    X_full: full training dataframe loaded from raw input (None until `from_csv` is called)
    X_test: full test dataframe loaded from raw input (None until `from_csv` is called)
    X: features (None until `prepare_X_y` is called)
    y: target (None until `prepare_X_y` is called)
    """

    def __init__(self):
        """Create an empty loader; every attribute starts as None."""
        self.X_full = None
        self.X_test = None
        self.X = None
        self.y = None

    def __str__(self):
        """Returns human readable string representation"""
        return "DataFrameLoader object with X_full, X_test and X(features) dataframe and y(target) series"

    def __repr__(self):
        """Returns the same text as `__str__`."""
        return self.__str__()

    # core methods
    def from_csv(self, train_file_path:str, test_file_path:str, idx_col:str):
        """
        Read the training and test CSV files into `X_full` and `X_test`.

        Args:
        train_file_path: location of the training CSV, handed to `pd.read_csv`
        test_file_path: location of the test CSV, handed to `pd.read_csv`
        idx_col: column used as the row index of both dataframes
            (passed to `pd.read_csv` as `index_col`)

        Returns:
        None; the loaded dataframes are stored on the instance.
        """
        self.X_full = pd.read_csv(train_file_path, index_col=idx_col)
        self.X_test = pd.read_csv(test_file_path, index_col=idx_col)

    def prepare_X_y(self, input_df:object, target:str):
        """
        Split `input_df` into features (`self.X`) and target (`self.y`).

        Rows whose target value is missing are removed from both `X` and `y`,
        so the two always share the same index. `input_df` itself is not modified.

        Args:
        input_df: dataframe that contains the `target` column
        target: name of the column to predict

        Returns:
        None; the results are stored in `self.X` and `self.y`.
        """
        # Remove rows with missing target
        labelled = input_df.dropna(axis=0, subset=[target])
        # separate target from predictors
        self.y = labelled[target]
        # Drop the target from the *filtered* frame (not from input_df), otherwise
        # X would keep the rows that were just removed from y.
        self.X = labelled.drop([target], axis=1)

In [5]:
# create DataFrameLoader object; the bare name on the last line makes the
# notebook display its repr
dfl = DataFrameLoader()
dfl

DataFrameLoader object with X_full, X_test and X(features) dataframe and y(target) series

In [6]:
# render the documentation entry (header and signature) for from_csv
show_doc(DataFrameLoader().from_csv)

<h4 id="DataFrameLoader.from_csv" class="doc_header"><code>DataFrameLoader.from_csv</code><a href="__main__.py#L32" class="source_link" style="float:right">[source]</a></h4>

> <code>DataFrameLoader.from_csv</code>(**`train_file_path`**:`str`, **`test_file_path`**:`str`, **`idx_col`**:`str`)



In [8]:
# load kaggle melbourne home sales price data
# NOTE(review): "melbourne" may be a mislabel — confirm which Kaggle dataset
# lives under input/home_data.
# NOTE(review): test_file_path points at the same train.csv as train_file_path,
# so X_test is a second copy of the training data. Confirm whether a separate
# test CSV was intended; the shape assertions in the from_csv test cell would
# need updating if it changes.
dfl.from_csv(
    train_file_path='input/home_data/train.csv',
    test_file_path='input/home_data/train.csv',
    idx_col='Id')
# show shape of dataframes
print("X_full shape:", dfl.X_full.shape)
print("X_test_full shape:", dfl.X_test.shape)

X_full shape: (1460, 80)
X_test_full shape: (1460, 80)


In [10]:
# test for from_csv(): both loaded frames must have 1460 rows and 80 columns
# (the index column is not counted among the columns)
assert dfl.X_full.shape == (1460,80)
assert dfl.X_test.shape == (1460,80)

In [11]:
# render the documentation entry (header and signature) for prepare_X_y
show_doc(DataFrameLoader().prepare_X_y)

<h4 id="DataFrameLoader.prepare_X_y" class="doc_header"><code>DataFrameLoader.prepare_X_y</code><a href="__main__.py#L37" class="source_link" style="float:right">[source]</a></h4>

> <code>DataFrameLoader.prepare_X_y</code>(**`input_df`**:`object`, **`target`**:`str`)



In [12]:
# Prepare X and y for Melbourne Home Sale Price data:
# 'SalePrice' becomes the target series y, the remaining columns become X
dfl.prepare_X_y(input_df=dfl.X_full, target='SalePrice')

# show shape of X and y
print("homedata_X shape:", dfl.X.shape)
print("homedata_y shape", dfl.y.shape)


homedata_X shape: (1460, 79)
homedata_y shape (1460,)


In [13]:
# test prepare_X_y(): X has one column fewer than X_full (the target was
# dropped) and y is a 1-D series with one entry per row of X
assert dfl.X.shape == (1460,79)
assert dfl.y.shape == (1460,)