In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp DataFrameLoader

# Data Frame Loader

> An API to create training, validation and test datasets for Machine Learning models based on tabular or structured data

In [None]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [None]:
# export
# hide
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# export

class DataFrameLoader:
    """
    Load tabular data from CSV files and split it into train, validation and test frames.

    Attributes:
    X_full: full training dataframe loaded from the raw input
    X_test_full: full test dataframe loaded from the raw input
    X: features (rows with a missing target removed, target column dropped)
    y: target
    X_train, X_valid: training / validation features
    y_train, y_valid: training / validation targets
    X_test: test features restricted to `final_columns`
    categorical_cols: names of the selected categorical columns
    numerical_cols: names of the selected numerical columns
    final_columns: categorical_cols + numerical_cols
    """

    def __init__(self):
        self.X_full = None
        self.X_test_full = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_valid = None
        self.X_test = None
        self.y_train = None
        self.y_valid = None
        self.categorical_cols = None
        self.numerical_cols = None
        self.final_columns = None

    def __str__(self):
        """Returns human readable string representation"""
        return "DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid"

    def __repr__(self):
        return self.__str__()

    # load data from csv
    def read_csv(self, train_file_path:str, test_file_path:str, idx_col:str):
        """Read the train and test CSV files into `X_full` and `X_test_full`.

        `idx_col` is the name of the column used as the index of both frames.
        """
        self.X_full = pd.read_csv(train_file_path, index_col=idx_col)
        self.X_test_full = pd.read_csv(test_file_path, index_col=idx_col)

    # prepare X and y
    def prepare_X_y(self, input_df:object, target:str):
        """Split `input_df` into features `X` and target `y`.

        Rows whose `target` value is missing are removed from BOTH `X` and `y`,
        so the two always share the same index. `input_df` is not modified.
        """
        # Remove rows with missing target
        labelled = input_df.dropna(axis=0, subset=[target])
        # separate target from predictors
        self.y = labelled[target]
        # drop target from the *filtered* frame so X stays aligned with y
        self.X = labelled.drop([target], axis=1)

    # split X and y into X_train, y_train, X_valid & y_valid dataframes
    def prepare_train_valid(self, X:object, y:object, valid_size:float, random_state=42):
        """Split `X` / `y` into train and validation parts.

        `valid_size` is the fraction of rows put into the validation set.
        Passing `None` for `X` or `y` falls back to `self.X` / `self.y`.
        """
        features = self.X if X is None else X
        labels = self.y if y is None else y
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            features, labels, train_size=(1-valid_size), test_size=valid_size,
            random_state=random_state)

    # select categorical columns
    def select_categorical_cols(self, max_cardinality:int=10):
        """Store in `categorical_cols` the object-dtype columns of `X_train`
        that have fewer than `max_cardinality` unique values."""
        self.categorical_cols = [cname for cname in self.X_train.columns if
                    self.X_train[cname].nunique() < max_cardinality and
                    self.X_train[cname].dtype == "object"]
        #TODO: separate categorical columns into one-hot eligible cols for low cardinality
        # and ordinal cols for high cardinality

    # select numerical columns
    def select_numerical_cols(self):
        """Store in `numerical_cols` the int64 / float64 columns of `X_train`."""
        self.numerical_cols = [cname for cname in self.X_train.columns if
                self.X_train[cname].dtype in ['int64', 'float64']]

    # prepare X_train, X_valid from selected columns
    def prepare_X_train_X_valid(self):
        """Restrict `X_train`, `X_valid` and the test frame to the selected columns.

        The column selection is computed on `X_train` only and then applied to
        all three frames; `X_test` is built from `X_test_full`.
        """
        self.select_categorical_cols()
        self.select_numerical_cols()
        self.final_columns = self.categorical_cols + self.numerical_cols
        self.X_train = self.X_train[self.final_columns].copy()
        self.X_valid = self.X_valid[self.final_columns].copy()
        self.X_test = self.X_test_full[self.final_columns].copy()

    # get train and valid dataframe
    def from_csv(self, train_file_path:str, test_file_path:str, idx_col:str, target:str, valid_size:float, random_state=42):
        """Run the whole pipeline: read the CSVs, build X/y, split, select columns.

        Returns the loader itself so the call can be chained after the constructor.
        """
        self.read_csv(train_file_path, test_file_path, idx_col)
        self.prepare_X_y(self.X_full, target)
        self.prepare_train_valid(self.X, self.y, valid_size, random_state)
        self.prepare_X_train_X_valid()
        return self

In [None]:
# render the API documentation for DataFrameLoader.from_csv
show_doc(DataFrameLoader.from_csv)

<h4 id="DataFrameLoader.from_csv" class="doc_header"><code>DataFrameLoader.from_csv</code><a href="__main__.py#L82" class="source_link" style="float:right">[source]</a></h4>

> <code>DataFrameLoader.from_csv</code>(**`train_file_path`**:`str`, **`test_file_path`**:`str`, **`idx_col`**:`str`, **`target`**:`str`, **`valid_size`**:`float`, **`random_state`**=*`42`*)



*Let's load the [Ames, Iowa home sale price raw data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [None]:
# build the loader end-to-end from the raw train/test CSV files
dfl = DataFrameLoader().from_csv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    valid_size=0.2,
)
dfl

DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

In [None]:
# show the shapes of the raw train and test dataframes loaded by read_csv()
print("X_full shape:", dfl.X_full.shape)
print("X_test_full shape:", dfl.X_test_full.shape)

X_full shape: (1460, 80)
X_test_full shape: (1459, 79)


Test for X_full and X_test_full

In [None]:
# test read_csv(): the raw train frame has one more column than the raw test frame
assert dfl.X_full.shape == (1460,80)
assert dfl.X_test_full.shape == (1459,79)

In [None]:
# show the shapes of the features X and the target y produced by prepare_X_y()
print("homedata X shape:", dfl.X.shape)
print("homedata y shape", dfl.y.shape)

homedata X shape: (1460, 79)
homedata y shape (1460,)


Test for X and y

In [None]:
# test prepare_X_y(): the target column is removed from X (80 -> 79 columns)
# and y holds one value per row of X
assert dfl.X.shape == (1460,79)
assert dfl.y.shape == (1460,)

In [None]:
# show the shapes of X_train, X_valid, y_train and y_valid
# (after the train/validation split and the column selection)
print("homedata X_train shape:", dfl.X_train.shape)
print("homedata y_train shape", dfl.y_train.shape)
print("homedata X_valid shape:", dfl.X_valid.shape)
print("homedata y_valid shape", dfl.y_valid.shape)

homedata X_train shape: (1168, 76)
homedata y_train shape (1168,)
homedata X_valid shape: (292, 76)
homedata y_valid shape (292,)


Test for X_train, y_train, X_valid and y_valid

In [None]:
# test prepare_train_valid() and prepare_X_train_X_valid():
# valid_size=0.2 splits the 1460 rows into 1168 train / 292 validation rows,
# and the column selection keeps 76 of the 79 feature columns
assert dfl.X_train.shape == (1168,76)
assert dfl.y_train.shape == (1168,)
assert dfl.X_valid.shape == (292,76)
assert dfl.y_valid.shape == (292,)

In [None]:
# hide
# run the script to build 

from nbdev.export import notebook2script

# export the `# export` cells of the project notebooks to the library modules
notebook2script()

Converted 00_DataFrameLoader.ipynb.
Converted 01_PreProcessor.ipynb.
Converted 03_MLPipeline.ipynb.
Converted index.ipynb.
