In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# default_exp dataframeloader

# Data Frame Loader

> An API to create training, validation and test datasets for Machine Learning models based on tabular or structured data

In [3]:
#hide
from nbdev.showdoc import *
from nbdev import *

In [4]:
# export
# hide
import pandas as pd
import gc
from sklearn.model_selection import train_test_split
import numpy as np
from tabular_ml_toolkit.logger import *
import time

In [5]:
# export
# hide

# make sure to pip install modin[ray]>=0.11.3
# #settings for modin
# import ray
# ray.init()
# import os
# os.environ["MODIN_ENGINE"] = "ray"
# import modin.pandas as pd

In [6]:
# export

class DataFrameLoader:
    """
    Load tabular data and expose it as feature / target / test frames.

    Attributes:
    X_full: raw training dataframe read by read_csv() (reset to None at the end of from_csv())
    X_test: test dataframe (None when no test file is given)
    X: features dataframe
    y: target values
    X_train, X_valid, y_train, y_valid: result of the last create_train_valid() call
    numerical_cols: names of columns whose dtype is listed in `numerics`
    low_card_cat_cols / high_card_cat_cols: names of object columns split by cardinality
    categorical_cols: low_card_cat_cols followed by high_card_cat_cols
    final_cols: numerical_cols followed by categorical_cols
    target: name of the target column
    """

    # Object columns with fewer distinct values than this are "low cardinality";
    # every other object column is "high cardinality".
    LOW_CARDINALITY_LIMIT = 10
    # How many copies of a singleton class row fix_least_class() appends.
    LEAST_CLASS_COPIES = 21

    def __init__(self):

        self.numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
        self.shape_X_full = None
        # initialised here so the attribute exists even when no test file is read
        self.shape_X_test = None
        self.X_full = None
        self.X_test = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_valid = None
        self.y_train = None
        self.y_valid = None
        self.use_num_cols = None
        self.use_cat_cols = None
        self.categorical_cols = None
        self.numerical_cols = None
        self.low_card_cat_cols = None
        self.high_card_cat_cols = None
        self.final_cols = None
        self.target = None

    def __str__(self):
        """Returns human readable string representation"""
        return "DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid"

    def __repr__(self):
        return self.__str__()

    # utility method
    # Idea taken from https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65/comments
    # Author ArjenGroen https://www.kaggle.com/arjanso
    def reduce_num_dtype_mem_usage(self, df, verbose=True):
        """
        Downcast numeric columns of `df` to the smallest dtype whose range
        strictly contains the column's min and max.

        The columns of `df` are reassigned in place and the same frame is returned.
        When `verbose` is true the memory reduction is logged.
        """
        start_mem = df.memory_usage().sum() / 1024 ** 2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in self.numerics:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                # first integer type that fits wins; if none fits the column is left as is
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                # NOTE(review): only the value range is checked, so float16/float32
                # downcasts can lose precision; confirm this is acceptable downstream.
                for candidate in (np.float16, np.float32):
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024 ** 2
        if verbose:
            # guard the percentage against a zero-byte baseline
            reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            logger.info(
                "DataFrame Memory usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                    end_mem, reduction
                )
            )
        return df

    # CORE METHODS
    # load data from csv
    def read_csv(self,train_file_path:str,test_file_path:str, idx_col:str, nrows:int):
        """
        Read the train and (optional) test CSV files into X_full / X_test.

        `idx_col` is used as the index column and `nrows` limits the rows read
        from each file. Both frames are passed through reduce_num_dtype_mem_usage().
        Returns self.
        """
        if train_file_path is not None:
            self.X_full = pd.read_csv(train_file_path, index_col=idx_col, nrows=nrows)
            self.shape_X_full = self.X_full.shape
            self.X_full = self.reduce_num_dtype_mem_usage(self.X_full, verbose=True)
        else:
            # Logger.warn is a deprecated alias; warning() is the supported name
            logger.warning(f"No valid train_file_path provided: {train_file_path}")

        if test_file_path is not None:
            self.X_test = pd.read_csv(test_file_path, index_col=idx_col, nrows=nrows)
            self.shape_X_test = self.X_test.shape
            self.X_test = self.reduce_num_dtype_mem_usage(self.X_test, verbose=True)
        else:
            logger.info("No test_file_path given, so training will continue without it!")
        return self

    # fixes least class having only 1 value hence breaking logic for train, val split
    def fix_least_class(self, X, target):
        """
        Pad the least frequent class of `X[target]` when it has a single row.

        If the rarest class occurs fewer than 2 times, LEAST_CLASS_COPIES copies
        of its row(s) are appended to X and the index is reset.
        Returns (y, X): the target values and the (possibly extended) frame.
        """
        y = X[target].values
        val_cnt = X[target].value_counts()
        if val_cnt.empty:
            # nothing to pad on an empty frame (or one with no countable target values)
            return y, X
        # value_counts() sorts by descending count, so the last entry is the rarest class
        least_class_label = val_cnt.index[-1]
        least_class_val = val_cnt.iloc[-1]
        # NOTE(review): only the single rarest class is padded; other classes that
        # also have one row are left untouched.
        # TODO see if you want to change 2 to other number
        if least_class_val < 2:
            logger.info(f"The least class label is :{least_class_label} and value count is: {least_class_val}")
            lowest_val_cnt_row = X[X[target] == least_class_label]
            logger.info(f"The Original X shape is: {X.shape}")
            # original rows first, then the duplicated rare rows, with a fresh index
            X = pd.concat([X] + [lowest_val_cnt_row] * self.LEAST_CLASS_COPIES,
                          axis=0, ignore_index=True)
            logger.info(f"The X shape after least class duplicates appends is: {X.shape}")
            y = X[target].values
        # trigger gc to clear the old X df from memory
        gc.collect()
        return y, X

    # prepare X and y
    def prepare_X_y(self,input_df:object, target:str, problem_type:str):
        """
        Build self.X (features) and self.y (target values) from `input_df`.

        Rows with a missing target are dropped. When `problem_type` contains
        "classification" the frame first goes through fix_least_class().
        Returns self.
        """
        # Remove rows with missing target
        self.X = input_df.dropna(axis=0, subset=[target])
        # set target in dfl
        self.target = target
        # separate target from predictors
        if "classification" in problem_type:
            self.y, self.X = self.fix_least_class(self.X, target)
        else:
            self.y = self.X[target].values
        # drop target
        self.X = self.X.drop([target], axis=1)
        gc.collect()
        return self

    # select categorical columns
    def select_categorical_cols(self, X:object):
        """
        Split the object-dtype columns of `X` into low and high cardinality lists.

        Columns with fewer than LOW_CARDINALITY_LIMIT distinct values are low
        cardinality; all remaining object columns (including those with exactly
        LOW_CARDINALITY_LIMIT values) are high cardinality.
        """
        # count distinct values once per object column, keeping column order
        cardinality = {
            cname: X[cname].nunique()
            for cname in X.columns
            if X[cname].dtype == "object"
        }
        # for low cardinality columns
        self.low_card_cat_cols = [
            cname for cname, n_unique in cardinality.items()
            if n_unique < self.LOW_CARDINALITY_LIMIT
        ]
        # for high cardinality columns; ">=" so no object column falls between the two lists
        self.high_card_cat_cols = [
            cname for cname, n_unique in cardinality.items()
            if n_unique >= self.LOW_CARDINALITY_LIMIT
        ]
        # for all categorical columns
        self.categorical_cols = self.low_card_cat_cols + self.high_card_cat_cols

    # select numerical columns
    def select_numerical_cols(self, X:object):
        """Store the names of the columns of `X` whose dtype is listed in self.numerics."""
        self.numerical_cols = [cname for cname in X.columns if
                X[cname].dtype in self.numerics]

    # prepare final columns by data type
    def prepare_final_cols(self, X):
        """Detect categorical and numerical columns of `X` and set self.final_cols."""
        self.select_categorical_cols(X)
        self.select_numerical_cols(X)

        num_cols, cat_cols = self.numerical_cols, self.categorical_cols
        # Both selectors above assign lists, so the first branch is the normal path;
        # the fallbacks are kept for the case where either attribute is None.
        if num_cols is not None and cat_cols is not None:
            self.final_cols = num_cols + cat_cols
        elif num_cols is not None:
            self.final_cols = num_cols
        elif cat_cols is not None:
            self.final_cols = cat_cols

    # prepare X_train, X_valid from selected columns
    def update_X_train_X_valid_X_test_with_final_cols(self, final_cols:object):
        """Restrict X_train, X_valid and (if present) X_test to `final_cols`."""
        self.X_train = self.X_train[final_cols]
        self.X_valid = self.X_valid[final_cols]
        if self.X_test is not None:
            self.X_test = self.X_test[final_cols]

    def update_X_y_with_final_cols(self,final_cols:object):
        """Restrict X and (if present) X_test to `final_cols`."""
        self.X = self.X[final_cols]
        if self.X_test is not None:
            self.X_test = self.X_test[final_cols]

    # split X and y into X_train, y_train, X_valid & y_valid dataframes
    def create_train_valid(self, X:object, y:object, valid_size:float, random_state=42):
        """
        Split `X` and `y` with train_test_split, holding out `valid_size` for validation.

        The four parts are stored on the loader and also returned as
        (X_train, X_valid, y_train, y_valid).
        """
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            X, y, train_size=(1-valid_size), test_size=valid_size, random_state=random_state)

        return self.X_train, self.X_valid, self.y_train, self.y_valid

    # load X, y and X_test from csv files
    def from_csv(self, train_file_path:str,
                 idx_col:str, target:str,
                 problem_type:str,
                 nrows:int=None,
                 test_file_path:str=None,
                 use_num_cols:bool=True,
                 use_cat_cols:bool=True,
                 random_state=42):
        """
        Read the CSV file(s) and populate X, y, X_test and the column lists.

        NOTE(review): `use_num_cols`, `use_cat_cols` and `random_state` are accepted
        but not used by this method; confirm whether they should filter final_cols.
        Returns self.
        """
        # read csv and load dataframes using pandas
        self.read_csv(train_file_path,test_file_path, idx_col, nrows)
        if self.X_full is not None:
            self.prepare_X_y(self.X_full, target, problem_type)
        # create final columns based upon type of columns
        self.prepare_final_cols(self.X)
        if self.final_cols is not None:
            self.update_X_y_with_final_cols(self.final_cols)

        # Drop the reference to the raw frame so it can be collected, but keep the
        # attribute itself so later reads of X_full give None instead of raising.
        self.X_full = None
        gc.collect()

        return self

    # rebuild the loader from already prepared X, y and X_test
    def regenerate_dfl(self, X:object, y:object, X_test:object, random_state=42):
        """
        Re-initialise the loader from in-memory `X`, `y` and `X_test`.

        Column lists are re-detected from `X`, then X and X_test are restricted
        to final_cols. `random_state` is not used here. Returns self.
        """
        # assign X,y and X_test
        self.X = X
        self.y = y
        self.X_test = X_test

        # create final columns based upon dtype of columns
        self.prepare_final_cols(X)

        if self.final_cols is not None:
            self.update_X_y_with_final_cols(self.final_cols)

        return self

In [7]:
show_doc(DataFrameLoader.from_csv)

<h4 id="DataFrameLoader.from_csv" class="doc_header"><code>DataFrameLoader.from_csv</code><a href="__main__.py#L227" class="source_link" style="float:right">[source]</a></h4>

> <code>DataFrameLoader.from_csv</code>(**`train_file_path`**:`str`, **`idx_col`**:`str`, **`target`**:`str`, **`problem_type`**:`str`, **`nrows`**:`int`=*`None`*, **`test_file_path`**:`str`=*`None`*, **`use_num_cols`**:`bool`=*`True`*, **`use_cat_cols`**:`bool`=*`True`*, **`random_state`**=*`42`*)



*Let's load [Melbourne Home Sale price raw data](https://www.kaggle.com/estrotococo/home-data-for-ml-course)*

In [8]:
# Load the train and test CSVs: "Id" becomes the index, "SalePrice" is split off
# as the target, and a regression problem skips the least-class padding step.
dfl = DataFrameLoader().from_csv(
    train_file_path="input/home_data/train.csv",
    test_file_path="input/home_data/test.csv",
    idx_col="Id",
    target="SalePrice",
    problem_type="regression")
# last expression of the cell: displays the loader's repr
dfl

2021-12-25 21:54:06,909 INFO DataFrame Memory usage decreased to 0.58 Mb (35.5% reduction)
2021-12-25 21:54:06,939 INFO DataFrame Memory usage decreased to 0.58 Mb (34.8% reduction)


DataFrameLoader object with attributes: X_full, X_test, X(features), y(target), X_train, X_valid, y_train and y_valid

Show the shapes of X, y and X_test

In [9]:
# show shape of X and y 
# show shapes of the features (X), the target (y) and the test features (X_test)
print("homedata X shape:", dfl.X.shape)
print("homedata y shape", dfl.y.shape)
print("homedata X_test shape", dfl.X_test.shape)

homedata X shape: (1460, 79)
homedata y shape (1460,)
homedata X_test shape (1459, 79)


Test for X and y

In [10]:
# test prepare_X_y()
# test prepare_X_y(): X and y must have the same number of rows, and X and X_test
# must have the same number of columns
assert dfl.X.shape == (1460,79)
assert dfl.y.shape == (1460,)
assert dfl.X_test.shape == (1459,79)

Now, let's create train, valid split from X and y

In [12]:
# hold out 20% of the rows for validation; the four parts are also stored on dfl
X_train, X_valid, y_train, y_valid = dfl.create_train_valid(dfl.X, dfl.y, valid_size=0.2, random_state = 42)

In [13]:
# show shape of X_train, X_valid, y_train and y_valid
# shapes of the split stored on the loader; X_test is printed again for comparison
print("homedata X_train shape:", dfl.X_train.shape)
print("homedata y_train shape", dfl.y_train.shape)
print("homedata X_valid shape:", dfl.X_valid.shape)
print("homedata y_valid shape", dfl.y_valid.shape)
print("homedata X_test shape", dfl.X_test.shape)

homedata X_train shape: (1168, 79)
homedata y_train shape (1168,)
homedata X_valid shape: (292, 79)
homedata y_valid shape (292,)
homedata X_test shape (1459, 79)


Test for X_train, y_train, X_valid and y_valid

In [14]:
# 1168 train rows + 292 valid rows = the 1460 rows of X; the column count is unchanged
assert dfl.X_train.shape == (1168,79)
assert dfl.y_train.shape == (1168,)
assert dfl.X_valid.shape == (292,79)
assert dfl.y_valid.shape == (292,)
assert dfl.X_test.shape == (1459,79)

In [15]:
# hide
# run the script to build 

from nbdev.export import notebook2script; notebook2script()

Converted 00_dataframeloader.ipynb.
Converted 01_preprocessor.ipynb.
Converted 02_tmlt.ipynb.
Converted 04_xgb_optuna_objective.ipynb.
Converted Kaggle_TPS_Dec_Meta_Clean.ipynb.
Converted Kaggle_TPS_Dec_TabNet.ipynb.
Converted Kaggle_TPS_Dec_Tutorial-Meta(Tabnet+XGB).ipynb.
Converted Kaggle_TPS_Dec_Tutorial-Meta.ipynb.
Converted Kaggle_TPS_Dec_Tutorial_XGB.ipynb.
Converted Kaggle_TPS_Nov_Tutorial.ipynb.
Converted index.ipynb.
Converted logger.ipynb.
Converted utility.ipynb.
Converted xgb_tabular_ml_toolkit.ipynb.
