In [1]:
import gc
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## Helper functions to impute, encode, and generate features/targets

In [8]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def one_hot_encode_categorical(cat_features, cat_names):
    """
    Expand categorical columns into indicator columns via scikit-learn's OneHotEncoder.

    A new encoder is fitted on ``cat_features`` on every call, so the generated
    columns reflect only the categories present in the frame that is passed in.

    Parameters
    ----------
    cat_features : pd.DataFrame
        Frame containing only the categorical columns to encode; its index is
        carried over to the result.
    cat_names : list
        Names of the categorical columns, handed to the encoder to build the
        output column names.

    Returns
    -------
    pd.DataFrame
        Dense frame of one-hot encoded columns, indexed like ``cat_features``.
    """
    # dense output so the encoded array can be wrapped in a DataFrame directly
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    indicator_values = encoder.fit_transform(cat_features)
    indicator_names = encoder.get_feature_names_out(cat_names)

    return pd.DataFrame(
        indicator_values,
        index=cat_features.index,
        columns=indicator_names,
    )


In [9]:
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

def simple_impute_numerical(numeric_features, numeric_names):
    """
    Imputes numerical columns with scikit-learn SimpleImputer()

    Missing values (NaN) are replaced by the mean of their column. The imputer
    is fitted on ``numeric_features`` itself on every call. Columns that
    contain no observed value at all are kept and filled with 0, so the result
    always has exactly the columns listed in ``numeric_names``.

    Parameters
    ----------
    numeric_features : pd.DataFrame
        DataFrame, with index, that has only the numerical columns to impute
    numeric_names : list
        List of numerical column names, in the same order as the columns of
        ``numeric_features``

    Returns
    -------
    pd.DataFrame
        DataFrame (float16) that holds each of the imputed numerical columns,
        indexed like ``numeric_features``. If there are no columns to impute,
        an empty DataFrame with the same index is returned.
    """
    # Nothing to impute; SimpleImputer's input validation rejects a frame
    # without feature columns, so short-circuit instead of raising.
    if numeric_features.shape[1] == 0:
        return pd.DataFrame(index=numeric_features.index)

    # current numeric columns are float16, and they will not work when computing mean()
    # need to convert to float32 for numerical stability
    numeric_features = numeric_features.astype(np.float32)

    # impute columns using the mean
    # keep_empty_features=True: by default the imputer silently drops columns
    # that are entirely NaN, which would make the array narrower than
    # numeric_names and break the DataFrame construction below.
    imp_mean = SimpleImputer(
        missing_values=np.nan,
        strategy='mean',
        keep_empty_features=True,
    )
    numeric_df = pd.DataFrame(
        imp_mean.fit_transform(numeric_features),
        columns=numeric_names,
        index=numeric_features.index
    )

    # convert back to float16 for memory efficiency
    numeric_df = numeric_df.astype(np.float16)

    return numeric_df


In [10]:
def generate_x_y(df_file_path, test=False):
    """
    Returns the features (X) and targets (y) for the given data file

    The file is read as a feather file and indexed by 'customer_ID'. The 'S_2'
    column is dropped, the known categorical columns are one-hot encoded and
    every remaining column is mean-imputed.

    Note: the encoder and the imputer are re-fitted on whichever file is
    passed in, so for the test set the one-hot columns and the imputation
    means are derived from the test data itself.

    Parameters
    ----------
    df_file_path : str
        File path to generate DataFrame from 
    test : bool
        Whether the provided data file is the test set
        False = training set 
        True = test set 

    Returns
    -------
    pd.DataFrame
        If test=True, returns only the features (X)
        
    OR 
    
    Tuple(pd.DataFrame, pd.Series)
        If test=False, returns the features and targets in a tuple (X, y)
    """
    # Load data and set index
    df = pd.read_feather(df_file_path)
    df = df.set_index('customer_ID')

    # Extract features and target (the test file is not asked for a 'target' column)
    X = df.drop('S_2', axis=1) if test else df.drop(['S_2', 'target'], axis=1)
    y = None if test else df['target']

    del df
    gc.collect()

    # Encode categorical features (only those actually present in this file)
    cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117',
                    'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    existing_cat_features = [col for col in cat_features if col in X.columns]
    encoded_df = one_hot_encode_categorical(X[existing_cat_features], existing_cat_features)

    # Impute numeric features: every column that is not categorical.
    # Derive the names from the column labels directly rather than via
    # X.drop(...), which would copy the whole frame just to read its columns.
    categorical_lookup = set(existing_cat_features)
    numeric_columns = [col for col in X.columns if col not in categorical_lookup]
    numeric_df = simple_impute_numerical(X[numeric_columns], numeric_columns)

    # The raw feature frame is no longer needed; free it before concatenating
    del X
    gc.collect()

    # Combine
    X_final = pd.concat([numeric_df, encoded_df], axis=1)

    return X_final if test else (X_final, y)


## Generate the training features and targets, then save them to .ftr files

In [11]:
# build the training features and targets from the raw training file
X_train, y_train = generate_x_y('train_data.ftr')

# alphabetical column order, so the layout can be matched with the test set
X_train = X_train.reindex(columns=sorted(X_train.columns))

# feather files do not support a custom index, so move it into a regular
# column before writing each file
X_train.reset_index().to_feather('X_train.ftr')
y_train.reset_index().to_feather('y_train.ftr')

# release the large frames now that they are on disk
del X_train, y_train
gc.collect()

0