In [2]:
import gc
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## Helper functions to impute, encode, and generate features/targets

In [3]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def one_hot_encode_categorical(cat_features, cat_names):
    """
    Expand categorical columns into one-hot indicator columns.

    A new scikit-learn OneHotEncoder is fitted on ``cat_features`` and applied
    to that same frame in a single step, producing a dense (non-sparse) array.

    Parameters
    ----------
    cat_features : pd.DataFrame
        Frame containing only the categorical columns to encode. Its index is
        reused as the index of the result.
    cat_names : list
        Names of the categorical columns; passed to
        ``get_feature_names_out`` to label the generated columns.

    Returns
    -------
    pd.DataFrame
        The one-hot encoded columns, indexed like ``cat_features``.
    """
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # fit and transform on the very same data
    dense_matrix = encoder.fit_transform(cat_features)
    column_labels = encoder.get_feature_names_out(cat_names)

    return pd.DataFrame(dense_matrix, index=cat_features.index, columns=column_labels)


In [4]:
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

def simple_impute_numerical(numeric_features, numeric_names):
    """
    Mean-imputes numerical columns with scikit-learn SimpleImputer.

    The imputer is fitted on ``numeric_features`` itself, so every NaN is
    replaced by the mean of its own column within the frame passed in.

    Parameters
    ----------
    numeric_features : pd.DataFrame
        DataFrame, with index, that has only the numerical columns to impute
    numeric_names : list
        List of numerical column names, one per column of ``numeric_features``
        and in the same order; used as the column labels of the result

    Returns
    -------
    pd.DataFrame
        DataFrame of dtype float16 that holds each of the imputed numerical
        columns, indexed like ``numeric_features``. A column that contains no
        observed value at all is kept (filled by the imputer) rather than
        being dropped.
    """

    # the incoming numeric columns are float16, which does not work when computing mean();
    # widen to float32 for numerical stability
    numeric_features = numeric_features.astype(np.float32)

    # Impute columns using the mean.
    # keep_empty_features=True: by default SimpleImputer discards columns that are
    # entirely NaN, which would leave fewer columns than `numeric_names` and make the
    # DataFrame construction below fail with a shape mismatch.
    imp_mean = SimpleImputer(
        missing_values=np.nan,
        strategy='mean',
        keep_empty_features=True,
    )
    numeric_df = pd.DataFrame(
        imp_mean.fit_transform(numeric_features),
        columns=numeric_names,
        index=numeric_features.index
    )

    # convert back to float16 for memory efficiency
    numeric_df = numeric_df.astype(np.float16)

    return numeric_df


In [5]:
import gc
import pandas as pd

def generate_x_y(df, test=False):
    """
    Build the model inputs (and, for training data, the labels) from a raw frame.

    The frame is indexed by ``customer_ID``, the ``S_2`` column is discarded,
    the known categorical columns are one-hot encoded, and all remaining
    columns are mean-imputed. Encoder and imputer are fitted on the frame
    that is passed in.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data containing ``customer_ID``, ``S_2``, the feature columns and,
        for training data, a ``target`` column.
    test : bool
        True  = ``df`` is the test set (no ``target`` column is read)
        False = ``df`` is the training set

    Returns
    -------
    pd.DataFrame
        When ``test`` is True: the features (X) only.

    OR

    Tuple(pd.DataFrame, pd.Series)
        When ``test`` is False: the features and the targets as (X, y).
    """
    df = df.set_index('customer_ID')

    # 'target' only exists (and is only removed) for training data
    columns_to_discard = ['S_2'] if test else ['S_2', 'target']
    X = df.drop(columns=columns_to_discard)
    y = None if test else df['target']

    # the indexed copy is no longer needed once X and y are extracted
    del df
    gc.collect()

    # Categorical columns: encode whichever of the known ones are present
    known_categoricals = [
        'B_30', 'B_38', 'D_114', 'D_116', 'D_117',
        'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
    ]
    present_categoricals = [name for name in known_categoricals if name in X.columns]
    one_hot_part = one_hot_encode_categorical(X[present_categoricals], present_categoricals)

    # Everything that is left is treated as numeric and mean-imputed
    X = X.drop(columns=present_categoricals)
    X = simple_impute_numerical(X, X.columns.tolist())

    # Numeric columns first, one-hot columns after
    X = pd.concat([X, one_hot_part], axis=1)

    if test:
        return X
    return X, y


## Generate the test-set features, and then save them to .ftr files

In [6]:
# Load the whole test set, then split it into two halves that are processed one at a time.
df = pd.read_feather('test_data.ftr')   # Loads entire test set
midpoint = len(df) // 2                 # Split row derived from the data (was hard-coded as 5681881)
df1 = df.iloc[:midpoint]                # First half
df2 = df.iloc[midpoint:]                # Second half

# Drop the name for the full frame.
# NOTE(review): df1/df2 are iloc slices and may still share the original buffers, so the
# full table's memory is likely only released once both halves are deleted — confirm.
del df
gc.collect()                            # Triggers garbage collection


0

### Generate features for each half of the split test set (the test set has no targets)

In [7]:
X_test_1 = generate_x_y(df1, test=True)

# One-hot columns that exist in the training features but are expected to be absent here
# because the category does not occur in the test data. Add them as all-zero columns so
# the feature set matches what .predict() expects. Only add a column when it is really
# missing, so a column the encoder did produce is never overwritten with zeros.
for missing_col in ('D_64_-1', 'D_66_0.0', 'D_68_0.0'):
    if missing_col not in X_test_1.columns:
        X_test_1[missing_col] = 0.0

# Put the columns in a deterministic (alphabetical) order
X_test_1 = X_test_1.reindex(sorted(X_test_1.columns), axis=1)

# Feather cannot store a custom index, so move customer_ID back into a regular column
X_test_1 = X_test_1.reset_index()
X_test_1.to_feather('X_test_1.ftr')

# Release this half before the next one is processed
del X_test_1, df1
gc.collect()


0

In [9]:
X_test_2 = generate_x_y(df2, test=True)

# One-hot columns that exist in the training features but are expected to be absent here
# because the category does not occur in the test data. Add them as all-zero columns so
# the feature set matches what .predict() expects. Only add a column when it is really
# missing, so a column the encoder did produce is never overwritten with zeros.
for missing_col in ('D_64_-1', 'D_66_0.0', 'D_68_0.0'):
    if missing_col not in X_test_2.columns:
        X_test_2[missing_col] = 0.0

# Put the columns in a deterministic (alphabetical) order
X_test_2 = X_test_2.reindex(sorted(X_test_2.columns), axis=1)

# Feather cannot store a custom index, so move customer_ID back into a regular column
X_test_2 = X_test_2.reset_index()
X_test_2.to_feather('X_test_2.ftr')

# Release this half once it has been written
del X_test_2, df2
gc.collect()


0