In [1]:
import pickle
import gc
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder

In [2]:
def one_hot_encode(cat_df):
    """
    One-hot encodes categorical features using scikit-learn OneHotEncoder

    Parameters
    ----------
    cat_df: pd.DataFrame
        DataFrame, with index, that has only the categorical columns to one-hot encode

    Returns
    -------
    pd.DataFrame
        DataFrame of the one-hot encoded columns. It shares the index of ``cat_df``
        and its columns are named ``<source column>_<category>``.
    """

    # scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
    # removed the old spelling; try the new one first and fall back for old installs
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    encoded_values = enc.fit_transform(cat_df)

    # build the output names from the columns that were actually passed in,
    # rather than from a hard-coded list
    source_columns = [str(name) for name in cat_df.columns]

    # `get_feature_names` was replaced by `get_feature_names_out` (added in 1.0, old name removed in 1.2)
    if hasattr(enc, 'get_feature_names_out'):
        encoded_columns = enc.get_feature_names_out(source_columns)
    else:
        encoded_columns = enc.get_feature_names(source_columns)

    return pd.DataFrame(encoded_values, columns=encoded_columns, index=cat_df.index)

In [3]:
def impute_helper(col):
    """
    Function to be passed into .apply() to help with imputing the different types of columns.

    Parameters
    ----------
    col: pd.Series
        A column of the DataFrame to be imputed. Its name is used to decide whether it
        derives from one of the original categorical features.

    Returns
    -------
    pd.Series
        Column with NaNs replaced by the most common value (categorical-derived columns)
        or by the mean (all other columns). A column that has no NaNs, or that has no
        observed values at all, is returned unchanged.
    """

    cat_features = {'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'}

    # nothing to fill, or nothing to derive a fill value from: return the column as-is
    # (value_counts().idxmax() raises on a column with no observed values)
    observed = col.count()
    if observed == len(col) or observed == 0:
        return col

    # convert float16's to float32 to calculate means without overflow
    convert_dtype = col.dtype == 'float16'
    working = col.astype('float32') if convert_dtype else col

    # aggregated columns are named "<feature>_<agg>", e.g. "B_30_last" -> "B_30";
    # str() keeps non-string column labels from breaking the split
    base_feature = '_'.join(str(col.name).split('_', 2)[:2])

    # if the column was originally a categorical feature then fill with the most common value
    # otherwise fill with mean()
    if base_feature in cat_features:
        fill_value = working.value_counts().idxmax()
    else:
        fill_value = working.mean()
    working = working.fillna(fill_value)

    # convert float16s back
    return working.astype('float16') if convert_dtype else working

In [4]:
def impute_columns(df):
    """
    Imputes missing values in the aggregated data, one column at a time.

    Each column is handed to ``impute_helper``, which fills columns derived from
    categorical features with their most common value and all other columns with
    their mean.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame, with index, whose columns should be imputed

    Returns
    -------
    pd.DataFrame
        DataFrame holding the imputed columns
    """

    # DataFrame.apply with its default axis passes every column to the helper as a Series
    return df.apply(impute_helper)

In [5]:
def generate_x_y(df_file_path, test=False):
    """
    Builds the model inputs from a gzip-compressed pickled DataFrame.

    Parameters
    ----------
    df_file_path : string
        Path of the gzip-compressed pickle file to load the DataFrame from
    test : boolean
        Whether or not the provided data file is the test set
        False = training set (the file has a 'target' column)
        True = test set

    Returns
    -------
    pd.DataFrame
        For the test set: only the features (X)

    OR

    Tuple(pd.DataFrame, pd.Series)
        For the training set: the features and the 'target' column as (X, y)
    """

    # D_63_last and D_64_last are of type 'category' and are the only columns that get
    # one-hot encoded; the other original categorical features were already transformed
    # by the aggregate functions
    onehot_cols = ['D_63_last', 'D_64_last']

    # NOTE: unpickling executes whatever the file contains, so only load trusted files
    frame = pd.read_pickle(df_file_path, compression='gzip')

    if test:
        y = None
        cols_to_drop = onehot_cols
    else:
        y = frame['target']
        cols_to_drop = onehot_cols + ['target']

    encoded = one_hot_encode(frame[onehot_cols])

    # everything that is not one-hot encoded (or the target) gets imputed:
    # mean() for numerical columns, most common value for categorical ones
    features = impute_columns(frame.drop(cols_to_drop, axis=1))

    # release the raw frame before building the combined one
    del frame
    gc.collect()

    # put the imputed and the one-hot encoded columns side by side
    features = pd.concat([features, encoded], axis=1)

    return features if test else (features, y)

In [6]:
# build the training features and targets from the aggregated training pickle
X_train, y_train = generate_x_y('/kaggle/input/amex-agg-data-pickle/train_agg.pkl')

# put the columns in alphabetical order
X_train = X_train.reindex(columns=sorted(X_train.columns))

display(X_train.head())

# write features and targets to the working directory as gzip-compressed pickles
X_train.to_pickle('X_train_agg.pkl', compression='gzip')
y_train.to_pickle('y_train_agg.pkl', compression='gzip')

# drop the references and collect garbage to free memory
del X_train
del y_train
gc.collect()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/amex-agg-data-pickle/train_agg.pkl'

In [None]:
# build the test features from the aggregated test pickle (no targets for the test set)
X_test = generate_x_y('/kaggle/input/amex-agg-data-pickle/test_agg.pkl', test=True)

# add columns that exist in train but not in test;
# only create the column when it is really missing so existing data is never overwritten
if 'D_64_last_-1' not in X_test.columns:
    X_test['D_64_last_-1'] = 0.0

# put the columns in alphabetical order
X_test = X_test.reindex(sorted(X_test.columns), axis=1)

display(X_test.head())

# write the features to the working directory as a gzip-compressed pickle
X_test.to_pickle('X_test_agg.pkl', compression='gzip')

# drop the reference and collect garbage to free memory
del X_test
gc.collect()

  after removing the cwd from sys.path.


Unnamed: 0_level_0,B_10_last,B_10_max,B_10_mean,B_10_min,B_10_std,B_11_last,B_11_max,B_11_mean,B_11_min,B_11_std,...,S_8_last,S_8_max,S_8_mean,S_8_min,S_8_std,S_9_last,S_9_max,S_9_mean,S_9_min,S_9_std
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.0336,0.063171,0.037079,-0.002918,0.017798,0.005188,0.013306,0.006187,0.003296,0.003371,...,0.464111,0.464111,0.27124,0.170776,0.103693,0.016998,0.022949,0.015572,0.010773,0.00411
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.298828,0.303223,0.298096,0.293457,0.002682,0.002235,0.237061,0.035706,0.001258,0.061982,...,0.768555,1.004883,0.840332,0.60498,0.110109,0.018509,0.544434,0.155762,0.018509,0.184704
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.12915,0.298828,0.192017,0.079224,0.088889,0.00338,0.025375,0.009048,0.001163,0.007781,...,0.122986,0.759766,0.336182,0.007782,0.204347,0.009171,0.55127,0.08197,0.001582,0.175943
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.032135,0.032135,0.024277,0.013832,0.006008,0.139038,0.326416,0.293213,0.139038,0.048926,...,0.479004,0.752441,0.553711,0.466064,0.114457,0.012878,0.016632,0.010063,0.002308,0.004278
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.022018,0.038879,0.024414,0.012337,0.008096,0.514648,0.514648,0.419189,0.32373,0.070164,...,0.244995,0.605469,0.281738,0.005058,0.236115,0.006939,0.009323,0.006081,0.000601,0.003206


0