In [1]:
import pandas as pd
import datetime
import numpy as np
import gc
import pickle

import utils
utils.widen_ipython_window()

In [2]:
# Paths

MAIN_PATH = "/home/mahesh/Desktop/ML/kaggle/amex/"

# Data
PATH_TO_DATA                = MAIN_PATH + "data/"
PATH_TO_PROCESSED2_DATA     = PATH_TO_DATA + "processed2/"
PATH_TO_PROCESSED3_DATA     = PATH_TO_DATA + "processed3/"
PATH_TO_PROCESSED4_DATA     = PATH_TO_DATA + "processed4/"

FILENAME_TRAIN_DATA_CSV     = PATH_TO_DATA + "orig/train_data.csv"
FILENAME_TRAIN_LABELS_CSV   = PATH_TO_DATA + "orig/train_labels.csv"
FILENAME_TRAIN_DATA_FEATHER = PATH_TO_DATA + "orig/train_data.f"     
FILENAME_TEST_DATA_CSV      = PATH_TO_DATA + "orig/test_data.csv"

# Processed data
#FILENAME_CID_MAP                      = PATH_TO_PROCESSED_DATA + "cid_map.csv"
FILENAME_TRAIN_PROCESSED2_DATA_FEATHER   = PATH_TO_PROCESSED2_DATA + "train_data.f"
FILENAME_TRAIN_PROCESSED2_LABELS_FEATHER = PATH_TO_PROCESSED2_DATA + "train_labels.f"
FILENAME_TRAIN_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER  = PATH_TO_PROCESSED2_DATA + "train_data_cat_nochange.f"
FILENAME_TEST_PROCESSED2_DATA_FEATHER    = PATH_TO_PROCESSED2_DATA + "test_data.f"
FILENAME_TEST_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER   = PATH_TO_PROCESSED2_DATA + "test_data_cat_nochange.f"

FILENAME_TEST_CUSTOMER_HASHES  = PATH_TO_PROCESSED2_DATA + "test_customer_hashes_data.pq"
FILENAME_TEST_CID_OLD_NEW_MAP  = PATH_TO_PROCESSED2_DATA + "test_cid_old_new_map.f"


FILENAME_TRAIN_PROCESSED3_DATA_PARAQUET  = PATH_TO_PROCESSED3_DATA + "train_data.pq"
FILENAME_LGBM_FEATURE_IMPORTANCE = PATH_TO_PROCESSED4_DATA + "feature_imp.pkl"

PATH_TO_GRU_NAN_EMBEDDINGS_DATA = PATH_TO_DATA + "gru_nan_embeddings_full/"
FILENAME_TRAIN_PROCESSED_GRU_NAN_EMBEDDINGS_FEATHER = PATH_TO_GRU_NAN_EMBEDDINGS_DATA + "train_data.f"

PATH_TO_RNN_NAN_EMBEDDINGS_NN_STATS_DATA = PATH_TO_DATA + "rnn_nn/"
FILENAME_TRAIN_RNN_NN_DATA_FEATHER = PATH_TO_RNN_NAN_EMBEDDINGS_NN_STATS_DATA + "train_nn_data.f"
FILENAME_TRAIN_RNN_RNN_DATA_FEATHER = PATH_TO_RNN_NAN_EMBEDDINGS_NN_STATS_DATA + "train_rnn_data.f"



In [3]:
id_time_cols = set(['customer_ID', 'S_2'])
cat_cols = set(['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'])
   

#### Tasks
    1. Read FULL CSV into DF (Data-Frame).  
    2. Use low-memory representation for CIDs, date-time, categorical data and numerical data. This should reduce memory usage.  
    3. Save the low-memory representation DF to Feather Format.  

In [4]:
print(datetime.datetime.now())
# Read from Feather
#train_df = pd.read_feather(FILENAME_TRAIN_DATA_FEATHER)
# Read from CSV
train_df = pd.read_csv(FILENAME_TRAIN_DATA_CSV)
print(datetime.datetime.now())
train_df.info(memory_usage="deep")

2022-08-19 11:15:41.580601
2022-08-19 11:17:30.079699
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 9.2 GB


In [7]:
#
# Here we are messing with both categorical and numerical data
#

train_df['customer_ID'] = train_df['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
train_df['S_2'] = pd.to_datetime( train_df['S_2'] )


for c in train_df.columns:
    if c in id_time_cols: continue
    if c in cat_cols:
        train_df[c] = train_df[c].astype('category')
    elif str( train_df[c].dtype )=='int64':
        train_df[c] = train_df[c].astype('int32')
    elif str( train_df[c].dtype )=='float64':
        train_df[c] = train_df[c].astype('float32')

train_df.to_feather(FILENAME_TRAIN_PROCESSED_DATA_FEATHER)

In [9]:
del train_df
gc.collect()

print(datetime.datetime.now())
# Read from Feather
train_df = pd.read_feather(FILENAME_TRAIN_PROCESSED_DATA_FEATHER)
print(datetime.datetime.now())
train_df.info(memory_usage="deep")

2022-07-30 08:53:15.358187
2022-07-30 08:53:16.502071
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 3.8 GB


In [7]:
#
# Edit the customer IDs of the label data
#

train_labels_df = pd.read_csv(FILENAME_TRAIN_LABELS_CSV)

train_labels_df['customer_ID'] = train_labels_df['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')

train_labels_df.to_feather(FILENAME_TRAIN_PROCESSED2_LABELS_FEATHER)

In [5]:
#
# Here we are messing only with numerical data, and leaving categorical data unchanged.
#

print(datetime.datetime.now())

train_df['customer_ID'] = train_df['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
train_df['S_2'] = pd.to_datetime( train_df['S_2'] )

print(datetime.datetime.now())

for c in train_df.columns:
    if c in id_time_cols: continue
    if c in cat_cols:
        # train_df[c] = train_df[c].astype('category')
        pass
    elif str( train_df[c].dtype )=='int64':
        train_df[c] = train_df[c].astype('int32')
    elif str( train_df[c].dtype )=='float64':
        train_df[c] = train_df[c].astype('float32')

print(datetime.datetime.now())

train_df.to_feather(FILENAME_TRAIN_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER)

print(datetime.datetime.now())

#del train_df
#gc.collect()

2022-08-19 11:19:07.562303
2022-08-19 11:19:11.129219
2022-08-19 11:21:01.605689
2022-08-19 11:21:05.198479


In [5]:
print(datetime.datetime.now())
# Read from Feather
train_df = pd.read_feather(FILENAME_TRAIN_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER)
print(datetime.datetime.now())
train_df.info(memory_usage="deep")

2022-08-19 11:25:57.603904
2022-08-19 11:26:00.184262
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(176), float64(9), int32(1), int64(1), object(2)
memory usage: 4.7 GB


In [3]:
id_time_cols = set(['customer_ID', 'S_2'])
cat_cols = set(['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'])

def compress_df(df, change_cat_data = False):
    df['customer_ID'] = df['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
    df['S_2'] = pd.to_datetime( df['S_2'] )
    for c in df.columns:
        if c in id_time_cols: continue
        if ((c in cat_cols) and (change_cat_data)):
            df[c] = df[c].astype('category')
        elif str( df[c].dtype )=='int64':
            df[c] = df[c].astype('int32')
        elif str( df[c].dtype )=='float64':
            df[c] = df[c].astype('float32')
    return df

def compress_train_data_for_gru(FILENAME_CSV, change_cat_data):
    chunksize = 200000
    compressed_full_df = None
    count = 0
    with pd.read_csv(FILENAME_CSV, chunksize=chunksize) as reader:
        for chunk_df in reader:
            utils.pt(f'Processing chunk {count}')
            #chunk_df.info()
            compressed_chunk_df = compress_df(chunk_df, change_cat_data)
            #compressed_chunk_df.info()
            if (compressed_full_df is None):
                compressed_full_df = compressed_chunk_df
            else:
                compressed_full_df = pd.concat([compressed_full_df,compressed_chunk_df],ignore_index = True)
            utils.gc_l([compressed_chunk_df])
            #chunk_df.info()
            count += 1
            compressed_full_df.info(memory_usage="deep")
    return compressed_full_df

In [8]:
test_df = compress_train_data_for_gru(FILENAME_TEST_DATA_CSV, False)
utils.pt('Writing file.')
test_df.to_feather(FILENAME_TEST_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER)

2022-08-24 14:34:44.064258 : Processing chunk 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), int32(1), int64(1), object(2)
memory usage: 167.1 MB
2022-08-24 14:34:52.018814 : Processing chunk 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), int32(1), int64(1), object(2)
memory usage: 334.2 MB
2022-08-24 14:34:59.884538 : Processing chunk 2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), int32(1), int64(1), object(2)
memory usage: 501.3 MB
2022-08-24 14:35:08.205532 : Processing chunk 3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), i

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400000 entries, 0 to 6399999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), int32(1), int64(1), object(2)
memory usage: 5.2 GB
2022-08-24 14:40:27.731611 : Processing chunk 32
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600000 entries, 0 to 6599999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), int32(1), int64(1), object(2)
memory usage: 5.4 GB
2022-08-24 14:40:40.338166 : Processing chunk 33
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6800000 entries, 0 to 6799999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), int32(1), int64(1), object(2)
memory usage: 5.5 GB
2022-08-24 14:40:53.854456 : Processing chunk 34
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000000 entries, 0 to 6999999
Columns: 190 entries, customer_ID to D_145
dtypes: datetime64[ns](1), float32(185), int32(1), int64(1), object(2)
memory usage: 

In [4]:
test_df = compress_train_data_for_gru(FILENAME_TEST_DATA_CSV, True)
utils.pt('Writing file.')
test_df.to_feather(FILENAME_TEST_PROCESSED2_DATA_FEATHER)

2022-08-24 14:54:51.690409 : Processing chunk 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 140.2 MB
2022-08-24 14:54:59.444090 : Processing chunk 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 280.4 MB
2022-08-24 14:55:07.149084 : Processing chunk 2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 420.6 MB
2022-08-24 14:55:15.784973 : Processing chunk 3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[

2022-08-24 14:59:41.793906 : Processing chunk 31
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400000 entries, 0 to 6399999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 4.4 GB
2022-08-24 14:59:54.037128 : Processing chunk 32
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600000 entries, 0 to 6599999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 4.5 GB
2022-08-24 15:00:05.337011 : Processing chunk 33
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6800000 entries, 0 to 6799999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 4.7 GB
2022-08-24 15:00:16.475139 : Processing chunk 34
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000000 entries, 0 to 6999999
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datet

In [6]:
test_cids = pd.read_parquet(FILENAME_TEST_CUSTOMER_HASHES)
test_cids = test_cids.drop_duplicates().sort_index().reset_index()
test_cids.rename(columns = {'customer_ID':'customer_ID_orig'}, inplace = True)
test_cids['customer_ID'] = test_cids['customer_ID_orig'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
test_cids.to_feather(FILENAME_TEST_CID_OLD_NEW_MAP)

In [13]:
#
# 1. Read LGBM pre-processed(pp) data, handle NaNs and write only the unimportant features back to file.
# 2. Read RNN  pre-processed(pp) data, write only th2e unimportant features back to file.
#

CAT_FEATURES = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

IMP_FEATURES_THRESHOLD = 94

# Read feature importance file
def read_feature_importance():
    with open(FILENAME_LGBM_FEATURE_IMPORTANCE, 'rb') as f: 
        f_imp = pickle.load(f)    
    return (f_imp[0], f_imp[1])

# Split features into important and unimportant features 
def imp_unimp_features(f_p_imp_dict):
    
    # first 95 are imp features, rest are un-imp features
    
    imp_f   = []
    unimp_f = []
    th = IMP_FEATURES_THRESHOLD
    c = 0
    for k in f_p_imp_dict.keys():
        if (c <= th):
            imp_f.append(k)
        else:
            unimp_f.append(k)
        c += 1
        
    # Cat features appear in front
    imp_f.sort(key=(lambda x: 0 if (x in CAT_FEATURES) else 1))
    unimp_f.sort(key=(lambda x: 0 if (x in CAT_FEATURES) else 1))
        
    return (imp_f, unimp_f)

def does_s_begin_with_one_of_l(s, l):
    for i in l:
        if s.startswith(i): return True
    return False

TARGET_LABEL      = 'target'
CUSTOMER_ID_LABEL = "customer_ID"
TIME_LABEL        = 'S_2'

#
# The below function is copy from GRU_train_eval, but we don't want to treat NaNs yet since filling NaN values will change the stats
# , so we wait till later to fill NaNs. Also, we don't add padding columns to each customer to have 13 months of data, or we don't 
# group the rows by cid.
#
def feature_engineer_rnn(train, edit_cid_time = False, fill_nan = False, PAD_CUSTOMER_TO_13_ROWS = False, targets = None):
    
    utils.pt('Starting training feature engineer 1...')
    
    features = [col for col in train.columns if col not in [CUSTOMER_ID_LABEL, TARGET_LABEL]]
    
    # REDUCE STRING COLUMNS 
    # from 64 bytes to 8 bytes, and 10 bytes to 3 bytes respectively
    #train['customer_ID'] = train['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    if edit_cid_time:
        train['customer_ID'] = train['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
        train.S_2 = pd.to_datetime( train.S_2 )
    
    train['year'] = (train.S_2.dt.year-2000).astype('int8')
    train['month'] = (train.S_2.dt.month).astype('int8')
    train['day'] = (train.S_2.dt.day).astype('int8')
    del train['S_2']

    
    PARTIAL_CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68']
    CATS = []
    OFFSETS = [1,0,1,1,2,1,2,1,1] #2 minus minimal value in full train csv
    # then 0 will be padding, 1 will be NAN, 2,3,4,etc will be values
    for c,s in zip(PARTIAL_CATS,OFFSETS):
        if c in features:
            train[c] = train[c] + s
            # train[c] = train[c].fillna(0).astype('int8')
            #if (fill_nan): train[c] = train[c].fillna(0)
            CATS.append(c)
    
    # LABEL ENCODE CAT COLUMNS (and reduce to 1 byte)
    # with 0: padding, 1: nan, 2,3,4,etc: values
    if ('D_63' in features):
        d_63_map = {'CL':1, 'CO':2, 'CR':3, 'XL':4, 'XM':5, 'XZ':6}
        #train['D_63'] = train.D_63.map(d_63_map).fillna(0).astype('int8')
        train['D_63'] = train.D_63.map(d_63_map)
        CATS.append('D_63')

    if ('D_64' in features):
        d_64_map = {'-1':1,'O':2, 'R':3, 'U':4}
        #train['D_64'] = train.D_64.map(d_64_map).fillna(0).astype('int8')
        train['D_64'] = train.D_64.map(d_64_map)
        CATS.append('D_64')
    
    # ADD NEW FEATURES HERE
    # EXAMPLE: train['feature_189'] = etc etc etc
    # EXAMPLE: train['feature_190'] = etc etc etc
    # IF CATEGORICAL, THEN ADD TO CATS WITH: CATS += ['feaure_190'] etc etc etc
    
    # REDUCE MEMORY DTYPE
    SKIP = ['customer_ID','year','month','day']
    for c in train.columns:
        if c in SKIP: continue
        if str( train[c].dtype )=='int64':
            train[c] = train[c].astype('int32')
        if str( train[c].dtype )=='float64':
            train[c] = train[c].astype('float32')
    
    # PAD ROWS SO EACH CUSTOMER HAS 13 ROWS
#     if PAD_CUSTOMER_TO_13_ROWS:
#         tmp = train[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
#         more = np.array([],dtype='int64') 
#         for j in range(1,13):
#             i = tmp.loc[tmp==j].index.values
#             more = np.concatenate([more,np.repeat(i,13-j)])
#         df = train.iloc[:len(more)].copy().fillna(0)
#         df = df * 0 - 1 #pad numerical columns with -1
#         df[CATS] = (df[CATS] * 0).astype('int8') #pad categorical columns with 0
#         df['customer_ID'] = more
#         train = pd.concat([train,df],axis=0,ignore_index=True)
        
    # FILL NAN
    #if fill_nan: train = train.fillna(-0.5) #this applies to numerical columns
    
    # ADD TARGETS (and reduce to 1 byte)
#     if targets is not None:
#         train = train.merge(targets,on='customer_ID',how='left')
#         train.target = train.target.astype('int8')
        
    # SORT BY CUSTOMER THEN DATE
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)
    train = train.drop(['year','month','day'],axis=1)
    
    # REARRANGE COLUMNS WITH 11 CATS FIRST
    COLS = list(train.columns[1:])
    COLS = ['customer_ID'] + CATS + [c for c in COLS if c not in CATS]
    train = train[COLS]
    
    return train


def process_stats_for_features(data, data_agg, features, stats_list, num_features_at_a_time = 1):
    total_features = len(features)
    low_idx = 0
    while low_idx < total_features:
        high_idx = low_idx + num_features_at_a_time
        if (high_idx > total_features):
            high_idx = total_features
        f = features[low_idx:high_idx]
        utils.pt(f'Processing feature/s {f} ... ')
        f_e = data.groupby("customer_ID")[f].agg(stats_list)
        data.drop(columns=f)
        if (data_agg is None):
            data_agg = f_e
        else:
            data_agg = data_agg.merge(f_e, how = 'inner', on = 'customer_ID')
        gc.collect()
        low_idx = high_idx
    return data_agg

def feature_engineer_lgbm(data, train_labels = None, fill_nan = False):
    utils.pt('Starting training feature engineer 2...')
    features = data.drop(['customer_ID'], axis = 1).columns.to_list()
    
    num_features = [col for col in features if col not in CAT_FEATURES]
    cat_features = [col for col in features if col in CAT_FEATURES]
    
    utils.pt('Processing categorical features ...')
    #data_cat_agg = data.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    data_cat_agg = None
    data_cat_agg = process_stats_for_features(data, data_cat_agg, cat_features, stats_list = ['count', 'last', 'nunique'], num_features_at_a_time = 3)
    utils.pt('Joining aggregate data ...')
    data_cat_agg.columns = ['_'.join(x) for x in data_cat_agg.columns]
    data_cat_agg.reset_index(inplace = True)
    
    utils.pt('Processing numerical features ...')
    #data_num_agg = data.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    data_num_agg = None
    data_num_agg = process_stats_for_features(data, data_num_agg, num_features, stats_list = ['mean', 'std', 'min', 'max', 'last'], num_features_at_a_time = 10)
    utils.pt('Joining aggregate data ...')
    data_num_agg.columns = ['_'.join(x) for x in data_num_agg.columns]
    data_num_agg.reset_index(inplace = True)
    
    #train_labels = pd.read_feather(FILENAME_TRAIN_PROCESSED2_LABELS_FEATHER)
    # data = data_num_agg.merge(_cat_agg, how = 'inner', on = 'customer_ID').merge(train_labels, how = 'inner', on = 'customer_ID')
    # utils.gc_l([data])
    
    cat_cols = list(data_cat_agg.columns)[1:]  # the first features is customer_ID, so we drop that.
    num_cols = list(data_num_agg.columns)[1:]  # the first features is customer_ID, so we drop that.
    
    utils.pt('Merging numerical and categorical features ...')
    
    data = data_num_agg.merge(data_cat_agg, how = 'inner', on = 'customer_ID')
    if (train_labels is not None):
        utils.pt('Merging train labels ...')
        data = data.merge(train_labels, how = 'inner', on = 'customer_ID')
    utils.gc_l([data_num_agg, data_cat_agg])
    
    
#    utils.pt('Round last float features to 2 decimal place')
    #num_cols = list(data.dtypes[(data.dtypes == 'float32') | (data.dtypes == 'float64')].index)
#     num_cols_last = [col for col in num_cols if 'last' in col]
#     for col in num_cols_last:
#         new_col_name = col + '_round2'
#         data[new_col_name] = data[col].round(2)
#         num_cols.append(new_col_name)
    
    # Get feature list
    # features = [col for col in data.columns if col not in [CUSTOMER_ID_LABEL, TARGET_LABEL]]
    
    # fill categorical NaNs with zero. 
    cat_last_col = [col for col in cat_cols if 'last' in col]
    for c in cat_last_col:
        data[c] = data[c].fillna(0)
    
    # fill numerical NaNs with -0.5.
#     stats_list = ['mean','min','max', 'last']
#     for c in (num_cols):
#         if any(substring in c for substring in stats_list):
#             data[c] = data[c].fillna(-0.5)
#         else:
#             data[c] = data[c].fillna(0)
    
    NUMS_WITH_E = []    
    zeros = np.zeros((data.shape[0],), dtype='int8')
    ones  = np.ones((data.shape[0],), dtype='int8')
    for c in num_cols:
        c_exists = c + "_exists"
        data[c_exists] = np.where(np.invert(np.isnan(data[c].values)), ones, zeros)
        data[c] = data[c].fillna(0)
        NUMS_WITH_E.append(c)
        NUMS_WITH_E.append(c_exists)
    
    for c in list(data.columns):
        if (data[c].isnull().any()):
            utils.pt(f'NaN column:{c}')
    
    cols = ['customer_ID'] + cat_cols + NUMS_WITH_E
    
    import collections
    print([item for item, count in collections.Counter(cols).items() if count > 1])
    
    return data[cols]

# def get_nn_data(lgbm_train, f_list):
#     cols = [x for x in lgbm_train.columns if (does_s_begin_with_one_of_l(x, f_list))]    
#     nn_train = lgbm_train[cols]
    
def process_data():
    
    imp_list, f_p_imp_dict = read_feature_importance()
    
    imp_f, uimp_f = imp_unimp_features(f_p_imp_dict)
    
    utils.pt('NN feature engineering')    
    # For NN: (1) Get the data of unimportant features.
    #         (2) Pre-process data in the same way as for RNN except do not do NaN fill or padding to 13 months.
    #         (3) Pre-process data again as though for LGBM, with stats for each column, do a NaN fill as though for RNN.
    train_full = pd.read_feather(FILENAME_TRAIN_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER)
    cols = ([CUSTOMER_ID_LABEL, TIME_LABEL] + uimp_f)
    train_full = train_full.drop(columns=[col for col in train_full if col not in cols])
    train_nn = feature_engineer_rnn(train_full,
                                    edit_cid_time = False,
                                    fill_nan = False,
                                    PAD_CUSTOMER_TO_13_ROWS = False,
                                    targets = None)
    train_nn = feature_engineer_lgbm(train_nn, train_labels = None, fill_nan = True)
    
    train_nn.to_feather(FILENAME_TRAIN_RNN_NN_DATA_FEATHER)
    
    utils.gc_l([train_full, train_nn])
    
    utils.pt('RNN feature engineering')
    # For RNN: (1) Get the pre-processed RNN data of important features.
    train_full = pd.read_feather(FILENAME_TRAIN_PROCESSED_GRU_NAN_EMBEDDINGS_FEATHER)
    cols = ([CUSTOMER_ID_LABEL] + imp_f + [TARGET_LABEL])
    train_rnn = train_full.drop(columns=[col for col in train_full if not (any(substring in col for substring in cols))])
       
    train_rnn.to_feather(FILENAME_TRAIN_RNN_RNN_DATA_FEATHER)
    
    utils.pt('Feature engineering complete.')
    
process_data()

2022-09-07 17:01:43.216316 : NN feature engineering
2022-09-07 17:01:46.922237 : Starting training feature engineer 1...
2022-09-07 17:01:53.684609 : Starting training feature engineer 2...
2022-09-07 17:01:53.976799 : Processing categorical features ...
2022-09-07 17:01:53.976848 : Processing feature/s ['B_30', 'D_114', 'D_116'] ... 
2022-09-07 17:01:55.234546 : Processing feature/s ['D_117', 'D_120', 'D_126'] ... 
2022-09-07 17:01:56.611039 : Processing feature/s ['D_66', 'D_68', 'D_63'] ... 
2022-09-07 17:01:57.959440 : Processing feature/s ['D_64'] ... 
2022-09-07 17:01:58.890741 : Joining aggregate data ...
2022-09-07 17:01:58.891612 : Processing numerical features ...
2022-09-07 17:01:58.891645 : Processing feature/s ['B_6', 'S_6', 'B_13', 'D_58', 'D_60', 'B_15', 'B_16', 'B_19', 'D_69', 'D_71'] ... 
2022-09-07 17:02:00.828916 : Processing feature/s ['D_73', 'P_4', 'D_76', 'B_25', 'R_8', 'R_9', 'D_80', 'B_27', 'D_81', 'D_82'] ... 
2022-09-07 17:02:02.922530 : Processing feature/s 