# Category Embeddings based Keras Models for Tabular Data

## For Kaggle AMEX default prediction competition data

https://www.kaggle.com/competitions/amex-default-prediction/data

The competition data looks random, as no information about the columns was given. This notebook tries out embedding layers in Keras on that data.

This assumes TF/Keras and various other Python libraries are installed, and that a GPU is configured. I used a Docker image and a Pipfile with all of these installed.

In [1]:
import pandas as pd
import numpy as np
import multiprocessing
from multiprocessing import Process
import glob
import os

from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm

tqdm.pandas()

import time

N_FOLDS = 5


In [2]:
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Numpy-only version of the Amex competition metric.

    The score is the mean of two parts:
      * the share of all positives found in the top rows (ranked by
        prediction) whose cumulative weight stays within 4 % of the total
        weight, and
      * the weighted gini of the prediction ranking divided by the weighted
        gini of the ranking by the true labels.
    Rows with target 0 carry weight 20, every other row weight 1.

    Args:
        y_true: 1-D sequence of true labels.
        y_pred: 1-D sequence of predicted scores, same length as y_true.

    Returns:
        0.5 * (normalized gini + top-four-percent capture rate).
    """

    def _ranked(sort_col):
        # rows are (target, prediction); order descending by the chosen column
        pairs = np.array([y_true, y_pred]).T
        order = pairs[:, sort_col].argsort()[::-1]
        return pairs[order]

    def _weighted_gini(sort_col):
        target = _ranked(sort_col)[:, 0]
        weight = np.where(target == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(target * weight)
        cum_pos_found = np.cumsum(target * weight)
        lorentz = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # part 1: positives captured inside the top 4 % of cumulative weight
    by_pred = _ranked(1)
    weights = np.where(by_pred[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(weights))
    captured = by_pred[np.cumsum(weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # part 2: gini of the model ranking relative to the ranking by true label
    gini_model = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)

    return 0.5 * (gini_model / gini_by_label + top_four)



In [3]:
def amex_metric(y_true, y_pred, return_components=False) -> float:
    """Amex competition metric: mean of normalized weighted gini and top-4 % capture.

    Rows with target 0 carry weight 20, all other rows weight 1.

    Args:
        y_true: true labels; any array-like (ndarray, list, pandas Series),
            flattened to 1-D.
        y_pred: predicted scores; any array-like with the same number of
            elements, flattened to 1-D (so an (n, 1) model output is fine).
        return_components: when True, return the tuple
            (normalized gini, top-4 % capture, final score) instead of only
            the final score.

    Returns:
        0.5 * (g + d), or the tuple (g, d, 0.5 * (g + d)) when
        return_components is True.
    """

    def _weights(target):
        # vectorised replacement for a row-wise apply(lambda x: 20 if x == 0 else 1)
        return np.where(target == 0, 20, 1)

    def top_four_percent_captured(target) -> float:
        """Share of all positives inside the first 4 % of cumulative weight."""
        weight = _weights(target)
        four_pct_cutoff = int(0.04 * weight.sum())
        inside_cutoff = target[np.cumsum(weight) <= four_pct_cutoff]
        return (inside_cutoff == 1).sum() / (target == 1).sum()

    def weighted_gini(target) -> float:
        """Weighted gini of `target` taken in its given (already ranked) order."""
        weight = _weights(target)
        random = np.cumsum(weight / weight.sum())
        weighted_pos = target * weight
        lorentz = np.cumsum(weighted_pos) / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    # np.asarray lets lists and pandas Series through without relying on a
    # .ravel() method on the input object itself
    scored = pd.DataFrame({
        'target': np.asarray(y_true).ravel(),
        'prediction': np.asarray(y_pred).ravel(),
    })

    # targets ranked by the model's prediction, best first
    by_prediction = scored.sort_values('prediction', ascending=False)['target'].to_numpy()
    # targets ranked by themselves: the ranking a perfect model would produce
    by_target = scored['target'].sort_values(ascending=False).to_numpy()

    g = weighted_gini(by_prediction) / weighted_gini(by_target)
    d = top_four_percent_captured(by_prediction)

    if return_components:
        return g, d, 0.5 * (g + d)
    return 0.5 * (g + d)

In [4]:
def format_time(seconds):
    """Render a duration in seconds as "Xh, Ym, Zs", omitting leading zero units.

    The input is truncated to an int first. Hours are shown only when
    positive; minutes are shown when hours or minutes are positive.
    """
    total_minutes, secs = divmod(int(seconds), 60)
    hrs, mins = divmod(total_minutes, 60)
    if hrs > 0:
        return f"{hrs}h, {mins}m, {secs}s"
    if mins > 0:
        return f"{mins}m, {secs}s"
    return f"{secs}s"
        

# Load Data

The data has been prepared before by the preprocessing notebook. This involves scaling data to 0-1 for non-embedding cols etc.

In [5]:
# Training data as written by the preprocessing notebook.
# Alternative data set / target definition tried earlier:
#df_train = pd.read_parquet("deloitte-data/keras_train.parquet", engine="pyarrow")
#df_train["target"] = df_train["target"] > 0.6
df_train = pd.read_parquet("large_train_embeddings.parquet", engine="pyarrow")

# keep the "fake_splitter" column aside (it is passed to the CV split later)
# and remove it from the feature frame
strat = df_train["fake_splitter"]
df_train = df_train.drop(columns="fake_splitter")
df_train.head()


Unnamed: 0,S_2_1,S_2_2,S_2_3,S_2_4,S_2_5,S_2_6,S_2_7,S_2_8,S_2_9,S_2_10,...,S_23-P_3_6,S_23-P_3_7,S_23-P_3_8,S_23-P_3_9,S_23-P_3_10,S_23-P_3_11,S_23-P_3_12,S_23-P_3_13,customer_ID,target
0,0.008214,0.037988,0.090349,0.106776,0.140657,0.160164,0.206365,0.226899,0.271047,0.285421,...,0.234825,0.235098,0.279948,0.223628,0.225614,0.295942,0.255588,0.264086,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,0.0,0.047228,0.068788,0.119097,0.155031,0.175565,0.203285,0.23922,0.275154,0.311088,...,0.309776,0.302655,0.287516,0.34072,0.329134,0.337388,0.339961,0.301747,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,0.010267,0.042094,0.073922,0.103696,0.13655,0.168378,0.199179,0.231006,0.261807,0.293635,...,0.249119,0.245535,0.253332,0.426317,0.266009,0.234389,0.292437,0.264936,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,0.030801,0.052361,0.091376,0.11807,0.140657,0.162218,0.206365,0.2423,0.276181,0.311088,...,0.296159,0.254749,0.27109,0.446758,0.21849,0.233907,0.234684,0.237745,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,0.029774,0.053388,0.092402,0.116016,0.146817,0.163244,0.210472,0.24846,0.280287,0.312115,...,0.506138,0.362316,0.35511,0.326496,0.312432,0.320591,0.301749,0.301565,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [6]:
df_train.shape

(458913, 2641)

In [7]:
sum(df_train["target"])

118828

In [8]:
target = df_train["target"]

In [9]:
df_test = pd.read_parquet("large_test_embeddings.parquet", engine="pyarrow")
#df_test = pd.read_parquet("deloitte-data/keras_test.parquet", engine="pyarrow")
df_test.head()


Unnamed: 0,S_2_1,S_2_2,S_2_3,S_2_4,S_2_5,S_2_6,S_2_7,S_2_8,S_2_9,S_2_10,...,S_23-P_3_5,S_23-P_3_6,S_23-P_3_7,S_23-P_3_8,S_23-P_3_9,S_23-P_3_10,S_23-P_3_11,S_23-P_3_12,S_23-P_3_13,customer_ID
0,0.73922,0.774127,0.805955,0.831622,0.858316,0.887064,0.921971,0.967146,0.980493,-1.0,...,0.289173,0.327993,0.264301,0.262299,0.310161,-1.0,-1.0,-1.0,-1.0,00000469ba478561f23a92a868bd366de6f6527a684c9a...
1,0.428131,0.465092,0.495893,0.513347,0.546201,0.586242,0.622177,0.629363,0.662218,0.711499,...,0.370963,0.261539,0.362988,0.357788,0.258911,0.319032,0.311544,0.271942,0.33212,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...
2,0.596509,0.641684,0.663244,0.704312,0.743326,0.749487,0.811088,0.812115,0.858316,0.894251,...,0.11832,0.121544,0.068181,0.065045,0.337753,0.424995,0.373444,0.371102,0.304577,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...
3,0.410678,0.455852,0.49692,0.502053,0.534908,0.588296,0.62423,0.637577,0.684805,0.705339,...,0.297998,0.268463,0.278719,0.324589,0.266189,0.271857,0.248616,0.26019,0.268461,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...
4,0.610883,0.652977,0.684805,0.705339,0.728953,0.772074,0.800821,0.837782,0.843943,0.87885,...,0.108597,0.297084,0.433925,0.43253,0.42402,0.470337,0.381729,0.431672,0.337952,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...


In [10]:
[col for col in df_train.columns if col not in df_test.columns]

['target']

In [11]:
[col for col in df_test.columns if col not in df_train.columns]

[]

In [12]:
df_train = df_train.drop("target", axis=1)

In [13]:
#df_test = df_test.drop("fake_splitter", axis=1)

In [14]:
df_train.shape

(458913, 2640)

In [15]:
df_test.shape

(924621, 2640)

In [16]:
# Base names of the categorical features; every df_train column whose name
# starts with one of these prefixes is collected, grouped prefix by prefix.
cat_prefixes = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
new_cat_cols = [
    col
    for prefix in cat_prefixes
    for col in df_train.columns
    if col.startswith(prefix)
]
cat_cols = new_cat_cols
#cat_cols

Take all the columns that have only a few distinct values; an embedding input will be built for each of them later in the Keras model.

In [17]:
unique_counts = df_train.nunique()
small_count_cols_train = unique_counts[unique_counts < 10]
#small_count_cols_train = small_count_cols_train.drop("fake_splitter")
small_count_cols_train

D_41_1      3
D_41_2      3
D_41_3      3
D_41_4      3
D_41_5      3
           ..
D_145_9     3
D_145_10    3
D_145_11    3
D_145_12    3
D_145_13    3
Length: 1183, dtype: int64

In [18]:
unique_counts = df_test.nunique()
small_count_cols_test = unique_counts[unique_counts < 10]
#small_count_cols = small_count_cols.drop("fake_splitter")
small_count_cols_test

R_1_1       2
R_1_2       3
R_1_3       3
R_1_4       3
R_1_5       3
           ..
D_145_9     3
D_145_10    3
D_145_11    3
D_145_12    3
D_145_13    3
Length: 1196, dtype: int64

Check train vs test data if they have same set of columns with few values:

In [19]:
#list any columns with few values in test set that have more values in training set
set(small_count_cols_test.index)-set(small_count_cols_train.index)

{'R_1_1',
 'R_1_10',
 'R_1_11',
 'R_1_12',
 'R_1_13',
 'R_1_2',
 'R_1_3',
 'R_1_4',
 'R_1_5',
 'R_1_6',
 'R_1_7',
 'R_1_8',
 'R_1_9'}

In [20]:
#list any columns with few values in train set that have more values in test set
set(small_count_cols_train.index)-set(small_count_cols_test.index)

set()

In [21]:
embeddables = set(cat_cols).union(set(small_count_cols_train.index))
len(embeddables)

1183

# Build Inputs Mapping for Embedding Layers

train_inputs will contain:
- key: Column name.
- value: Numpy array of values for that column.

Keras fit/predict can then take this mapping as input to feed into the embedding layers in the model.

In [22]:
#train_inputs = {}

In [23]:
#for col in embeddables:
#    train_inputs[col] = df_train[col].to_numpy()

# Put all the Remaining Columns into a Separate Input Matrix

This will be all the other inputs (non-categorical/larger set of values).

In [24]:
numericals = [col for col in df_train.columns if col not in embeddables and col != "customer_ID"]

In [25]:
len(numericals)

1456

In [26]:
# put above matrix input the model input dict for model training
# model will have an input with matching shape and name ("numerical")

#train_inputs["numerical"] = df_train[numericals].to_numpy()

In [27]:
y = target

In [28]:
#X = df_train.drop(["customer_ID", "target"], axis=1)
#y = df_train["target"]
#xs = X.shape
#xs

In [29]:
# StratifiedKFold only accepts a single label per row and breaks on tuples,
# so each (splitter, target) pair is joined into one "a-b" string label.
# NOTE(review): the training call further down passes `strat`, not `stratzip`,
# to train_and_predict — confirm which stratification key was intended.
stratzip = [f"{splitter}-{label}" for splitter, label in zip(strat, y)]

In [30]:
#Xv = X.values.reshape(xs[1], 13, 233)
#Xnp = X.values.reshape (458913, 13, 233)
#Xnp = X.values.reshape (458913, 1, X.shape[1])
#Xnp.shape

In [31]:
3029*13*233

9174841

# Build Keras Models

In [32]:
#this was an attempt on resetting keras between iterations for memory etc. 
#did not work so well, and so it was always run in separate process
def reset_keras():
    """Close the current TF1-compat Keras session and install a fresh one.

    Kept for reference: per the notes in this notebook the approach did not
    free memory reliably, so each fold is trained in a separate process
    instead.
    """
    # TensorFlow is imported here, not at notebook level: this notebook avoids
    # importing TF in the main process, so a module-level `tf` name does not
    # exist and the function previously failed with NameError.
    import tensorflow as tf

    keras_backend = tf.compat.v1.keras.backend

    sess = keras_backend.get_session()
    keras_backend.clear_session()
    sess.close()
    # get_session() is called once more after the reset, as before; its result
    # is not used because the session is replaced right below
    keras_backend.get_session()

    # (the former `del classifier` inside a bare try/except was removed: inside
    # this function the name was always unbound, so it never deleted anything)

    # use the same config as you used to create the session
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1
    config.gpu_options.visible_device_list = "0"
    keras_backend.set_session(tf.compat.v1.Session(config=config))

In [60]:
#delete the stored keras models under keras/... directory
def delete_keras_models():
    """Remove every path matching "keras/*" (the stored model weight files)."""
    stale_paths = glob.glob("keras/*")
    for stale_path in stale_paths:
        os.remove(stale_path)

#embedding layers + numericals -> cnn -> ...
def create_model_cnn_embedded(X_reference, X_input, embed_cols, embed_col_sizes, numerical_count):
    """Build and compile the embeddings + numericals -> 1-D CNN classifier.

    One `Input(shape=(1,))` named after the column plus one `Embedding` is
    created per embed column; all flattened embeddings are concatenated with
    a single input named "numerical" and fed through two Conv1D layers and
    two Dense layers to a sigmoid output.

    Args:
        X_reference: DataFrame used to size each embedding
            (`X_reference[col].nunique()`). It must be the same frame for
            training and for later weight loading so the sizes match.
        X_input: DataFrame whose columns that are not in `embed_cols`
            define the width of the "numerical" input.
        embed_cols: iterable of column names that get an embedding.
        embed_col_sizes: accepted for interface compatibility; not used.
        numerical_count: accepted for interface compatibility; not used.

    Returns:
        A compiled Keras `Model` (Adam, lr=0.001, binary cross-entropy,
        accuracy metric). Feed it a dict keyed by column name plus
        "numerical".
    """
    from keras.models import Model
    from keras.optimizers import Adam
    from tensorflow.keras.layers import Conv1D, BatchNormalization, Flatten, Dropout, Dense, Input, concatenate, Embedding, Reshape

    print("creating classifier, cnn")

    # Callers pass a set here. Building layers in set-iteration order makes the
    # layer order depend on string hashing, which can differ between kernel
    # sessions; sorting fixes the layer order so that saved weights are matched
    # to the same columns whenever the model is rebuilt.
    embed_cols = sorted(embed_cols)

    flats = []
    inputs = []
    embed_size = 10
    for embed_col in embed_cols:
        #have to use X_reference here and it always has to be same dataframe to have same unique count
        values = X_reference[embed_col].nunique()
        # NOTE(review): Embedding(values, ...) can only look up indices
        # 0..values-1; this presumes the column holds contiguous non-negative
        # integer codes — TODO confirm against the preprocessing notebook.
        col_input = Input(shape=(1,), name=embed_col)
        inputs.append(col_input)
        embed = Embedding(values, embed_size)(col_input) #TODO: embed_size=values?
        flats.append(Flatten()(embed))

    # Only the number of non-embedded columns is needed, so count them instead
    # of materialising a full copy of the frame with DataFrame.drop().
    embed_col_set = set(embed_cols)
    n_numerical = sum(1 for col in X_input.columns if col not in embed_col_set)
    numerical_input = Input(shape=(n_numerical,), name="numerical")
    inputs.append(numerical_input)
    flats.append(numerical_input)

    # total width after concatenation: one embed_size block per embed column
    # plus the raw numerical columns
    reshape_size = len(embed_cols) * embed_size + n_numerical

    x = concatenate(flats)
    # Conv1D needs a (steps, channels) input: treat every feature as one step
    x = Reshape((reshape_size, 1))(x)

    x = Conv1D(filters=64, kernel_size=15, strides=15, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Conv1D(filters=32, kernel_size=3, strides=1, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Flatten()(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(16, activation='relu')(x)
    x = BatchNormalization()(x)
    output = Dense(1, activation='sigmoid')(x)

    print("layers created")

    # pass the flat list of Input tensors (previously wrapped in an extra list)
    model = Model(inputs=inputs, outputs=output)

    print("model created")

    model.compile(Adam(learning_rate=.001), loss='binary_crossentropy', metrics=['accuracy'])

    return model


#embedding layers + numericals -> mlp -> ...
def create_model_mlp_embedded(X_input, embed_cols, embed_col_sizes, numerical_count):
    """Build and compile the embeddings + numericals -> MLP classifier.

    One `Input(shape=(1,))` named after the column plus one `Embedding`
    (named "embedding-<column>") is created per embed column; all flattened
    embeddings are concatenated with a single input named "numerical" and fed
    through three Dense layers to a sigmoid output.

    Args:
        X_input: DataFrame used both to size each embedding
            (`X_input[col].nunique()`) and to determine the width of the
            "numerical" input (its columns that are not in `embed_cols`).
        embed_cols: iterable of column names that get an embedding.
        embed_col_sizes: accepted for interface compatibility; not used.
        numerical_count: accepted for interface compatibility; not used.

    Returns:
        A compiled Keras `Model` (Adam, lr=0.0001, binary cross-entropy,
        accuracy metric, run_eagerly=True).
    """
    from keras.models import Model
    from keras.optimizers import Adam
    from tensorflow.keras.layers import Flatten, Dropout, Dense, Input, concatenate, Embedding
    import tensorflow as tf

    #https://stackoverflow.com/questions/59567226/how-to-programmatically-determine-available-gpu-memory-with-tensorflow?noredirect=1&lq=1
    print(tf.config.experimental.get_memory_info('GPU:0'))
    print("creating mlp classifier")
    print(tf.config.experimental.get_memory_info('GPU:0'))

    # Callers pass a set here. Building layers in set-iteration order makes the
    # layer order depend on string hashing, which can differ between kernel
    # sessions; sorting keeps the layer order stable across rebuilds.
    embed_cols = sorted(embed_cols)

    flats = []
    inputs = []
    embed_size = 3
    for embed_col in embed_cols:
        #https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce
        values = X_input[embed_col].nunique()
        # NOTE(review): Embedding(values, ...) can only look up indices
        # 0..values-1; this presumes the column holds contiguous non-negative
        # integer codes — TODO confirm against the preprocessing notebook.
        col_input = Input(shape=(1,), name=embed_col)
        inputs.append(col_input)
        embed = Embedding(values, embed_size, name=f"embedding-{embed_col}")(col_input) #TODO: embed_size=values?
        flats.append(Flatten()(embed))

    # Only the number of non-embedded columns is needed, so count them instead
    # of materialising a full copy of the frame with DataFrame.drop().
    embed_col_set = set(embed_cols)
    n_numerical = sum(1 for col in X_input.columns if col not in embed_col_set)
    numerical_input = Input(shape=(n_numerical,), name="numerical")
    inputs.append(numerical_input)
    flats.append(numerical_input)

    x = concatenate(flats)

    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(16, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)

    print("layers created")

    # pass the flat list of Input tensors (previously wrapped in an extra list)
    model = Model(inputs=inputs, outputs=output)

    print("model created")

    model.compile(Adam(learning_rate=.0001), loss='binary_crossentropy', metrics=['accuracy'], run_eagerly=True)

    print("model compiled")

    #print(model.summary())

    return model


#create dict for embedding inputs, where key=layer name, value=array of input values for that layer
#one key per embedding layer, one for the rest ("numerical")
def create_embedding_input(df_from, y, embeddable_cols, numerical_cols, train_idx, test_idx):
    """Turn a DataFrame into the name -> array dicts that the Keras models consume.

    Each embeddable column becomes its own entry (key = column name, value =
    1-D array of that column); all `numerical_cols` together become one 2-D
    array stored under the key "numerical".

    Args:
        df_from: source DataFrame.
        y: labels aligned positionally with df_from.
        embeddable_cols: column names that get one dict entry each.
        numerical_cols: column names packed into the "numerical" entry.
        train_idx: positional row indices of the training part, or None to
            convert the whole frame without splitting.
        test_idx: positional row indices of the test part (only read when
            train_idx is not None).

    Returns:
        (train_inputs, test_inputs, y_train, y_test, embed_counts), where
        embed_counts maps each embeddable column to its number of unique
        values in the full df_from. Without a split, test_inputs is an empty
        dict, y_train is y itself and y_test is None.
    """

    def _to_layer_inputs(frame):
        # one entry per embedding input, then everything else as "numerical"
        layer_inputs = {name: frame[name].to_numpy() for name in embeddable_cols}
        layer_inputs["numerical"] = frame[numerical_cols].to_numpy()
        return layer_inputs

    if train_idx is not None:
        X_train, X_test = df_from.iloc[train_idx], df_from.iloc[test_idx]
        y_train, y_test = np.take(y, train_idx, 0), np.take(y, test_idx, 0)
    else:
        # only reshaping the input, not sampling it
        X_train, X_test = df_from, None
        y_train, y_test = y, None

    train_inputs = _to_layer_inputs(X_train)
    # unique counts come from the full frame, not from the train split
    embed_counts = {name: df_from[name].nunique() for name in embeddable_cols}
    test_inputs = {} if X_test is None else _to_layer_inputs(X_test)

    print(len(embeddable_cols))
    print(train_inputs["numerical"].shape)

    return train_inputs, test_inputs, y_train, y_test, embed_counts


def train_and_predict(df_from, y, embeddable_cols, numerical_cols, strat):
    """Run stratified K-fold training, one subprocess per fold, and collect OOF predictions.

    Args:
        df_from: training DataFrame; must contain "customer_ID". Columns with
            "fake" in the name and "customer_ID" are dropped before training.
        y: labels aligned positionally with df_from.
        embeddable_cols: column names fed through embedding layers.
        numerical_cols: column names fed as the single "numerical" input.
        strat: per-row labels handed to StratifiedKFold.split as the
            stratification target.

    Returns:
        (process_dict, train_preds): the manager dict holding
        "history-<fold>", "preds-<fold>" and "ytest-<fold>" entries written
        by the subprocesses, and an (n_rows, 1) array of out-of-fold
        predictions.

    Raises:
        RuntimeError: if a fold's training subprocess exits with a non-zero
            exit code.
    """
    delete_keras_models()

    manager = multiprocessing.Manager()
    process_dict = manager.dict()

    cols_to_drop = [col for col in df_from.columns if "fake" in col]
    df_from = df_from.drop(cols_to_drop, axis=1)

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=69)
#    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1121218)

    cv_scores = np.empty(N_FOLDS)
    cv_accuracies = np.empty(N_FOLDS)
    time_start_all_folds = time.time()

    n_classes = 1
    train_preds = np.zeros((df_from.shape[0], n_classes))
    df_from = df_from.drop("customer_ID", axis=1)
    print(df_from.shape)
    print(strat.shape)

    for idx, (train_idx, test_idx) in enumerate(cv.split(df_from, strat)):

        time_start_this_fold = time.time()
        print(f"=== STARTING FOLD {idx+1}/{N_FOLDS} ===")

        #this should be modified to load the data in the new process, now it occupies 2*the memory of data
        #once in this process, another time in the training process
        #the copying across processes for 40-50GB of data also takes resources/time
        p = Process(target=train_and_predict_2, args=(idx, process_dict, df_from, y, embeddable_cols, numerical_cols, train_idx, test_idx))
        p.start()
        p.join()
        # join() always returns None; the real exit status lives in .exitcode
        # and has to be read before close() releases the Process object
        exit_code = p.exitcode
        print(f"Subprocess exited with code {exit_code}")
        p.close()
        print("Subprocess closed")

        if exit_code != 0:
            # fail here with a clear message instead of a KeyError on the
            # missing "preds-<fold>" entry a few lines further down
            raise RuntimeError(f"training subprocess for fold {idx+1}/{N_FOLDS} failed with exit code {exit_code}")

        preds = process_dict[f"preds-{idx}"]
        y_test = process_dict[f"ytest-{idx}"]
        # write this fold's predictions into the rows it was tested on
        np.add.at(train_preds, test_idx, preds)
        print(f"count zerO: {np.count_nonzero(train_preds)}")

        preds_this_round = (preds >= 0.5)
        acc_score = accuracy_score(y_test, preds_this_round)

        log_loss_fold = log_loss(y_test, preds)
        cv_scores[idx] = log_loss_fold
        cv_accuracies[idx] = acc_score

        amex_score = amex_metric(y_test, preds)

        time_total_this_fold = time.time() - time_start_this_fold
        print(f"=== FINISHED FOLD {idx+1}/{N_FOLDS} log loss={log_loss_fold}, accuracy={acc_score}, amex={amex_score} ===")
        print(f"time to run this fold: {format_time(time_total_this_fold)}")

    # the per-fold scores and the overall timer were collected but never
    # reported before; summarise them once all folds are done
    print(f"=== ALL FOLDS DONE mean log loss={cv_scores.mean()}, mean accuracy={cv_accuracies.mean()} ===")
    print(f"time to run all folds: {format_time(time.time() - time_start_all_folds)}")

    return process_dict, train_preds

#this is the method that is run to train a data split (iteration)
#separate method allows it to be started as a separate process
def train_and_predict_2(idx, process_dict, df_from, y, embeddable_cols, numerical_cols, train_idx, test_idx):
    """Train the CNN model on one CV split and publish its results.

    Meant to be the target of a separate process. Fits for 5 epochs with the
    test split as validation data, checkpoints the weights with the lowest
    val_loss to "keras/embedded-model-fold<idx>", reloads those weights and
    predicts the test split.

    Results are written into `process_dict` under "history-<idx>",
    "preds-<idx>" and "ytest-<idx>"; nothing is returned.
    """
    import tensorflow as tf

    train_dict, test_dict, y_train, y_test, embed_counts = create_embedding_input(
        df_from, y, embeddable_cols, numerical_cols, train_idx, test_idx
    )

    print(df_from.dtypes.value_counts())

    # df_from is assumed to be the training frame here (this function is only
    # called for training), so it serves as both the reference and input frame
    model = create_model_cnn_embedded(df_from, df_from, embeddable_cols, embed_counts, len(numerical_cols))

    print("model compiled, fitting")

    # single location for the best-weights checkpoint of this fold
    weights_path = f"keras/embedded-model-fold{idx}"
    best_weights_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=weights_path,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    )

    print(len(train_dict))
    print(train_dict["numerical"].shape)
    model.fit(
        train_dict,
        y_train,
        batch_size=128,
        epochs=5,
        validation_data=(test_dict, y_test),
        callbacks=[best_weights_checkpoint],
    )
    # grab the history right after fit(), as predict() seems to clear it
    history = model.history.history

    print(f"loading best weights: {weights_path}")
    model.load_weights(weights_path)
    print("predicting")
    preds = model.predict(test_dict)
    print(preds.shape)

    results = (("history", history), ("preds", preds), ("ytest", y_test))
    for label, value in results:
        process_dict[f"{label}-{idx}"] = value
        


In [34]:
df_train.shape

(458913, 2640)

In [35]:
#here run the actual CV split + training on each split
#process_dict, train_preds = train_and_predict(df_train[:1000], target[:1000], embeddables, numericals, strat[:1000])
process_dict, train_preds = train_and_predict(df_train, target, embeddables, numericals, strat)


(458913, 2639)
(458913,)
=== STARTING FOLD 1/5 ===
1183
(367130, 1456)
float32    2628
int8         11
dtype: int64
creating classifier, cnn


2022-08-25 21:26:47.019305: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 21:26:47.022145: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 21:26:47.022674: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 21:26:47.023317: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created
model compiled, fitting
1184
(367130, 1456)
Epoch 1/5


2022-08-25 21:28:00.727568: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-08-25 21:28:01.444520: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loading best weights: keras/embedded-model-fold0
predicting
(91783, 1)
Subprocess exited with code None
Subprocess closed
count zerO: 91783
=== FINISHED FOLD 1/5 log loss=0.22933822995274258, accuracy=0.8980639116176198, amex=0.771753741034982 ===
time to run this fold: 58m, 55s
=== STARTING FOLD 2/5 ===
1183
(367130, 1456)
float32    2628
int8         11
dtype: int64
creating classifier, cnn


2022-08-25 22:25:42.953730: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 22:25:42.956534: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 22:25:42.957045: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 22:25:42.957666: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created
model compiled, fitting
1184
(367130, 1456)
Epoch 1/5


2022-08-25 22:26:57.120728: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-08-25 22:26:57.852644: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loading best weights: keras/embedded-model-fold1
predicting
(91783, 1)
Subprocess exited with code None
Subprocess closed
count zerO: 183566
=== FINISHED FOLD 2/5 log loss=0.23096319767029708, accuracy=0.8980965974091063, amex=0.7755851178213775 ===
time to run this fold: 1h, 1m, 53s
=== STARTING FOLD 3/5 ===
1183
(367130, 1456)
float32    2628
int8         11
dtype: int64
creating classifier, cnn


2022-08-25 23:27:36.758043: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 23:27:36.760929: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 23:27:36.761473: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-25 23:27:36.762131: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created
model compiled, fitting
1184
(367130, 1456)
Epoch 1/5


2022-08-25 23:28:52.358192: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-08-25 23:28:53.073210: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loading best weights: keras/embedded-model-fold2
predicting
(91783, 1)
Subprocess exited with code None
Subprocess closed
count zerO: 275349
=== FINISHED FOLD 3/5 log loss=0.23213919060390087, accuracy=0.8977915300218995, amex=0.7730205033115839 ===
time to run this fold: 1h, 0m, 31s
=== STARTING FOLD 4/5 ===
1183
(367131, 1456)
float32    2628
int8         11
dtype: int64
creating classifier, cnn


2022-08-26 00:28:08.138590: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 00:28:08.141486: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 00:28:08.142015: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 00:28:08.142690: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created
model compiled, fitting
1184
(367131, 1456)
Epoch 1/5


2022-08-26 00:29:23.725911: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-08-26 00:29:24.432767: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loading best weights: keras/embedded-model-fold3
predicting
(91782, 1)
Subprocess exited with code None
Subprocess closed
count zerO: 367131
=== FINISHED FOLD 4/5 log loss=0.2295277254262763, accuracy=0.8976814625961518, amex=0.7753407524639585 ===
time to run this fold: 1h, 2m, 6s
=== STARTING FOLD 5/5 ===
1183
(367131, 1456)
float32    2628
int8         11
dtype: int64
creating classifier, cnn


2022-08-26 01:30:14.846021: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 01:30:14.848917: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 01:30:14.849466: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 01:30:14.850126: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created
model compiled, fitting
1184
(367131, 1456)
Epoch 1/5


2022-08-26 01:31:30.139601: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2022-08-26 01:31:30.849759: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
loading best weights: keras/embedded-model-fold4
predicting
(91782, 1)
Subprocess exited with code None
Subprocess closed
count zerO: 458913
=== FINISHED FOLD 5/5 log loss=0.23107495221158753, accuracy=0.8974308687978035, amex=0.7706040864153059 ===
time to run this fold: 1h, 1m, 39s


In [36]:
#cannot import TF outside processes or it will reserve GPU and later use of TF/Keras will fail to find GPU
#import tensorflow as tf
#tf.config.list_physical_devices('GPU')

In [37]:
print(df_train.dtypes.value_counts())

float32    2628
int8         11
object        1
dtype: int64


In [38]:
target.dtype

dtype('int64')

In [39]:
df_train.select_dtypes(include=['object']).columns

Index(['customer_ID'], dtype='object')

In [40]:
#large data set

#13 rows
#wavenet 0.76-0.77
#gru 0.75-0.76 (3 iter)
#bi-lstm 0.75-0.77

#1 row
#bi-lstm 0.75-0.77
#wavenet 0.77-0.79
#gru-cnn 0.70
#basic cnn 0.77-0.78

#deloitte data set
#wavenet 0.774-0.784
#cnn 0.766-0.785



In [41]:
# Inspect the predictions stored under the "preds-1" key of the shared dict.
process_dict["preds-1"]

array([[0.00128628],
       [0.8709188 ],
       [0.25024173],
       ...,
       [0.93081063],
       [0.9252362 ],
       [0.48868656]], dtype=float32)

In [42]:
# Shape of train_preds (presumably the out-of-fold predictions - confirm against the training loop).
train_preds.shape

(458913, 1)

In [43]:
# Pair every training customer_ID with its prediction from train_preds.
df_preds = pd.DataFrame()
df_preds["customer_ID"] = df_train["customer_ID"]
# train_preds has shape (n, 1); select its single column explicitly so a 1-D
# array is assigned, the same way the averaged-prediction frame is built later.
df_preds["prediction"] = train_preds[:, 0]

In [44]:
# AMEX metric of the first (only) column of train_preds against y.
amex_metric_mod(y, train_preds[:, 0])

0.7729221399799109

In [45]:
# Save the training-set predictions.
# NOTE: unlike the submission export, no index=False is passed, so the row
# index is also written as an unnamed first column.
df_preds.to_csv("predictions_mlp_embedded.csv")

# Predict Entire Training Set with Trained Models

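Not sure if this is meaningful, or whether it would be better to just concatenate the out-of-fold predictions from the trained folds (as is done above). Note that each fold's model was trained on most of these rows, so a metric computed on these averaged predictions is optimistic compared to the out-of-fold score.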

In [46]:
#test.replace([np.inf, -np.inf], np.nan,inplace=True)
#X_train = df_train.drop(["customer_ID"], axis=1)
#X = X_train.values.reshape(-1, X_train.shape[1], 1)


In [47]:
#X_train.columns
#train_dict, test_dict, y_train, y_test, embed_counts = create_embedding_input(df_train, target, embeddables, numericals, None, None)

In [72]:
def predict_keras(idx, process_dict, X_ref, X, weights_filepath, embeddables, numericals):
    """Predict `X` with one saved fold model and publish the result.

    Intended to be run as the target of a subprocess: the result is not
    returned but stored in `process_dict` under the key ``f"preds-{idx}"``.

    Args:
        idx: fold index, used only to build the result key.
        process_dict: shared mapping that receives the predictions.
        X_ref: reference frame forwarded to `create_model_cnn_embedded`.
        X: frame to predict; also forwarded to `create_model_cnn_embedded`.
        weights_filepath: path prefix of the saved weights to load.
        embeddables: forwarded to the input and model builders.
        numericals: forwarded to the input builder; its length is passed to
            the model builder.
    """
    # `target` is the notebook-level global. Only the first and last of the
    # five returned values are used here.
    # NOTE(review): `target` is passed even when X is the test frame - confirm
    # create_embedding_input tolerates the row-count mismatch.
    train_dict, _test_dict, _y, _y_test, embed_counts = create_embedding_input(
        X, target, embeddables, numericals, None, None
    )
    model = create_model_cnn_embedded(X_ref, X, embeddables, embed_counts, len(numericals))
    # Load the weights named by the argument. The previous code read the
    # notebook-global `filepath`, which only worked because the forked
    # subprocess inherited the caller's loop variable.
    model.load_weights(weights_filepath)
    preds = model.predict(train_dict)
    process_dict[f"preds-{idx}"] = preds

In [49]:
# Predict the whole training set once per fold model. Every prediction runs
# in its own subprocess and hands its result back through the shared
# manager dict under the key "preds-<idx>".
manager = multiprocessing.Manager()
process_dict = manager.dict()
# The feature frame is identical for every fold, so build it once instead of
# copying the training frame on each iteration.
X = df_train.drop(["customer_ID"], axis=1)
for idx in range(N_FOLDS):
    #todo: remove inner loop
    searchpath = f"keras/embedded-model-fold{idx}*.data-*"
    print(searchpath)
    for fl in glob.glob(searchpath):
        # Everything before the first "." is used as the weights path prefix.
        ending = fl.index(".")
        filepath = fl[0:ending]
        print(f"=== STARTING TO PREDICT {idx+1}/{N_FOLDS}: {filepath} === ")

        #again, reference df and train df are both X here as this is the training df predicted
        p = Process(target=predict_keras, args=(idx, process_dict, X, X, filepath, embeddables, numericals))
        p.start()
        p.join()
        # join() always returns None; the real exit status is p.exitcode, and
        # it has to be read before close() releases the Process object.
        flag = p.exitcode
        p.close()
        if flag != 0:
            # Without this check a failed subprocess only shows up as a
            # KeyError on the missing "preds-<idx>" entry below.
            raise RuntimeError(f"prediction subprocess for {filepath} exited with code {flag}")
        print(f"=== FINISHED PREDICTION {idx+1}/{N_FOLDS}: {sum(process_dict[f'preds-{idx}'])} === ")
        print(f"Subprocess exited with code {flag}")



keras/embedded-model-fold0*.data-*
=== STARTING TO PREDICT 1/5: keras/embedded-model-fold0 === 
1183
(458913, 1456)
creating classifier, cnn


2022-08-26 07:52:20.070139: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 07:52:20.073358: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 07:52:20.073907: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 07:52:20.074517: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 07:53:19.792795: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/14342 [..............................] - ETA: 9:39  

2022-08-26 07:53:20.494097: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 1/5: [118170.98] === 
Subprocess exited with code None
keras/embedded-model-fold1*.data-*
=== STARTING TO PREDICT 2/5: keras/embedded-model-fold1 === 
1183
(458913, 1456)
creating classifier, cnn


2022-08-26 08:02:45.366644: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:02:45.369873: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:02:45.370420: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:02:45.371074: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 08:03:47.783249: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/14342 [..............................] - ETA: 9:09  

2022-08-26 08:03:48.485942: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 2/5: [111582.32] === 
Subprocess exited with code None
keras/embedded-model-fold2*.data-*
=== STARTING TO PREDICT 3/5: keras/embedded-model-fold2 === 
1183
(458913, 1456)
creating classifier, cnn


2022-08-26 08:13:20.143119: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:13:20.146319: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:13:20.146850: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:13:20.147496: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 08:14:22.687838: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/14342 [..............................] - ETA: 9:51  

2022-08-26 08:14:23.399503: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 3/5: [111912.83] === 
Subprocess exited with code None
keras/embedded-model-fold3*.data-*
=== STARTING TO PREDICT 4/5: keras/embedded-model-fold3 === 
1183
(458913, 1456)
creating classifier, cnn


2022-08-26 08:23:39.511294: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:23:39.514562: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:23:39.515114: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:23:39.515786: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 08:24:41.885793: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/14342 [..............................] - ETA: 9:20  

2022-08-26 08:24:42.603627: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 4/5: [113137.88] === 
Subprocess exited with code None
keras/embedded-model-fold4*.data-*
=== STARTING TO PREDICT 5/5: keras/embedded-model-fold4 === 
1183
(458913, 1456)
creating classifier, cnn


2022-08-26 08:33:55.683998: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:33:55.687259: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:33:55.687786: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 08:33:55.688433: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 08:34:58.095368: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/14342 [..............................] - ETA: 9:43  

2022-08-26 08:34:58.816898: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 5/5: [113722.01] === 
Subprocess exited with code None


In [50]:
# Gather each fold's prediction array from the shared dict, in fold order.
preds = [process_dict[f"preds-{fold}"] for fold in range(N_FOLDS)]

In [51]:
# Element-wise mean of the per-fold prediction arrays.
# NOTE: this rebinds `preds` from the list of fold arrays to the averaged
# array, so re-running this cell alone would sum the rows of the averaged
# array and divide by N_FOLDS again; re-run the collecting cell first.
preds = sum(preds)/N_FOLDS
preds

array([[0.00070457],
       [0.0013388 ],
       [0.00194356],
       ...,
       [0.00249963],
       [0.08902572],
       [0.00164985]], dtype=float32)

In [52]:
# AMEX metric of the fold-averaged training-set predictions against y.
amex_metric_mod(y, preds[:, 0])


0.7845456440460035

In [53]:
# One row per training customer: the ID plus the fold-averaged prediction
# (first column of the (n, 1) preds array).
df_preds = df_train[["customer_ID"]].assign(prediction=preds[:, 0])

In [54]:
# Display the per-customer averaged predictions.
df_preds

Unnamed: 0,customer_ID,prediction
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.000705
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.001339
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.001944
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.005971
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.002372
...,...,...
458908,ffff41c8a52833b56430603969b9ca48d208e7c192c6a4...,0.002524
458909,ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fd...,0.029118
458910,ffff9984b999fccb2b6127635ed0736dda94e544e67e02...,0.002500
458911,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,0.089026


In [55]:
# Save the averaged training-set predictions.
# NOTE: unlike the submission export, no index=False is passed, so the row
# index is also written as an unnamed first column.
df_preds.to_csv("predictions_cnn_embedded.csv")

In [57]:
#X_test = df_test.drop("customer_ID", axis=1)
#X_test_np = X_test.values.reshape (-1, X_test.shape[1], 1)
##X_test_np = X_test.values.reshape (-1, 13, 233)
#X_test_np.shape

In [58]:
# Confirm the fold count configured at the top of the notebook.
N_FOLDS

5

In [73]:
# Predict the test set once per fold model. Every prediction runs in its own
# subprocess and hands its result back through the shared manager dict under
# the key "preds-<idx>".
manager = multiprocessing.Manager()
process_dict = manager.dict()
# Both frames are identical for every fold, so build them once instead of
# copying the test and training frames on each iteration.
X = df_test.drop(["customer_ID"], axis=1)
X_ref = df_train.drop(["customer_ID"], axis=1)
for idx in range(N_FOLDS):
    #todo: remove inner loop
    searchpath = f"keras/embedded-model-fold{idx}*.data-*"
    print(searchpath)
    for fl in glob.glob(searchpath):
        # Everything before the first "." is used as the weights path prefix.
        ending = fl.index(".")
        filepath = fl[0:ending]
        print(f"=== STARTING TO PREDICT {idx+1}/{N_FOLDS}: {filepath} === ")

        # The training frame is the reference; the test frame is what gets predicted.
        p = Process(target=predict_keras, args=(idx, process_dict, X_ref, X, filepath, embeddables, numericals))
        p.start()
        p.join()
        # join() always returns None; the real exit status is p.exitcode, and
        # it has to be read before close() releases the Process object.
        flag = p.exitcode
        p.close()
        if flag != 0:
            # Without this check a failed subprocess only shows up as a
            # KeyError on the missing "preds-<idx>" entry below.
            raise RuntimeError(f"prediction subprocess for {filepath} exited with code {flag}")
        print(f"=== FINISHED PREDICTION {idx+1}/{N_FOLDS}: {sum(process_dict[f'preds-{idx}'])} === ")
        print(f"Subprocess exited with code {flag}")


keras/embedded-model-fold0*.data-*
=== STARTING TO PREDICT 1/5: keras/embedded-model-fold0 === 
1183
(924621, 1456)
creating classifier, cnn


2022-08-26 11:06:03.372089: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:06:03.375586: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:06:03.376158: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:06:03.376794: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 11:06:58.629359: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 11:07:01.509382: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 11:07:09.723709: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/28895 [..............................] - ETA: 19:44  

2022-08-26 11:07:10.435140: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 1/5: [228016.16] === 
Subprocess exited with code None
keras/embedded-model-fold1*.data-*
=== STARTING TO PREDICT 2/5: keras/embedded-model-fold1 === 
1183
(924621, 1456)
creating classifier, cnn


2022-08-26 11:26:13.768729: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:26:13.774820: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:26:13.775355: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:26:13.776008: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 11:27:07.660163: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 11:27:10.508952: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 11:27:18.515547: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/28895 [..............................] - ETA: 19:21  

2022-08-26 11:27:19.228190: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 2/5: [213903.25] === 
Subprocess exited with code None
keras/embedded-model-fold2*.data-*
=== STARTING TO PREDICT 3/5: keras/embedded-model-fold2 === 
1183
(924621, 1456)
creating classifier, cnn


2022-08-26 11:46:24.626404: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:46:24.629639: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:46:24.630173: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 11:46:24.630811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 11:47:18.489178: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 11:47:21.352041: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 11:47:29.427240: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/28895 [..............................] - ETA: 18:38  

2022-08-26 11:47:30.158290: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 3/5: [213757.92] === 
Subprocess exited with code None
keras/embedded-model-fold3*.data-*
=== STARTING TO PREDICT 4/5: keras/embedded-model-fold3 === 
1183
(924621, 1456)
creating classifier, cnn


2022-08-26 12:06:36.420712: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 12:06:36.423933: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 12:06:36.424475: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 12:06:36.425120: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 12:07:30.185837: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 12:07:33.039616: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 12:07:41.027646: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/28895 [..............................] - ETA: 19:13  

2022-08-26 12:07:41.751952: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 4/5: [220409.12] === 
Subprocess exited with code None
keras/embedded-model-fold4*.data-*
=== STARTING TO PREDICT 5/5: keras/embedded-model-fold4 === 
1183
(924621, 1456)
creating classifier, cnn


2022-08-26 12:26:22.050008: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 12:26:22.053644: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 12:26:22.054177: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-26 12:26:22.054821: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

layers created
model created


2022-08-26 12:27:15.800073: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 12:27:18.656416: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5384992704 exceeds 10% of free system memory.
2022-08-26 12:27:26.677417: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


    5/28895 [..............................] - ETA: 20:58  

2022-08-26 12:27:27.394157: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


=== FINISHED PREDICTION 5/5: [215183.5] === 
Subprocess exited with code None


In [74]:
# Sum of the predictions stored under "preds-2" (the third fold).
sum(process_dict["preds-2"])

array([213757.92], dtype=float32)

In [75]:
# Gather each fold's test prediction array from the shared dict, in fold order.
preds = [process_dict[f"preds-{fold}"] for fold in range(N_FOLDS)]

In [76]:
# Shape of a single fold's prediction array.
process_dict["preds-0"].shape

(924621, 1)

In [77]:
# Element-wise mean of the per-fold test prediction arrays.
# NOTE: this rebinds `preds` from the list of fold arrays to the averaged
# array, so re-running this cell alone would sum the rows of the averaged
# array and divide by N_FOLDS again; re-run the collecting cell first.
preds = sum(preds)/N_FOLDS
preds

array([[0.02850733],
       [0.00109833],
       [0.07285783],
       ...,
       [0.60344857],
       [0.44189662],
       [0.03884048]], dtype=float32)

In [78]:
# Shape of df_test, to compare its row count with the number of predictions.
df_test.shape

(924621, 2640)

In [79]:
# Number of averaged predictions.
len(preds)

924621

In [80]:
# Build the submission frame: one row per test customer with the
# fold-averaged prediction.
submission = pd.DataFrame()
submission["customer_ID"] = df_test["customer_ID"]
# preds has shape (n, 1); select its single column explicitly so a 1-D array
# is assigned, the same way the training-set prediction frame is built.
submission["prediction"] = preds[:, 0]
submission

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.028507
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.001098
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.072858
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.243074
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.832472
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.006105
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.641984
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.603449
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.441897


In [81]:
# Write the submission file; index=False leaves the row index out of the CSV.
submission.to_csv("submission_cnn_embedded.csv", index=False)