In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

import tensorflow as tf
import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from tensorflow.keras import regularizers

from tqdm.notebook import tqdm

In [3]:
np.random.seed(1)
tf.random.set_seed(1)
data_root = r'/content/drive/My Drive/data/card/creditcard.csv'
TARGET_NAME = 'Class'

In [4]:
df = pd.read_csv(data_root)
df.drop_duplicates(inplace=True)

In [5]:
df_test = df.values

unique_samples = []
unique_count = np.zeros_like(df_test)
for feature in range(df_test.shape[1]):
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1

real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))

273052
10674


In [6]:
features = [c for c in df.columns if c not in ['Time', 'Amount', 'Class']]
print(len(features))

28


In [7]:
import scipy.ndimage

sigma_fac = 0.001
sigma_base = 4

eps = 0.00000001

def get_count(X_all, X_fake):
    features_count = np.zeros((X_all.shape[0], len(features)))
    features_density = np.zeros((X_all.shape[0], len(features)))
    features_deviation = np.zeros((X_all.shape[0], len(features)))

    features_count_fake = np.zeros((X_fake.shape[0], len(features)))
    features_density_fake = np.zeros((X_fake.shape[0], len(features)))
    features_deviation_fake = np.zeros((X_fake.shape[0], len(features)))
    sigmas = []

    for i,var in enumerate(tqdm(features)):
        X_all_var_int = (X_all[var].values * 10000).round().astype(int)
        X_fake_var_int = (X_fake[var].values * 10000).round().astype(int)
        lo = X_all_var_int.min()
        X_all_var_int -= lo
        X_fake_var_int -= lo
        hi = X_all_var_int.max()+1
        counts_all = np.bincount(X_all_var_int, minlength=hi).astype(float)
        zeros = (counts_all == 0).astype(int)
        before_zeros = np.concatenate([zeros[1:],[0]])
        indices_all = np.arange(counts_all.shape[0])
        # Geometric mean of twice sigma_base and a sigma_scaled which is scaled to the length of array 
        sigma_scaled = counts_all.shape[0]*sigma_fac
        sigma = np.power(sigma_base * sigma_base * sigma_scaled, 1/3)
        sigmas.append(sigma)
        counts_all_smooth = scipy.ndimage.filters.gaussian_filter1d(counts_all, sigma)
        deviation = counts_all / (counts_all_smooth+eps)
        indices = X_all_var_int
        features_count[:,i] = counts_all[indices]
        features_density[:,i] = counts_all_smooth[indices]
        features_deviation[:,i] = deviation[indices]
        indices_fake = X_fake_var_int
        features_count_fake[:,i] = counts_all[indices_fake]
        features_density_fake[:,i] = counts_all_smooth[indices_fake]
        features_deviation_fake[:,i] = deviation[indices_fake]
        
    features_count_names = [var+'_count' for var in features]
    features_density_names = [var+'_density' for var in features]
    features_deviation_names = [var+'_deviation' for var in features]
    X_all_count = pd.DataFrame(columns=features_count_names, data = features_count)
    X_all_count.index = X_all.index
    X_all_density = pd.DataFrame(columns=features_density_names, data = features_density)
    X_all_density.index = X_all.index
    X_all_deviation = pd.DataFrame(columns=features_deviation_names, data = features_deviation)
    X_all_deviation.index = X_all.index
    X_all = pd.concat([X_all,X_all_count, X_all_density, X_all_deviation], axis=1)
    
    X_fake_count = pd.DataFrame(columns=features_count_names, data = features_count_fake)
    X_fake_count.index = X_fake.index
    X_fake_density = pd.DataFrame(columns=features_density_names, data = features_density_fake)
    X_fake_density.index = X_fake.index
    X_fake_deviation = pd.DataFrame(columns=features_deviation_names, data = features_deviation_fake)
    X_fake_deviation.index = X_fake.index
    X_fake = pd.concat([X_fake,X_fake_count, X_fake_density, X_fake_deviation], axis=1)    

    features_count = features_count_names
    features_density = features_density_names
    features_deviation = features_deviation_names
    return X_all, features_count, features_density, features_deviation, X_fake

In [8]:
X_all, X_fake = df.iloc[list(real_samples_indexes), 1:-2], df.iloc[list(synthetic_samples_indexes), 1:-2]

In [9]:
X_num_all, features_count, features_density, features_deviation, X_num_fake = get_count(X_all, X_fake)

  0%|          | 0/28 [00:00<?, ?it/s]

In [10]:
X_train_real = pd.concat([X_num_all, df.iloc[list(real_samples_indexes), [0,-2,-1]]], axis=1)
X_train_fake = pd.concat([X_num_fake, df.iloc[list(synthetic_samples_indexes), [0,-2,-1]]], axis=1)

In [11]:
features_to_scale = [features, features_count, ['Time', 'Amount']]

def get_standardized(X_all, X_fake):
    scaler = StandardScaler()
    features_to_scale_flatten = [var for sublist in features_to_scale for var in sublist]
    scaler.fit(X_all[features_to_scale_flatten])
    features_scaled = scaler.transform(X_all[features_to_scale_flatten])
    features_scaled_fake = scaler.transform(X_fake[features_to_scale_flatten])
    X_all[features_to_scale_flatten] = features_scaled
    X_fake[features_to_scale_flatten] = features_scaled_fake
    return X_all, X_fake

In [12]:
X_train_real, X_train_fake = get_standardized(X_train_real, X_train_fake)

In [13]:
X_real, y_real = X_train_real.drop(TARGET_NAME, axis=1), X_train_real[TARGET_NAME]
X_fake, y_fake = X_train_fake.drop(TARGET_NAME, axis=1), X_train_fake[TARGET_NAME]

In [49]:
y_real.value_counts(normalize=True)

0    0.998268
1    0.001732
Name: Class, dtype: float64

In [51]:
y_fake.value_counts()  # as all classes here are 0 we may just drop these samples as some kind of undersampling

0    10674
Name: Class, dtype: int64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_real, y_real, test_size=0.25, shuffle=True, stratify=y_real, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15, shuffle=True, stratify=y_train, random_state=42)
print(f'Train size: {X_train.shape[0]}, valid size: {X_valid.shape[0]}, test size: {X_test.shape[0]}')

Train size: 174070, valid size: 30719, test size: 68263


In [15]:
buffer_size = 100000
batch_size = 1024

In [30]:
model = tf.keras.Sequential([
                            Dense(256, activation='relu', 
                                  input_shape=(X_train.shape[-1], ), 
                                  kernel_regularizer=regularizers.l2(0.001)),
                            Dropout(0.5),
                            Dense(128, activation='relu',
                                  kernel_regularizer=regularizers.l2(0.001)),
                            Dropout(0.5),
                            Dense(64, activation='relu',
                                  kernel_regularizer=regularizers.l2(0.001)),
                            Dropout(0.5),
                            Dense(1, activation='sigmoid')
])
metrics = [
           tf.keras.metrics.Precision(name='precision'),
           tf.keras.metrics.Recall(name='recall'),
           tf.keras.metrics.AUC(name='roc_auc')
]

model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='binary_crossentropy', metrics=metrics)

callbacks = [tf.keras.callbacks.ModelCheckpoint('model_epoch_{epoch}.h5'),
             tf.keras.callbacks.ReduceLROnPlateau(patience=5),
             tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ]
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 256)               29440     
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 128)               32896     
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                

In [17]:
bool_train_labels = y_train != 0
pos_features = X_train[bool_train_labels]
neg_features = X_train[~bool_train_labels]
pos_labels = y_train[bool_train_labels]
neg_labels = y_train[~bool_train_labels]

In [18]:
ids = np.arange(len(pos_features))
choices = np.random.choice(ids, int(len(neg_features)*0.4)) 

res_pos_features = pos_features.values[choices]
res_pos_labels = pos_labels.values[choices]

res_pos_features.shape

(69507, 114)

In [19]:
resampled_features = np.concatenate([res_pos_features, neg_features], axis=0)
resampled_labels = np.concatenate([res_pos_labels, neg_labels], axis=0)

order = np.arange(len(resampled_labels))
np.random.shuffle(order)
resampled_features = resampled_features[order]
resampled_labels = resampled_labels[order]

resampled_features.shape

(243275, 114)

In [20]:
pos = resampled_labels[resampled_labels == 1].astype(int).sum()
neg = resampled_labels.shape[0] - pos
total = resampled_features.shape[0]
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}
class_weight

{0: 0.6999994245200497, 1: 1.7500035967600387}

In [21]:
resampled_steps_per_epoch = np.ceil((neg+pos)/batch_size)
resampled_steps_per_epoch

238.0

In [22]:
output_layer = model.layers[-1] 
#output_layer.bias.assign([0]) # as if we use balanced dataset
output_bias = tf.keras.initializers.Constant(np.log([pos/neg])) # as if dataset is unbalanced
output_layer.bias_initializer = output_bias

In [23]:
train_ds = tf.data.Dataset.from_tensor_slices((resampled_features, resampled_labels)).cache().batch(batch_size).prefetch(0)
val_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).cache()
val_ds = val_ds.batch(batch_size).prefetch(2)

In [31]:
history = model.fit(train_ds, epochs=100, verbose=2, callbacks=callbacks,
                     validation_data=val_ds, class_weight=class_weight, 
                     steps_per_epoch=resampled_steps_per_epoch)

Epoch 1/100
238/238 - 3s - loss: 0.7670 - precision: 0.8181 - recall: 0.8675 - roc_auc: 0.9407 - val_loss: 0.4660 - val_precision: 0.1596 - val_recall: 0.8491 - val_roc_auc: 0.9612 - lr: 0.0010 - 3s/epoch - 13ms/step
Epoch 2/100
238/238 - 1s - loss: 0.3791 - precision: 0.9332 - recall: 0.9414 - roc_auc: 0.9923 - val_loss: 0.2837 - val_precision: 0.0869 - val_recall: 0.8491 - val_roc_auc: 0.9601 - lr: 0.0010 - 1s/epoch - 6ms/step
Epoch 3/100
238/238 - 1s - loss: 0.2547 - precision: 0.9567 - recall: 0.9849 - roc_auc: 0.9984 - val_loss: 0.2050 - val_precision: 0.1618 - val_recall: 0.8302 - val_roc_auc: 0.9661 - lr: 0.0010 - 1s/epoch - 6ms/step
Epoch 4/100
238/238 - 1s - loss: 0.1858 - precision: 0.9738 - recall: 0.9946 - roc_auc: 0.9993 - val_loss: 0.1551 - val_precision: 0.2810 - val_recall: 0.8113 - val_roc_auc: 0.9220 - lr: 0.0010 - 1s/epoch - 6ms/step
Epoch 5/100
238/238 - 1s - loss: 0.1420 - precision: 0.9809 - recall: 0.9967 - roc_auc: 0.9995 - val_loss: 0.1210 - val_precision: 0.35

In [32]:
k_pred_train = model.predict(X_train)
k_pred_test = model.predict(X_test)

In [33]:
k_train_label = np.where(k_pred_train >= 0.5, 1, 0).flatten()
k_test_label = np.where(k_pred_test >= 0.5, 1, 0).flatten()

In [34]:
print(classification_report(y_train, k_train_label, digits=3))
print(classification_report(y_test, k_test_label, digits=3))
print(f'roc_auc train: {roc_auc_score(y_train, k_pred_train):.3f}, roc_auc test: {roc_auc_score(y_test, k_pred_test):.3f}')

              precision    recall  f1-score   support

           0      1.000     1.000     1.000    173768
           1      0.926     1.000     0.962       302

    accuracy                          1.000    174070
   macro avg      0.963     1.000     0.981    174070
weighted avg      1.000     1.000     1.000    174070

              precision    recall  f1-score   support

           0      1.000     1.000     1.000     68145
           1      0.818     0.763     0.789       118

    accuracy                          0.999     68263
   macro avg      0.909     0.881     0.895     68263
weighted avg      0.999     0.999     0.999     68263

roc_auc train: 1.000, roc_auc test: 0.953


In [37]:
confusion_matrix(y_test, k_test_label)

array([[68125,    20],
       [   28,    90]])

In [39]:
y_test.value_counts(normalize=True)

0    0.998271
1    0.001729
Name: Class, dtype: float64

Experimaental

In [46]:
num_features = X_real.shape[1]
num_preds = 1

def get_model_cnn():
    inp = keras.layers.Input((num_features*num_preds,))
    x = keras.layers.Reshape((num_features*num_preds,1))(inp)
    x = keras.layers.Conv1D(32,num_preds,strides=num_preds, activation='elu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Conv1D(24,1, activation='elu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Conv1D(16,1, activation='elu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Conv1D(4,1, activation='elu')(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Reshape((num_features*4,1))(x)
    x = keras.layers.AveragePooling1D(2)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.BatchNormalization()(x)
    out = keras.layers.Dense(1, activation='sigmoid')(x)
    return keras.Model(inputs=inp, outputs=out)

cnn_model = get_model_cnn()
cnn_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 114)]             0         
                                                                 
 reshape_4 (Reshape)         (None, 114, 1)            0         
                                                                 
 conv1d_8 (Conv1D)           (None, 114, 32)           64        
                                                                 
 batch_normalization_8 (Batc  (None, 114, 32)          128       
 hNormalization)                                                 
                                                                 
 conv1d_9 (Conv1D)           (None, 114, 24)           792       
                                                                 
 batch_normalization_9 (Batc  (None, 114, 24)          96        
 hNormalization)                                           

In [45]:
pos = y_real[y_real == 1].astype(int).sum()
neg = y_real.shape[0] - pos
total = X_real.shape[0]
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}
class_weight

{0: 0.5008676383727286, 1: 288.6384778012685}

In [48]:
X_fold, X_test, y_fold, y_test = train_test_split(X_real, y_real, test_size=0.25, shuffle=True, stratify=y_real, random_state=42)

In [52]:
probs_train, probs_test = [], []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, valid_index in skf.split(X_fold, y_fold):
    X_train, X_valid = X_fold.iloc[train_index, :], X_fold.iloc[valid_index, :]
    y_train, y_valid = y_fold.iloc[train_index], y_fold.iloc[valid_index]

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).cache().batch(batch_size).prefetch(0)
    val_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).cache()
    val_ds = val_ds.batch(batch_size).prefetch(2)

    cnn_model = get_model_cnn()
    optimizer = tf.keras.optimizers.Adam(lr=0.001, decay=0.00001)
    cnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics)
    cnn_model.fit(train_ds, epochs=100, verbose=0, callbacks=callbacks,
                     validation_data=val_ds, class_weight=class_weight, 
                     steps_per_epoch=np.ceil((X_train.shape[0])/batch_size))
    
    probs_train.append(cnn_model.predict(X_train))
    probs_test.append(cnn_model.predict(X_test))

In [61]:
test_probs = np.array(probs_test).squeeze().mean(axis=0)
test_labels = np.where(test_probs >=0.5, 1, 0)

In [62]:
print(classification_report(y_test, test_labels, digits=3))
print(f'roc_auc test: {roc_auc_score(y_test, test_probs):.3f}')

              precision    recall  f1-score   support

           0      1.000     0.995     0.997     68145
           1      0.228     0.847     0.359       118

    accuracy                          0.995     68263
   macro avg      0.614     0.921     0.678     68263
weighted avg      0.998     0.995     0.996     68263

roc_auc test: 0.974
