Based on github.com/codyznash/GANs_for_Credit_Card_Data

Install requirements with:

    pip install numpy pandas matplotlib tensorflow-gpu keras

In [1]:
import pandas as pd

import pickle
import sys
sys.path.append('1_PIF')
import klcalculator

features = ['gender', 'AGE', 'POSTCODE', 'blood_group', 'eye_color', 'job']
data = pd.read_csv('1_PIF/our-synthetic.csv')[features]
data = data.fillna('Unemployed')
dataset = list(zip(*(data[fn].tolist() for fn in features)))

In [2]:
data.head(10)

Unnamed: 0,gender,AGE,POSTCODE,blood_group,eye_color,job
0,F,99,2649,B-,Brown,"Psychologist, counselling"
1,M,108,1780,A-,Hazel,Personnel officer
2,M,59,2940,B+,Hazel,Tourism officer
3,M,58,2945,B+,Blue,Make
4,M,30,2729,AB-,Brown,Forest/woodland manager
5,M,6,2531,A-,Grey,Unemployed
6,F,104,839,B+,Brown,Youth worker
7,M,11,2901,AB-,Grey,Unemployed
8,F,105,846,A-,Green,Camera operator
9,M,115,2912,A+,Grey,Retail merchandiser


In [3]:
# Encode categorical data.
# See https://towardsdatascience.com/encoding-categorical-features-21a2651a065c.

data = pd.get_dummies(data, drop_first=True)
data.head()

Unnamed: 0,AGE,POSTCODE,gender_M,blood_group_A-,blood_group_AB+,blood_group_AB-,blood_group_B+,blood_group_B-,blood_group_O+,blood_group_O-,...,job_Volunteer coordinator,job_Warden/ranger,job_Warehouse manager,job_Waste management officer,job_Water engineer,job_Water quality scientist,job_Web designer,job_Wellsite geologist,job_Writer,job_Youth worker
0,99,2649,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,108,1780,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,59,2940,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,58,2945,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30,2729,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data_cols = [i for i in data.columns]
data_dim = len(data_cols)

In [5]:
def get_data_batch(train, batch_size, seed=0):
    # # random sampling - some samples will have excessively low or high sampling, but easy to implement
    # np.random.seed(seed)
    # x = train.loc[ np.random.choice(train.index, batch_size) ].values
    
    # iterate through shuffled indices, so every sample gets covered evenly
    start_i = (batch_size * seed) % len(train)
    stop_i = start_i + batch_size
    shuffle_seed = (batch_size * seed) // len(train)
    np.random.seed(shuffle_seed)
    train_ix = np.random.choice( list(train.index), replace=False, size=len(train) ) # wasteful to shuffle every time
    train_ix = list(train_ix) + list(train_ix) # duplicate to cover ranges past the end of the set
    x = train.loc[ train_ix[ start_i: stop_i ] ].values
    
    return np.reshape(x, (batch_size, -1) )



In [6]:
from keras import applications
from keras import backend as K
from keras import layers
from keras import models
from keras import optimizers

import tensorflow as tf
import numpy as np

Using TensorFlow backend.


# Generator Definition



In [7]:
def generator_network(x, data_dim, base_n_count): 
    x = layers.Dense(base_n_count, activation='relu')(x)
    x = layers.Dense(base_n_count*2, activation='relu')(x)
    x = layers.Dense(base_n_count*4, activation='relu')(x)
    x = layers.Dense(data_dim)(x)    
    return x


# Discriminator Definition

In [8]:
# def discriminator_network(x, data_dim, base_n_count):
#     x = layers.Dense(base_n_count*4, activation='relu')(x)
#     # x = layers.Dropout(0.1)(x)
#     x = layers.Dense(base_n_count*2, activation='relu')(x)
#     # x = layers.Dropout(0.1)(x)
#     x = layers.Dense(base_n_count, activation='relu')(x)
#     x = layers.Dense(1, activation='sigmoid')(x)
#     # x = layers.Dense(1)(x)
#     return x
    
def critic_network(x, data_dim, base_n_count):
    x = layers.Dense(base_n_count*4, activation='relu')(x)
    # x = layers.Dropout(0.1)(x)
    x = layers.Dense(base_n_count*2, activation='relu')(x) # 2
    # x = layers.Dropout(0.1)(x)
    x = layers.Dense(base_n_count*1, activation='relu')(x) # 1
    # x = layers.Dense(base_n_count*4, activation='relu')(x) # extra
    # x = layers.Dense(base_n_count*4, activation='relu')(x) # extra
    # x = layers.Dense(1, activation='sigmoid')(x)
    x = layers.Dense(1)(x)
    return x


In [9]:
def define_models_GAN(rand_dim, data_dim, base_n_count, type=None):
    generator_input_tensor = layers.Input(shape=(rand_dim, ))
    generated_image_tensor = generator_network(generator_input_tensor, data_dim, base_n_count)

    generated_or_real_image_tensor = layers.Input(shape=(data_dim,))
    

    discriminator_output = critic_network(generated_or_real_image_tensor, data_dim, base_n_count)
    
    generator_model = models.Model(inputs=[generator_input_tensor], outputs=[generated_image_tensor], name='generator')
    discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
                                       outputs=[discriminator_output],
                                       name='discriminator')

    combined_output = discriminator_model(generator_model(generator_input_tensor))
    combined_model = models.Model(inputs=[generator_input_tensor], outputs=[combined_output], name='combined')
    
    return generator_model, discriminator_model, combined_model


In [10]:
rand_dim = 512 # needs to be ~data_dim
base_n_count = 128 # 128
num_features = data_dim

g, d, c = define_models_GAN(rand_dim, num_features, base_n_count)

W0703 15:02:53.150067 14068 deprecation_wrapper.py:119] From c:\users\tho802\.virtualenvs\venvs\acs\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0703 15:02:53.174069 14068 deprecation_wrapper.py:119] From c:\users\tho802\.virtualenvs\venvs\acs\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0703 15:02:53.180070 14068 deprecation_wrapper.py:119] From c:\users\tho802\.virtualenvs\venvs\acs\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [11]:
np.random.seed(42)

z = np.random.normal(size=(1, rand_dim))

synthetic_observation = g.predict(z)

synthetic_observation.shape

W0703 15:02:53.439073 14068 deprecation_wrapper.py:119] From c:\users\tho802\.virtualenvs\venvs\acs\lib\site-packages\keras\backend\tensorflow_backend.py:2741: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0703 15:02:53.443069 14068 deprecation_wrapper.py:119] From c:\users\tho802\.virtualenvs\venvs\acs\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.



(1, 654)

In [12]:
d.predict(synthetic_observation)

array([[0.03507951]], dtype=float32)

In [13]:
def training_steps_WGAN(model_components):
    
    [ cache_prefix, starting_step,
                        train, data_cols, data_dim,
                        label_dim,
                        generator_model, discriminator_model, combined_model,
                        rand_dim, nb_steps, batch_size, 
                        k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,
                        data_dir, generator_model_path, discriminator_model_path,

                        sess, _z, _x, _labels, _g_z, epsilon, x_hat, gradients, _gradient_penalty,
                        _disc_loss_generated, _disc_loss_real, _disc_loss, disc_optimizer,
                        show,
                        combined_loss, disc_loss_generated, disc_loss_real, pifs
                        ] = model_components
    
    for i in range(starting_step, starting_step+nb_steps):
        K.set_learning_phase(1) # 1 = train

        # train the discriminator
        for j in range(k_d):
            d_l_g, d_l_r = train_discriminator_step(model_components, seed=i+j)
        disc_loss_generated.append(d_l_g)
        disc_loss_real.append(d_l_r)

        # train the generator
        for j in range(k_g):
            np.random.seed(i+j)
            z = np.random.normal(size=(batch_size, rand_dim))

            loss = combined_model.train_on_batch(z, [-np.ones(batch_size)])
        combined_loss.append(loss)

        # Determine PIF loss each step, after training generator and discriminator
        if not i % 10: # 2x faster than testing each step...
            K.set_learning_phase(0) # 0 = test
            test_size = 492 # test using all of the actual fraud data
            x = get_data_batch(train, test_size, seed=i)
            z = np.random.normal(size=(test_size, rand_dim))

            g_z = generator_model.predict(z)
            
            #TODO Evaluate PIF and/or Utility measures on g_z
            #risks = klcalculator.find_risks_for_records(g_z)
            #pif95 = klcalculator.percentile(klcalculator.find_individual_risks(risks), 95)
            #gb_loss = CheckAccuracy( x, g_z, data_cols, seed=0, data_dim=data_dim )
            #pifs = np.append(pifs, pif95)
        
        if not i % log_interval:
            print('Step: {} of {}.'.format(i, starting_step + nb_steps))
            # K.set_learning_phase(0) # 0 = test
                        
            # loss summaries   
            print( 'Losses: G, D Gen, D Real, {:.4f}, {:.4f}, {:.4f}, '.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1]) )
            print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )
            # print('Generator model loss: {}.'.format(combined_loss[-1]))
            # print('Discriminator model loss gen: {}.'.format(disc_loss_generated[-1]))
            # print('Discriminator model loss real: {}.'.format(disc_loss_real[-1]))
            
            if show:
                PlotData( x, g_z, data_cols, seed=0, data_dim=data_dim, 
                            save=False, prefix= data_dir + cache_prefix + '_' + str(i) )

            # save model checkpoints
            model_checkpoint_base_name = data_dir + cache_prefix + '_{}_model_weights_step_{}.h5'
            generator_model.save_weights(model_checkpoint_base_name.format('generator', i))
            discriminator_model.save_weights(model_checkpoint_base_name.format('discriminator', i))
            pickle.dump([combined_loss, disc_loss_generated, disc_loss_real, pifs], 
                open( data_dir + cache_prefix + '_losses_step_{}.pkl'.format(i) ,'wb'))
    
    return [combined_loss, disc_loss_generated, disc_loss_real, pifs]


In [14]:
def adversarial_training_WGAN(arguments, train, data_cols, seed=0, starting_step=0):

    [rand_dim, nb_steps, batch_size, 
             k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,
            data_dir, generator_model_path, discriminator_model_path, loss_pickle_path, show ] = arguments
            
    np.random.seed(seed)     # set random seed
    
    data_dim = len(data_cols)
    print('data_dim: ', data_dim)
    print('data_cols: ', data_cols)
    
    label_dim = 0
    
    # define network models
    
    K.set_learning_phase(1) # 1 = train
    
    cache_prefix = 'WGAN'
    generator_model, discriminator_model, combined_model = define_models_GAN(rand_dim, data_dim, base_n_count, type='Wasserstein')
    
    # construct computation graph for calculating the gradient penalty (improved WGAN) and training the discriminator

    _z = tf.placeholder(tf.float32, shape=(batch_size, rand_dim))
    
    _labels = None
     
    _x = tf.placeholder(tf.float32, shape=(batch_size, data_dim))
    _g_z = generator_model(_z)
    
    epsilon = tf.placeholder(tf.float32, shape=(batch_size, 1))
    
    x_hat = epsilon * _x + (1.0 - epsilon) * _g_z
    gradients = tf.gradients(discriminator_model(x_hat), [x_hat])
    _gradient_penalty = 10.0 * tf.square(tf.norm(gradients[0], ord=2) - 1.0)

    # calculate discriminator's loss
    _disc_loss_generated = em_loss(tf.ones(batch_size), discriminator_model(_g_z))
    _disc_loss_real = em_loss(tf.ones(batch_size), discriminator_model(_x))
    _disc_loss = _disc_loss_generated - _disc_loss_real + _gradient_penalty

    # update f by taking an SGD step on mini-batch loss LD(f)
    disc_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, beta2=0.9).minimize(
        _disc_loss, var_list=discriminator_model.trainable_weights)

    sess = K.get_session()

    # compile models

    adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

    discriminator_model.trainable = False
    combined_model.compile(optimizer=adam, loss=[em_loss])

    combined_loss, disc_loss_generated, disc_loss_real, pifs = [], [], [], []
    
    model_components = [ cache_prefix, starting_step,
                        train, data_cols, data_dim,
                        label_dim,
                        generator_model, discriminator_model, combined_model,
                        rand_dim, nb_steps, batch_size, 
                        k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,
                        data_dir, generator_model_path, discriminator_model_path,

                        sess, _z, _x, _labels, _g_z, epsilon, x_hat, gradients, _gradient_penalty,
                        _disc_loss_generated, _disc_loss_real, _disc_loss, disc_optimizer,
                        show,
                        combined_loss, disc_loss_generated, disc_loss_real, pifs
                        ]

    if show:
        print(generator_model.summary())
        print(discriminator_model.summary())
        print(combined_model.summary())

    if loss_pickle_path:
        print('Loading loss pickles')
        [combined_loss, disc_loss_generated, disc_loss_real, pifs] = pickle.load(open(loss_pickle_path,'rb'))
    if generator_model_path:
        print('Loading generator model')
        generator_model.load_weights(generator_model_path) #, by_name=True)
    if discriminator_model_path:
        print('Loading discriminator model')
        discriminator_model.load_weights(discriminator_model_path) #, by_name=True)
    else:
        print('pre-training the critic...')
        K.set_learning_phase(1) # 1 = train
        for i in range(critic_pre_train_steps):
            if i%20==0:
                print('Step: {} of {} critic pre-training.'.format(i, critic_pre_train_steps))
            loss = train_discriminator_step(model_components, seed=i)
        print('Last batch of critic pre-training disc_loss: {}.'.format(loss))

    model_components = [ cache_prefix, starting_step,
                        train, data_cols, data_dim,
                        label_dim,
                        generator_model, discriminator_model, combined_model,
                        rand_dim, nb_steps, batch_size, 
                        k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,
                        data_dir, generator_model_path, discriminator_model_path,

                        sess, _z, _x, _labels, _g_z, epsilon, x_hat, gradients, _gradient_penalty,
                        _disc_loss_generated, _disc_loss_real, _disc_loss, disc_optimizer,
                        show,
                        combined_loss, disc_loss_generated, disc_loss_real, pifs
                        ]
        
    [combined_loss, disc_loss_generated, disc_loss_real, pifs] = training_steps_WGAN(model_components)
   

In [15]:
def train_discriminator_step(model_components, seed=0):
    
    [ cache_prefix, starting_step,
                        train, data_cols, data_dim,
                        label_dim,
                        generator_model, discriminator_model, combined_model,
                        rand_dim, nb_steps, batch_size, 
                        k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,
                        data_dir, generator_model_path, discriminator_model_path,

                        sess, _z, _x, _labels, _g_z, epsilon, x_hat, gradients, _gradient_penalty,
                        _disc_loss_generated, _disc_loss_real, _disc_loss, disc_optimizer,
                        show,
                        combined_loss, disc_loss_generated, disc_loss_real, pifs
                        ] = model_components
    

    d_l_g, d_l_r, _ = sess.run([_disc_loss_generated, _disc_loss_real, disc_optimizer], feed_dict={
        _z: np.random.normal(size=(batch_size, rand_dim)),
        _x: get_data_batch(train, batch_size, seed=seed),
        epsilon: np.random.uniform(low=0.0, high=1.0, size=(batch_size, 1))
    })
    return d_l_g, d_l_r


In [16]:
def em_loss(y_coefficients, y_pred):
    # define earth mover distance (wasserstein loss)
    # literally the weighted average of the critic network output
    # this is defined separately so it can be fed as a loss function to the optimizer in the WGANs
    return tf.reduce_mean(tf.multiply(y_coefficients, y_pred))


In [17]:
def BaseMetrics(y_pred,y_true):
    TP = np.sum( (y_pred == 1) & (y_true == 1) )
    TN = np.sum( (y_pred == 0) & (y_true == 0) )
    FP = np.sum( (y_pred == 1) & (y_true == 0) )
    FN = np.sum( (y_pred == 0) & (y_true == 1) )
    return TP, TN, FP, FN

def SimpleMetrics(y_pred,y_true):
    TP, TN, FP, FN = BaseMetrics(y_pred,y_true)
    ACC = ( TP + TN ) / ( TP + TN + FP + FN )
    
    # Reporting
    from IPython.display import display
    print( 'Confusion Matrix')
    display( pd.DataFrame( [[TN,FP],[FN,TP]], columns=['Pred 0','Pred 1'], index=['True 0', 'True 1'] ) )
    print( 'Accuracy : {}'.format( ACC ))
    
def SimpleAccuracy(y_pred,y_true):
    TP, TN, FP, FN = BaseMetrics(y_pred,y_true)
    ACC = ( TP + TN ) / ( TP + TN + FP + FN )
    return ACC


def CheckAccuracy( x, g_z, data_cols, label_cols=[], seed=0, with_class=False, data_dim=2 ):

    dtrain = np.vstack( [ x[:int(len(x)/2)], g_z[:int(len(g_z)/2)] ] ) # Use half of each real and generated set for training
    dlabels = np.hstack( [ np.zeros(int(len(x)/2)), np.ones(int(len(g_z)/2)) ] ) # synthetic labels
    dtest = np.vstack( [ x[int(len(x)/2):], g_z[int(len(g_z)/2):] ] ) # Use the other half of each set for testing
    y_true = dlabels # Labels for test samples will be the same as the labels for training samples, assuming even batch sizes
    
    

    # return '{:.2f}'.format(SimpleAccuracy(y_pred, y_true)) # assumes balanced real and generated datasets
    return SimpleAccuracy(y_pred, y_true) # assumes balanced real and generated datasets


In [18]:
nb_steps = 1000 + 1 # 50000 # Add one for logging of the last interval
batch_size = 128 # 64

critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
learning_rate = 5e-4 # 5e-5
data_dir = 'cache/'
generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None

show = False

In [19]:
%%time


# Training the WGAN architectures
k_d = 5  # number of discriminator network updates per adversarial training step
k_g = 1  # number of generator network updates per adversarial training step

learning_rate = 1e-4
arguments = [rand_dim, nb_steps, batch_size, 
             k_d, k_g, critic_pre_train_steps, log_interval, learning_rate, base_n_count,
            data_dir, generator_model_path, discriminator_model_path, loss_pickle_path, show ]

adversarial_training_WGAN(arguments, data, data_cols=data_cols ) # WGAN


data_dim:  654
data_cols:  ['AGE', 'POSTCODE', 'gender_M', 'blood_group_A-', 'blood_group_AB+', 'blood_group_AB-', 'blood_group_B+', 'blood_group_B-', 'blood_group_O+', 'blood_group_O-', 'eye_color_Brown', 'eye_color_Green', 'eye_color_Grey', 'eye_color_Hazel', 'job_Accommodation manager', 'job_Accountant, chartered', 'job_Accountant, chartered certified', 'job_Accountant, chartered management', 'job_Accountant, chartered public finance', 'job_Accounting technician', 'job_Actor', 'job_Actuary', 'job_Acupuncturist', 'job_Administrator', 'job_Administrator, Civil Service', 'job_Administrator, arts', 'job_Administrator, charities/voluntary organisations', 'job_Administrator, education', 'job_Administrator, local government', 'job_Administrator, sports', 'job_Adult guidance worker', 'job_Adult nurse', 'job_Advertising account executive', 'job_Advertising account planner', 'job_Advertising art director', 'job_Advertising copywriter', 'job_Advice worker', 'job_Aeronautical engineer', 'job_Ag

W0703 15:02:54.416075 14068 deprecation_wrapper.py:119] From c:\users\tho802\.virtualenvs\venvs\acs\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



pre-training the critic...
Step: 0 of 100 critic pre-training.
Step: 20 of 100 critic pre-training.
Step: 40 of 100 critic pre-training.
Step: 60 of 100 critic pre-training.
Step: 80 of 100 critic pre-training.
Last batch of critic pre-training disc_loss: (0.49513128, 3756.2139).
Step: 0 of 1001.
Losses: G, D Gen, D Real, -0.7345, 0.6432, 3828.4780, 
D Real - D Gen: 3827.8347
Step: 100 of 1001.
Losses: G, D Gen, D Real, -32.6934, 32.3017, 4418.7104, 
D Real - D Gen: 4386.4087
Step: 200 of 1001.
Losses: G, D Gen, D Real, -166.5651, 163.1503, 3984.5056, 
D Real - D Gen: 3821.3555
Step: 300 of 1001.
Losses: G, D Gen, D Real, -458.4257, 474.8564, 3769.2004, 
D Real - D Gen: 3294.3440
Step: 400 of 1001.
Losses: G, D Gen, D Real, -856.8681, 851.8993, 3320.0979, 
D Real - D Gen: 2468.1987
Step: 500 of 1001.
Losses: G, D Gen, D Real, -1135.9578, 1087.4175, 2373.0342, 
D Real - D Gen: 1285.6167
Step: 600 of 1001.
Losses: G, D Gen, D Real, -871.3438, 858.2996, 1076.8633, 
D Real - D Gen: 218.563

In [21]:
z = np.random.normal(size=(1, rand_dim))

synthetic_observation = g.predict(z)

In [22]:
synthetic_observation

array([[ 2.50482947e-01,  1.39617950e-01, -1.18425049e-01,
        -5.44338822e-01,  3.90498042e-01,  3.93703043e-01,
        -1.90173417e-01, -2.09484681e-01, -3.27937752e-01,
        -1.04589816e-02,  6.28266275e-01,  4.88190874e-02,
         3.98725510e-01,  1.90855712e-02, -1.15017846e-01,
        -2.56384075e-01, -2.19300091e-02,  1.98945895e-01,
        -5.93217090e-03, -5.59039414e-01, -2.66822398e-01,
         1.38643920e-01,  4.13359731e-01, -6.19796693e-01,
         4.30440724e-01,  2.10232899e-01,  1.34511665e-01,
        -4.83408809e-01, -3.27984512e-01,  1.12726137e-01,
        -2.19285131e-01,  2.99115390e-01, -3.96695852e-01,
        -4.18621391e-01,  7.21786022e-02, -2.96919733e-01,
         1.86972827e-01, -3.83603156e-01, -6.98287427e-01,
         1.29774407e-01,  1.11278825e-01,  3.13881278e-01,
         1.75425947e-01,  2.21034110e-01,  4.08779591e-01,
         6.39131889e-02, -4.05631125e-01, -1.67922407e-01,
        -2.99466848e-01, -6.71931803e-02,  1.64604187e-0