In [None]:
%matplotlib inline
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE
from tqdm import tqdm_notebook as tqdm
from keras.models import Model
from keras.layers import Input, Reshape , concatenate
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import UpSampling1D, Conv1D
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam, SGD
from keras.callbacks import TensorBoard
from keras.layers import Input, Dense, Reshape, Flatten, Embedding, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.utils.generic_utils import Progbar
from sklearn.metrics import  confusion_matrix

# Seed NumPy's global random generator so NumPy-driven random draws are repeatable.
np.random.seed(1337)
# NOTE(review): num_classes is not referenced anywhere else in the visible notebook — confirm it is still needed.
num_classes = 10

In [None]:
from sklearn.preprocessing import StandardScaler

# Load the raw transactions.
data = pd.read_csv('creditcard.csv')

# Rows with Class == 0 only, without the Time and Class columns.
data_x = data.loc[data.Class == 0].drop(['Time', 'Class'], axis=1)

# Standardise Amount into a new column, then drop the raw Amount and Time columns.
amount_column = data['Amount'].values.reshape(-1, 1)
data['normalizedAmount'] = StandardScaler().fit_transform(amount_column)
data = data.drop(['Amount', 'Time'], axis=1)


In [None]:
## Model Training for Classification
from sklearn.model_selection import train_test_split

# The label is kept as a one-column frame; the features are every other column.
y = data[['Class']]
X = data.drop('Class', axis=1)

# Hold out 30% of the rows for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
"""
VISUALISATION

"""

In [None]:
# Display the column labels of the feature frame X.
X.columns

In [None]:
# Plot the data by each feature
#
# One histogram panel per column of `data`, comparing the Class == 0 rows
# (non-fraud) with the Class == 1 rows (fraud), laid out 4 panels per row.

columns = 4
rows = int(np.ceil(len(data.columns) / columns))

# A bare figure is enough: every panel is created by subplot2grid below, so no
# full-figure Axes is created up front.
f = plt.figure(figsize=(columns * 4, rows * 3))
f.suptitle('Distribution Plots', size=16)

axarr = []
for i, col in enumerate(data.columns):
    ax = plt.subplot2grid((rows, columns), (i // columns, i % columns))
    axarr.append(ax)

    # Restrict the bins to the 0.1 - 99.9 percentile range of the column so
    # extreme values do not stretch the x-axis.
    bins = np.linspace(np.percentile(data[col], 0.1), np.percentile(data[col], 99.9), 30)
    ax.hist(
        [data.loc[data.Class == 0, col], data.loc[data.Class == 1, col]],
        label=['non-fraud', 'fraud'],
        bins=bins,
        density=True,  # replaces the removed `normed` keyword
    )
    ax.set_xlabel(col, size=12)
    ax.set_ylim([0, 0.8])
    ax.tick_params(axis='both', labelsize=10)

    if i == 0:
        # Only the first panel carries the legend.
        legend = ax.legend()
        legend.get_frame().set_facecolor('white')

    if i % columns != 0:
        # Only the left-most panel of each row keeps its y ticks and label.
        # tick_params expects booleans here, not the string 'off'.
        ax.tick_params(axis='y', left=False, labelleft=False)
    else:
        ax.set_ylabel('Values', size=12)

plt.tight_layout(rect=[0, 0, 1, 0.95])  # xmin, ymin, xmax, ymax
# plt.savefig('plots/Engineered_Data_Distributions.png')


In [None]:
# Feature importance
from sklearn import decomposition
from sklearn.ensemble import ExtraTreesClassifier
# Re-seed NumPy's global random generator for this section.
np.random.seed(77)

# Reload the raw data and standardise the Amount column in place.
df = pd.read_csv('creditcard.csv')
df.Amount = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))

# Features are every column except Class; the label stays a one-column frame.
is_label = df.columns == 'Class'
X_x = df.iloc[:, ~is_label]
y_y = df.iloc[:, is_label]
# Every column name except the last one.
X_names = df.columns.values[:-1]

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
forest.fit(X_x, y_y)

importances = forest.feature_importances_
# Standard deviation of each feature's importance across the individual trees.
per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)
# Feature positions ordered from the largest importance to the smallest.
indices = np.argsort(importances)[::-1]



In [None]:
# Print the feature ranking
print("Feature ranking:")

n_features = X_x.shape[1]
for rank in range(n_features):
    feature_pos = indices[rank]
    print("%d. feature %s (%f)" % (rank + 1, X_names[feature_pos], importances[feature_pos]))

# Plot the feature importances of the forest, with the per-tree spread as error bars
bar_positions = range(n_features)
plt.figure(figsize=(15, 5))
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="blue", yerr=std[indices], align="center")
plt.xticks(bar_positions, X_names[indices])
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Feature names ordered from the largest importance to the smallest.
print(X_names[indices])

In [None]:
# Top-ranked features.
# NOTE: despite the "top_4" names, the slice below keeps the first 6 names of the ranking.

top_4 = X_names[indices][:6]
print(top_4)

# Columns of df for the selected features.
X_top_4 = df[top_4]
print(X_top_4)

# Time, the selected feature columns and the Class label, concatenated column-wise.
top4df = pd.concat([df['Time'], X_top_4, y_y], axis= 1)

In [None]:
# Preview the first rows of the assembled frame.
top4df.head()

In [None]:
# Print the names of the six columns that follow the first column of top4df.
for col1 in top4df.columns.values[1:7]:
    print(col1)

In [None]:
'''

THIS IS WHERE THE PREDICTION MODEL ENDS

NEXT: NEURAL NETWORK MODEL

'''

In [None]:
# Group the selected-feature frame by class label.
# The GroupBy object gets its own name: `data` must stay the preprocessed
# DataFrame, because later cells still filter it by Class, reorder its
# columns and plot from it.
top4_by_class = top4df.groupby('Class')
# NOTE(review): assumes 'V12' is one of the selected columns of top4df.
print(top4_by_class['V12'])

In [None]:
# Initialising the ANN as one stack:
# 29 inputs -> 20 -> 15 -> 5 (all ReLU) -> 1 sigmoid output
classifier = Sequential([
    Dense(units=20, kernel_initializer='uniform', activation='relu', input_dim=29),
    Dense(units=15, kernel_initializer='uniform', activation='relu'),
    Dense(units=5, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])

# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size=32, epochs=100)

In [None]:
# Predicting the Test set results, thresholding the sigmoid outputs at 0.5
y_pred = classifier.predict(X_test) > 0.5

# Loss and accuracy on the held-out rows
score = classifier.evaluate(X_test, y_test)
score

In [None]:
# Confusion matrix of the test labels against the thresholded predictions.
confusion_matrix(y_test, y_pred)

In [None]:
# Building the generator model
def get_generative(G_in, dense_dim=20, out_dim=29, lr=1e-3):
    """Build and compile the generator network.

    The input passes through a dense layer of ``dense_dim`` units followed by
    a separate tanh activation layer, then through a dense output layer of
    ``out_dim`` units that also uses tanh.

    Args:
        G_in: Keras input tensor fed to the generator.
        dense_dim: Number of units in the hidden dense layer.
        out_dim: Number of units in the output layer.
        lr: Learning rate handed to the SGD optimizer.

    Returns:
        Tuple ``(G, G_out)``: the compiled model and its output tensor.
    """
    hidden_linear = Dense(dense_dim)(G_in)
    hidden = Activation('tanh')(hidden_linear)
    G_out = Dense(out_dim, activation='tanh')(hidden)

    G = Model(G_in, G_out)
    G.compile(optimizer=SGD(lr=lr), loss='binary_crossentropy')
    return G, G_out

# Generator input: vectors of length 10.
G_in = Input(shape=(10,))
# Build the generator with its default sizes and show its layer summary.
G, G_out = get_generative(G_in)
G.summary()

In [None]:
 # Building the Discriminator model
 def get_discriminative(D_in, lr=1e-3, drate=.25, n_channels=20, conv_sz=3, leak=.2):
    """Build and compile the discriminator network.

    The input is reshaped to ``(length, 1)``, passed through a 1-D
    convolution with ReLU, dropout and flattening; the flattened features
    feed a dense layer of ``n_channels`` units and a 2-unit sigmoid output.

    Args:
        D_in: Keras input tensor fed to the discriminator.
        lr: Learning rate handed to the Adam optimizer.
        drate: Dropout rate applied after the convolution.
        n_channels: Number of convolution filters and of hidden dense units.
        conv_sz: Kernel size of the convolution.
        leak: Not used by this function; kept so existing calls keep working.

    Returns:
        Tuple ``(D, D_out)``: the compiled model and its output tensor.
    """
    x = Reshape((-1, 1))(D_in)
    x = Conv1D(n_channels, conv_sz, activation='relu')(x)
    x = Dropout(drate)(x)
    x = Flatten()(x)
    # Feed the flattened convolutional features into the dense head. Applying
    # this layer to D_in instead would discard the four layers above.
    x = Dense(n_channels)(x)
    D_out = Dense(2, activation='sigmoid')(x)
    D = Model(D_in, D_out)
    dopt = Adam(lr=lr)
    D.compile(loss='binary_crossentropy', optimizer=dopt)
    return D, D_out

# Discriminator input: vectors of length 29.
D_in = Input(shape=[29])
# Build the discriminator with its default settings and show its layer summary.
D, D_out = get_discriminative(D_in)
D.summary()

In [None]:
def set_trainability(model, trainable=False):
    """Assign the same ``trainable`` flag to a model and to each of its layers.

    Args:
        model: Object exposing a ``trainable`` attribute and a ``layers`` iterable.
        trainable: Value written to every ``trainable`` attribute.
    """
    for target in [model] + list(model.layers):
        target.trainable = trainable
        
def make_gan(GAN_in, G, D):
    """Chain the generator and the discriminator into one compiled model.

    The discriminator and its layers are marked non-trainable before the
    combined model is built. The combined model is compiled with the
    generator's optimizer and binary cross-entropy loss.

    Args:
        GAN_in: Keras input tensor fed to the generator.
        G: Compiled generator model.
        D: Discriminator model applied to the generator output.

    Returns:
        Tuple ``(GAN, GAN_out)``: the combined model and its output tensor.
    """
    set_trainability(D, False)
    GAN_out = D(G(GAN_in))
    GAN = Model(GAN_in, GAN_out)
    GAN.compile(optimizer=G.optimizer, loss='binary_crossentropy')
    return GAN, GAN_out

# Input of the combined model: vectors of length 10, matching the generator input.
GAN_in = Input([10])
# Chain G and D and show the layer summary of the combined model.
GAN, GAN_out = make_gan(GAN_in, G, D)
GAN.summary()

In [None]:
# Display the (rows, columns) shape of the training features.
X_train.shape

In [None]:
# Sample a labelled batch of real and generated rows for the discriminator
def sample_data_and_gen(G, n_samples=10000, noise_dim=10, real_data=None):
    """Return a labelled batch of real rows followed by generated rows.

    ``n_samples`` distinct rows are drawn from the real data, and ``n_samples``
    rows are produced by ``G.predict`` on uniform noise in [0, 1).

    Args:
        G: Object with a ``predict`` method mapping noise to generated rows.
        n_samples: Number of real rows and number of generated rows.
        noise_dim: Length of each noise vector given to ``G.predict``.
        real_data: Array-like of real rows. Defaults to the notebook-level
            ``X_train`` when omitted.

    Returns:
        Tuple ``(X, y)``. ``X`` stacks the real rows on top of the generated
        rows. ``y`` has shape ``(2 * n_samples, 2)``: column 1 is set for the
        real rows and column 0 for the generated rows.

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``n_samples`` exceeds
            the number of real rows.
    """
    source = X_train if real_data is None else real_data
    xx_train = np.array(source)
    # Draw from every available row rather than from a hard-coded row count.
    random_indices = np.random.choice(xx_train.shape[0], size=n_samples, replace=False)
    XT = xx_train[random_indices]
    XN_noise = np.random.uniform(0, 1, size=[n_samples, noise_dim])
    XN = G.predict(XN_noise)
    X = np.concatenate((XT, XN))
    y = np.zeros((2*n_samples, 2))
    y[:n_samples, 1] = 1  # real rows
    y[n_samples:, 0] = 1  # generated rows
    return X, y
# Pretrain the Discriminator on a mix of real and generated samples
def pretrain(G, D, noise_dim=10, n_samples=10000, batch_size=32):
    """Fit the discriminator on one labelled batch of real and generated rows.

    Args:
        G: Generator used to produce the generated half of the batch.
        D: Discriminator model to fit; it is made trainable first.
        noise_dim: Length of each noise vector given to the generator.
        n_samples: Number of real rows and number of generated rows.
        batch_size: Mini-batch size passed to ``D.fit``.
    """
    X, y = sample_data_and_gen(G, n_samples=n_samples, noise_dim=noise_dim)
    set_trainability(D, True)
    # Honour the caller's batch_size instead of a hard-coded value.
    D.fit(X, y, epochs=200, batch_size=batch_size)

# Run the discriminator pre-training with the default settings.
pretrain(G, D)

In [None]:
# Function used to sample from latent space
def sample_noise(G, noise_dim=10, n_samples=1000):
    """Draw uniform noise vectors together with their target labels.

    Args:
        G: Accepted for signature compatibility; not used.
        noise_dim: Length of each noise vector.
        n_samples: Number of noise vectors to draw.

    Returns:
        Tuple ``(X, y)``: ``X`` is uniform noise in [0, 1) of shape
        ``(n_samples, noise_dim)``; ``y`` has shape ``(n_samples, 2)`` with
        column 0 all zeros and column 1 all ones.
    """
    noise = np.random.uniform(low=0, high=1, size=(n_samples, noise_dim))
    labels = np.zeros((n_samples, 2)) + np.array([0, 1])
    return noise, labels

# Training the GAN model 
def train(GAN, G, D, epochs=400, n_samples=1000, noise_dim=10, batch_size=32, verbose=False, v_freq=50):
    """Alternate discriminator and generator updates for a number of epochs.

    Each epoch runs one ``train_on_batch`` step of the discriminator on a
    fresh batch of real and generated rows, then one ``train_on_batch`` step
    of the combined model on fresh noise while the discriminator is frozen.

    Args:
        GAN: Combined generator + discriminator model.
        G: Generator model.
        D: Discriminator model.
        epochs: Number of alternating update rounds.
        n_samples: Rows per class in the discriminator batch, and number of
            noise vectors in the generator batch.
        noise_dim: Length of each noise vector.
        batch_size: Not used by this function; kept so existing calls keep working.
        verbose: When true, show a progress bar and print losses periodically.
        v_freq: Print the losses every ``v_freq`` epochs when ``verbose``.

    Returns:
        Tuple ``(d_loss, g_loss)`` of per-epoch loss lists.
    """
    d_loss = []
    g_loss = []
    e_range = range(epochs)
    if verbose:
        e_range = tqdm(e_range)
    for epoch in e_range:
        # Discriminator step on real + generated rows.
        X, y = sample_data_and_gen(G, n_samples=n_samples, noise_dim=noise_dim)
        set_trainability(D, True)
        d_loss.append(D.train_on_batch(X, y))

        # Generator step through the combined model, with the discriminator frozen.
        X, y = sample_noise(G, n_samples=n_samples, noise_dim=noise_dim)
        set_trainability(D, False)
        g_loss.append(GAN.train_on_batch(X, y))

        if verbose and (epoch + 1) % v_freq == 0:
            print("Epoch #{}: Generative Loss: {}, Discriminative Loss: {}".format(epoch + 1, g_loss[-1], d_loss[-1]))
    return d_loss, g_loss

# Train with the default settings, showing a progress bar and periodic loss printouts.
d_loss, g_loss = train(GAN, G, D, verbose=True)

In [None]:
# Plotting the training loss of the generator and the discriminator (log-scaled y axis)

loss_history = pd.DataFrame({'Generative Loss': g_loss, 'Discriminative Loss': d_loss})
ax = loss_history.plot(title='Training loss', logy=True)
ax.set_xlabel("Epochs")
ax.set_ylabel("loss")

In [None]:
# NOTE(review): N_VIEWED_SAMPLES is not referenced anywhere else in the visible notebook.
N_VIEWED_SAMPLES = 2
# Draw 100000 real rows followed by 100000 generator outputs, plus their labels.
X_gen , y_gen = sample_data_and_gen(G, n_samples=100000)


In [None]:
# Keep only the generated rows, which sample_data_and_gen stacks after the
# 100000 real rows. Slicing from the end keeps this cell safe to re-run:
# a second run leaves the 100000 generated rows unchanged instead of
# emptying the array.
X_gen = X_gen[-100000:, :]
print(X_gen)

In [None]:
# Label for the generated/fake data: one column of zeros, one entry per generated row
n_generated = len(X_gen)
yg = np.zeros((n_generated, 1))
#print(yg)

In [None]:
# Predicted label: scores of the first classifier on the generated rows
y_pred = classifier.predict(X_gen)
# Scores at or above the 0.5 threshold
flagged_scores = y_pred[y_pred >= 0.5]
print("Number of data samples wrongly classified: ", len(flagged_scores))

In [None]:
# Number of generated samples the initial classifier scores at or above 0.5
scores_above_threshold = y_pred[y_pred >= 0.5]
print(len(scores_above_threshold))

In [None]:
# Evaluate the first classifier on the generated rows, all of which carry label 0 in yg
y_pred = (y_pred > 0.5)
score = classifier.evaluate(X_gen, yg)
print(score)

In [None]:
# Confusion matrix of the first classifier on the generated data.
# confusion_matrix takes the true labels first and the predictions second,
# as in the other confusion_matrix calls in this notebook.
confusion_matrix(yg, y_pred)

In [None]:
# Append the label column yg to the generated feature rows.
gen_data = np.hstack((X_gen, yg))

In [None]:
# extract column names: the feature names of X followed by "Class"
col_names = np.append(X.columns.values, ["Class"])
print(col_names)

In [None]:
# create dataframe of the generated data, labelled with the feature names plus Class
df_gen = pd.DataFrame(gen_data, columns=col_names)
# Every generated row gets class 0.
df_gen['Class'] = 0
df_gen.describe()

In [None]:
# switch positions of column tags
def df_column_switch(df, column1, column2):
    """Return ``df`` with the positions of two columns exchanged.

    Args:
        df: Source DataFrame; it is not modified.
        column1: Label of the first column.
        column2: Label of the second column.

    Returns:
        A DataFrame holding the same columns as ``df`` with ``column1`` and
        ``column2`` swapped in the column order.

    Raises:
        ValueError: If either label is not among the columns of ``df``.
    """
    order = list(df.columns)
    pos1 = order.index(column1)
    pos2 = order.index(column2)
    order[pos1], order[pos2] = order[pos2], order[pos1]
    return df[order]

# Swap the positions of the 'Class' and 'normalizedAmount' columns of data.
# NOTE(review): this cell is not idempotent — running it a second time swaps the two columns back.
data = df_column_switch(data, 'Class', 'normalizedAmount')


In [None]:
# Class == 1 rows of the real data.
fraud = data.loc[data.Class == 1]

# Give the generated rows their own class value (2) so they can be told apart
# from the real rows once both frames are stacked.
df_gen['Class'] = 2
gen_data = pd.concat([data, df_gen])

In [None]:
# Display the (rows, columns) shape of the stacked real + generated frame.
gen_data.shape

In [None]:
# Distribution plot of Generated data
#
# One histogram panel per column of `data`, comparing the Class == 0
# (non-fraud), Class == 1 (fraud) and Class == 2 (generated) rows of
# `gen_data`, laid out 4 panels per row.

columns = 4
rows = int(np.ceil(len(df_gen.columns) / columns))

# A bare figure is enough: every panel is created by subplot2grid below, so no
# full-figure Axes is created up front.
f = plt.figure(figsize=(columns * 4, rows * 3))
f.suptitle('Distribution Plots', size=16)

axarr = []
for i, col in enumerate(data.columns):
    ax = plt.subplot2grid((rows, columns), (i // columns, i % columns))
    axarr.append(ax)

    # Bins span the 0.1 - 99.9 percentile range of the real data for this column.
    bins = np.linspace(np.percentile(data[col], 0.1), np.percentile(data[col], 99.9), 40)
    ax.hist(
        [
            gen_data.loc[gen_data.Class == 0, col],
            gen_data.loc[gen_data.Class == 1, col],
            gen_data.loc[gen_data.Class == 2, col],
        ],
        label=['non-fraud', 'fraud', 'generated'],
        bins=bins,
        density=True,  # replaces the removed `normed` keyword
    )
    ax.set_xlabel(col, size=12)
    ax.set_ylim([0, 0.8])
    ax.tick_params(axis='both', labelsize=10)

    if i == 0:
        # Only the first panel carries the legend.
        legend = ax.legend()
        legend.get_frame().set_facecolor('white')

    if i % columns != 0:
        # Only the left-most panel of each row keeps its y ticks and label.
        # tick_params expects booleans here, not the string 'off'.
        ax.tick_params(axis='y', left=False, labelleft=False)
    else:
        ax.set_ylabel('Values', size=12)

plt.tight_layout(rect=[0, 0, 1, 0.95])  # xmin, ymin, xmax, ymax
# plt.savefig('plots/Engineered_Data_Distributions.png')


In [None]:
# Split the real data by class label: 1 -> fraud, 0 -> nonfraud.
is_fraud = data.Class == 1
is_nonfraud = data.Class == 0
fraud = data.loc[is_fraud]
nonfraud = data.loc[is_nonfraud]

In [None]:
# Set the class of every generated row to 0 (an earlier cell set it to 2).
df_gen.Class = 0

In [None]:
# dataset for the second classifier
# Real rows are sampled from the training split only: the second classifier is
# later scored on X_test / y_test, so sampling from the full dataset would put
# held-out rows into its training set.
train_pool = pd.concat([X_train, y_train], axis=1)
nonfraud_train = train_pool[train_pool.Class == 0]
fraud_train = train_pool[train_pool.Class == 1]
# 10000 non-fraud rows, 150 fraud rows and all generated rows.
train_gen = pd.concat([nonfraud_train.sample(10000), fraud_train.sample(150), df_gen])

In [None]:
# feature / label split for the second classifier model

# The label is kept as a one-column frame; the features are every other column.
y_train_new = train_gen[['Class']]
X_train_new = train_gen.drop('Class', axis=1)

In [None]:
# Print any training labels that still carry the value 2.
print(y_train_new[y_train_new.Class == 2])

In [None]:
# Initialising the ANN: 29 inputs -> 20 -> 15 -> 5 (all ReLU) -> 1 sigmoid output
classifier1 = Sequential()
for position, n_units in enumerate((20, 15, 5)):
    # Only the first layer declares the input width.
    first_layer_kwargs = {'input_dim': 29} if position == 0 else {}
    classifier1.add(Dense(units=n_units, kernel_initializer='uniform', activation='relu', **first_layer_kwargs))
classifier1.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
classifier1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# training a classifier on the dataset that includes the generated rows
classifier1.fit(X_train_new, y_train_new, batch_size=32, epochs=100)

In [None]:
# Predicting and evaluating on the held-out test set with the second classifier
y_pred = classifier1.predict(X_test)
# Threshold the sigmoid outputs at 0.5.
y_pred = (y_pred > 0.5)
score = classifier1.evaluate(X_test, y_test)
# Only the last expression of a cell is displayed, so print the score explicitly.
print(score)
confusion_matrix(y_test, y_pred)

In [None]:
# Feature importance for GAN model

# Re-seed NumPy's global random generator for this section.
np.random.seed(77)

# Reload the raw data and standardise the Amount column in place.
df = pd.read_csv('creditcard.csv')
df.Amount = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))

# Every column name of the augmented training frame except the last one.
X_names = train_gen.columns.values[:-1]

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
forest.fit(X_train_new, y_train_new)

importances = forest.feature_importances_
# Standard deviation of each feature's importance across the individual trees.
per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)
# Feature positions ordered from the largest importance to the smallest.
indices = np.argsort(importances)[::-1]

In [None]:
# Print the feature ranking
print("Feature ranking:")

# Size everything from the importances of the forest fitted on X_train_new.
# X_x is built from the raw CSV and still holds the Time column that was
# dropped from the training data, so X_x.shape[1] would overrun `indices`
# and mismatch the bar heights.
n_features = len(importances)

for rank in range(n_features):
    feature_pos = indices[rank]
    print("%d. feature %s (%f)" % (rank + 1, X_names[feature_pos], importances[feature_pos]))

# Plot the feature importances of the forest, with the per-tree spread as error bars
bar_positions = range(n_features)
plt.figure(figsize=(15, 5))
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
        color="blue", yerr=std[indices], align="center")
plt.xticks(bar_positions, X_names[indices])
plt.xlim([-1, n_features])
plt.show()