In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Keras. 

from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics

## For plotting
import matplotlib.pyplot as plt
from seaborn import set_style

## This sets the seaborn plot style to "white":
## a plain white background (use "whitegrid" to also draw a grid).
set_style("white")

## Define categorical function. This one-hot encodes integer class labels
## so they can be used as targets for the network.

def to_categorical(y, num_classes=None):
    """One-hot encode integer class labels.

    Thin wrapper around ``tf.keras.utils.to_categorical``.

    Parameters
    ----------
    y : array-like of int
        Class labels to encode.
    num_classes : int, optional
        Total number of classes. When omitted (the default, which keeps the
        previous behaviour) Keras infers it from the largest label in ``y``,
        so a split that happens to contain only class 0 would be encoded
        with a single column. Pass it explicitly to pin the output width.

    Returns
    -------
    numpy.ndarray
        The one-hot encoded labels.
    """
    return tf.keras.utils.to_categorical(y, num_classes=num_classes)


In [None]:
# NOTE: machine-specific absolute path; point this at your local copy of the data.
CREDIT_CARD_CSV = "/Users/Matt/Documents/GitHub/Credit-Card-Fraud-Detection/creditcard.csv"

# Load the transactions and preview the first few rows.
credit_card_df = pd.read_csv(CREDIT_CARD_CSV)
credit_card_df.head()

In [None]:
# Pick out the valid transactions (Class == 0); everything else is fraudulent.

is_valid = credit_card_df['Class'] == 0.0

valid_transactions = credit_card_df.loc[is_valid]
fraudulent_transactions = credit_card_df.loc[~is_valid]

## For example, the fraudulent rows:

fraudulent_transactions

In [None]:
# Create a more balanced data set. This sample will have a ~ 75 / 25 split of valid and fraudulent transactions.
# Randomly sample 1500 valid transactions and add all 492 fraudulent transactions, for a total of 1992.
# The sample is seeded (like the splits below) so that Restart & Run All reproduces the same data.

valid_sample = valid_transactions.sample(1500, random_state = 440)

sample_data = pd.concat([valid_sample, fraudulent_transactions])

# Set aside training/test data (80 / 20).

sample_data_train = sample_data.sample(frac = .80, random_state = 440)
sample_data_test = sample_data.drop(sample_data_train.index)

# Split each frame into features (every column but the last) and labels
# (the last column, expected to be Class), as 2-D numpy arrays.
# The training data:

X_train = np.array(sample_data_train.iloc[:, :-1]).reshape(-1, 30)
y_train = np.array(sample_data_train.iloc[:, -1]).reshape(-1, 1)

# And the test data:

X_test = np.array(sample_data_test.iloc[:, :-1]).reshape(-1, 30)
y_test = np.array(sample_data_test.iloc[:, -1]).reshape(-1, 1)

In [None]:
# Further split the training data into train_train/validation.

from sklearn.model_selection import train_test_split

# Shuffled 80 / 20 split, stratified on the labels and seeded.
X_train_train, X_val, y_train_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=.2,
    shuffle=True,
    stratify=y_train,
    random_state=440,
)

# Check out the shape and type of each array.

for split_array in (X_train_train, y_train_train, X_val, y_val):
    print(np.shape(split_array), type(split_array))

print('\n\n')

# Same check for the one-hot encoded labels.
for label_array in (y_train_train, y_val):
    encoded = to_categorical(label_array)
    print(np.shape(encoded), type(encoded))

In [None]:
## Empty Sequential model (a linear stack of layers with nothing in it yet).
model = models.Sequential()

In [None]:
## Build the network in one go so this cell can be re-run safely: `model` is
## rebound to a fresh Sequential every time instead of stacking extra layers
## onto an existing one. Make sure the input_shape matches the number of features.

model = models.Sequential([
    # Two hidden layers of 32 ReLU units; the first declares the 30 input features.
    layers.Dense(32, activation='relu', input_shape=(30,)),
    layers.Dense(32, activation='relu'),
    # Two softmax outputs: one probability per class (valid / fraudulent).
    layers.Dense(2, activation='softmax'),
])

In [None]:
# Print a summary of the model's layers.

model.summary()

In [None]:
# Compile the model: RMSprop optimizer, binary cross-entropy loss,
# and accuracy as the reported metric.

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Fit the model on the train_train split, evaluating on the validation
# split after every epoch. Labels are one-hot encoded before being passed in.

train_targets = to_categorical(y_train_train)
validation_set = (X_val, to_categorical(y_val))

history = model.fit(
    X_train_train,
    train_targets,
    batch_size=512,
    epochs=100,
    validation_data=validation_set,
)

In [None]:
# Examine the results: the values recorded during fitting, keyed by name.

history_dict = history.history

recorded_keys = history_dict.keys()
print(recorded_keys)

In [None]:
# That's... interesting.

set_style("whitegrid")

# Derive the x-axis from the recorded history instead of hard-coding 100,
# so the plot stays correct if the number of epochs changes.
train_accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']
accuracy_epochs = range(1, len(train_accuracy) + 1)

plt.figure(figsize = (10,6))

plt.scatter(accuracy_epochs, train_accuracy, label = "Training Accuracy")
plt.scatter(accuracy_epochs, val_accuracy, label = "Validation Set Accuracy")

plt.xlabel("Epoch", fontsize=18)
plt.ylabel("Accuracy", fontsize=18)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.legend(fontsize=18)

plt.show()

In [None]:
# Training vs. validation loss per epoch.
# The x-axis is derived from the recorded history instead of hard-coding 100,
# so the plot stays correct if the number of epochs changes.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']
loss_epochs = range(1, len(train_loss) + 1)

plt.figure(figsize = (10,6))

plt.scatter(loss_epochs, train_loss, label = "Training Loss")
plt.scatter(loss_epochs, val_loss, label = "Validation Set Loss")

plt.xlabel("Epoch", fontsize=18)
plt.ylabel("Loss Function Value", fontsize=18)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.legend(fontsize=18)

plt.show()

In [None]:
# How many test rows fall in each class.

test_class_counts = sample_data_test.value_counts('Class')
test_class_counts

In [None]:
# Prediction: run the trained model on the held-out test features.

test_predictions = model.predict(X_test)
test_predictions

In [None]:
# Last column of the test set (the labels used for y_test), for comparison with the predictions.
test_labels = sample_data_test.iloc[:, -1]
test_labels