In [35]:
import numpy as np
import pandas as pd
import os
os.environ["TF_CFF_MIN_LOG_LEVEL"]="3"
from sklearn.utils import shuffle
import keras


#importing keras, Dense, Dropout
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from pandas import set_option

#importing matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
    , precision_recall_curve, roc_curve, f1_score
    , fbeta_score

#importing the datasets using pan  das
df = pd.read_csv(r"C:\Users\EMMANUEL\Desktop\python\csv files\creditcardfraud.zip")

#printting the first 5 rows of our data
print(df.head(5))

#printing the datasets in an overview format that describes the data 
print(df.describe)

#printing the dimension of the data
print(df.shape)

#printing each data attribute
print(df.dtypes)

#Printing the correlation between attributes using corr()
set_option('display.width', 100)
set_option('precision', 2)
correlations = df.corr(method='pearson')
print(correlations)

#printing the skew of the data to get more accuracy using the skew() function
print(df.skew())

#Count the number of samples for each class
print(df.Class.value_counts())

#It is obvious that this dataset is highly unbalance
#it is easier to sort the datasets by "class" for stratified sampling(A statistical technique for sampling your data)
df.sort_values(by='Class', ascending=False, inplace=True)




IndentationError: unexpected indent (<ipython-input-35-bae5f6364f1c>, line 21)

In [36]:
#DATA PREPROCESSING: PREPARING THE DATA FOR PROCESSING
#Data preprocessing in Deep Learning mainly has a method known as Dropout method
#We are going to dropout the "Time" column. We donot need the time feature to determine whether the data was fraudulent or not

df.drop('Time', axis=1, inplace=True)

#Assign the first "3000" samples to new dataframe
df_sample = df.iloc[:3000, :]

#Count the number of samples for each class again 
print(df_sample.Class.value_counts())

#Randomly shuffle our dataset to remove all biasness
shuffle_df = shuffle(df_sample, random_state = 42)

KeyError: "['Time'] not found in axis"

In [None]:
#THE NEXT STEP IS DATA SPLICING WHICH WE WOULD SPLIT OUR DATA 
#INTO TRAINING DATASETS AND TESTING DATASETS
df_train = shuffle_df[0:2400]
df_test = shuffle_df[2400:]

#We are printing the train and testing datasets
print(df_train)
print(df_test)


In [25]:
#Split each dataframe into "feature" and "label" which means your input and your output
train_feature = np.array(df_train.values[:, 0:29])
train_label = np.array(df_train.values[:, -1])
test_feature = np.array(df_train.values[:, 0:29])
test_label = np.array(df_train.values[:, -1])

#Print out the size of train dataframe "Should be of size 2400*29"
print(train_feature.shape)

#Print out the size of the test dataframe "should be of size 2400*1"
print(test_label.shape)

(2400, 29)
(2400,)


In [26]:
#Standardization or Normalization of the features columns to increase the training speed
#Using MinMaxScaler() function
scaler = MinMaxScaler()
scaler.fit(train_feature)
train_feature_trans = scaler.transform(train_feature)
test_feature_trans = scaler.transform(test_feature)

#A function to plot the learning curves
def show_train_history(train_history, train, validation):
    plt.plot(train_history[train])
    plt.plot(train_history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
    

In [27]:
#Select the type of the model.(MODEL CREATION)
#We would be using the keras function which allows us to build a model by layers
model = Sequential()

#Add the first Dense layer with 200 neuron units and Relu activation function 
#A dense layer is standard layer in which all the nodes in the previous layer connec to the node in the current layer.
model.add(Dense(units=200,
                input_dim=29,
                kernel_initializer='uniform',
                activation='relu'))

#Add Dropout to prevent overfitting.Overfitting occurs when your model memorizes the training datasets(overfitting reduces the accuracy of the model)
model.add(Dropout(0.5))

#Add the second layer with 200 neuron units and Relu activation function
model.add(Dense(units=200,
                kernel_initializer='uniform',
               activation='relu'))

#Add Dropout method to prevent overfitting. 0.5 is a standard dropout value.
model.add(Dropout(0.5))

#Add an output layer with 1neuron unit and sigmoid activation function
model.add(Dense(units=1,
               kernel_initializer='uniform',
               activation='sigmoid'))

#Print ou the model summary
print(model.summary)

<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x0000028580A52B48>>


In [33]:
#Configure the learning process by selecting 'Binary cross tropy' as a loss
#ADAM(Defualt optimizer named as Adaptive Moment Estimation) as an optitimization function, and to optimize the 'Accuracy matrix'
#An optimizer takes care of the necesssary computation to change the networks weight and bytes.
model.compile(loss='binary crossentropy', optimizer='adam',
              metrics=['accuracy'])

#fit the model by pass 'train_feature_traans' as input for X, 'train_label'
#number of epochs = 200 and batch size = 500
#Batch size is used so we dont over fit our model, it splits the model into batches.
train_history = model.fit(x=train_feature_trans, y=train_label,
                         validation_split=0.8, epochs=200,
                         batch_size=500, verbose=2)

#Print out the accuracy curves for training and validation sets
show_train_history(train_history, 'acc', 'val_acc')

#Evaluation phase
#use the testing set to evaluate the model
scores = model.evaluate(test_feature_trans, test_label)

#Print out the accuracy
print('\n')
print('Accuracy', scores[1])

prediction = model.predict_classes(test_feature_trans)

df_ans = pd.DataFrame({'Real Class': test_label})
df_ans['Prediction'] = prediction

df_ans['Prediction'].value_counts()

df_ans['Real Class'].value_counts()

cols = ['Real_Class_1', 'Real_Class_0'] #Gold standard 
rows = ['Prediction_1', 'Prediction_0'] #Diagnostic tool (our prediction)

B1P1 = len(df_ans[(df_ans['Prediction'] == df_ans['Real Class']) & (df_ans['Prediction'])])
B1P0 = len(df_ans[(df_ans['Prediction'] != df_ans['Real Class']) & (df_ans['Prediction'])])
B0P1 = len(df_ans[(df_ans['Prediction'] != df_ans['Real Class']) & (df_ans['Prediction'])])
B0P0 = len(df_ans[(df_ans['Prediction'] == df_ans['Real Class']) & (df_ans['Prediction'])])

conf = np.array([[B1P1, B0P1], [B1P0, B0P0]])
df_cm = pd.DataFrame(conf, columns=[i for i in cols], index=[i for i in rows])
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(df_cm, annot=True, ax=ax, fmt='d')
plt.show()

#Making x label be on top is common in textbooks.
ax.xaxis.set_ticks_position('top')

print('Total number of test cases:', np.sum(conf))

#Model summary Function
def model_efficiency(conf):
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    

ValueError: Unknown loss function:binary crossentropy