For anomaly detection, an autoencoder is trained on normal transactions, learning to reconstruct them accurately. When an abnormal transaction (anomaly) is input to the autoencoder, it should struggle to reconstruct it, resulting in a high reconstruction error, which can then be flagged as an anomaly.

In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv(r"C:\Users\Neha\Downloads\creditcard.csv")

In [35]:
df.shape

(284807, 31)

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [6]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [7]:
x=df.drop(columns=['Class'],axis=1).values
y=df['Class'].values

In [8]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

x=scaler.fit_transform(x)

In [10]:
# Split the data into training (normal transactions) and test sets
x_train = x[y == 0]  # Use only normal transactions for training
x_test = x           # Use the entire dataset for testing

In [17]:
import tensorflow as tf
from tensorflow.keras import models,layers
from tensorflow.keras.optimizers import SGD

model=models.Sequential()

model.add(layers.Dense(64,activation='relu',input_shape=(x_train.shape[1],)))
model.add(layers.Dense(32,activation='relu'))

model.add(layers.Dense(16,activation='relu'))

model.add(layers.Dense(32,activation='relu'))
model.add(layers.Dense(64,activation='relu'))
model.add(layers.Dense(x_train.shape[1],activation='sigmoid'))

model.compile(optimizer='SGD',loss='binary_crossentropy',metrics=['accuracy'])

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
history=model.fit(x_train,x_train,epochs=10,batch_size=32)

Epoch 1/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - accuracy: 0.0419 - loss: 197.6706
Epoch 2/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - accuracy: 0.0336 - loss: 0.0108
Epoch 3/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.0300 - loss: 0.0134
Epoch 4/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.0320 - loss: 0.0095
Epoch 5/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.0285 - loss: 0.0048
Epoch 6/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.0310 - loss: 0.0087
Epoch 7/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.0255 - loss: 0.0117
Epoch 8/10
[1m8885/8885[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.0333 - loss: 0.0065
Epoch 9/10
[1

In [23]:
predicted=model.predict(x_test)

[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step


In [24]:
predicted

array([[1.6685967e-06, 2.5953033e-03, 1.2262256e-24, ..., 6.0326288e-10,
        4.5133395e-09, 1.1899756e-08],
       [1.6685967e-06, 2.5953033e-03, 1.2262256e-24, ..., 6.0326288e-10,
        4.5133395e-09, 1.1899756e-08],
       [1.6685967e-06, 2.5953033e-03, 1.2262256e-24, ..., 6.0326288e-10,
        4.5133395e-09, 1.1899756e-08],
       ...,
       [1.6685967e-06, 2.5953033e-03, 1.2262256e-24, ..., 6.0326288e-10,
        4.5133395e-09, 1.1899756e-08],
       [1.6685967e-06, 2.5953033e-03, 1.2262256e-24, ..., 6.0326288e-10,
        4.5133395e-09, 1.1899756e-08],
       [1.6685967e-06, 2.5953022e-03, 1.2262256e-24, ..., 6.0326288e-10,
        4.5133395e-09, 1.1899756e-08]], dtype=float32)

In [36]:
reconstruction_errors=np.mean(np.square(x_test-predicted),axis=1)

In [37]:
reconstruction_errors

array([ 1.56017526,  1.3743831 , 12.86928876, ...,  2.06241746,
        4.57706492,  1.18955686])

In [38]:
threshold = np.percentile(reconstruction_errors, 95)  # This means the top 5% of errors are considered anomalies

# Flag samples as anomalies if their reconstruction error exceeds the threshold
anomalies = reconstruction_errors > threshold

# Output results
print("Threshold for anomaly detection:", threshold)
print("Number of anomalies detected:", np.sum(anomalies))

Threshold for anomaly detection: 17.19244553504706
Number of anomalies detected: 14241
