In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [2]:
# Load the dataset
data = pd.read_csv('creditcard.csv')


In [3]:
#duplicate values checking
data.duplicated().sum()


183

In [4]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43659,41598,-0.538686,-0.635328,-0.041569,-3.022158,0.771776,3.556774,-1.41882,1.371999,-2.475053,...,-0.090409,-0.310254,0.16851,0.955413,-0.427495,-0.375891,0.05101,0.072416,60.01,0.0
43660,41599,-0.349615,-2.860571,0.297766,0.801212,-1.523995,1.16426,0.037484,0.32522,0.862825,...,0.31537,-0.503099,-0.514747,-0.19389,-0.327091,0.87582,-0.144908,0.125368,745.56,0.0
43661,41599,-3.001222,2.899766,0.726874,-0.729992,-0.312792,-0.735557,0.960944,-0.276141,2.128747,...,-0.64554,-0.613956,0.111663,0.338567,0.268357,0.076981,1.184716,0.491066,8.99,0.0
43662,41599,1.042342,-0.390001,-0.46368,-1.737393,0.450611,0.604354,0.068099,0.27803,0.555053,...,-0.137137,-0.286887,0.143336,-0.98681,0.135599,-0.185344,0.03559,-0.008304,48.0,0.0
43663,41600,1.193909,0.067328,0.609239,0.825453,-0.332038,-0.101522,-0.130782,-0.033697,0.424588,...,-0.04995,0.213908,-0.06875,0.182281,0.557841,0.4,,,,


In [6]:
#data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43664 entries, 0 to 43663
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    43664 non-null  int64  
 1   V1      43664 non-null  float64
 2   V2      43664 non-null  float64
 3   V3      43664 non-null  float64
 4   V4      43664 non-null  float64
 5   V5      43664 non-null  float64
 6   V6      43664 non-null  float64
 7   V7      43664 non-null  float64
 8   V8      43664 non-null  float64
 9   V9      43664 non-null  float64
 10  V10     43664 non-null  float64
 11  V11     43664 non-null  float64
 12  V12     43664 non-null  float64
 13  V13     43664 non-null  float64
 14  V14     43664 non-null  float64
 15  V15     43664 non-null  float64
 16  V16     43664 non-null  float64
 17  V17     43664 non-null  float64
 18  V18     43664 non-null  float64
 19  V19     43664 non-null  float64
 20  V20     43664 non-null  float64
 21  V21     43664 non-null  float64
 22

In [7]:
 #drops rows with null values
data.dropna(inplace=True)

In [8]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [9]:
data.isnull().values.any()

False

In [10]:
# Number of legitimate and fraudulent transactions in data
data['Class'].value_counts()


0.0    43529
1.0      134
Name: Class, dtype: int64

In [11]:
# Get the Fraud and normal dataset
fraud = data[data['Class'] == 1]
normal = data[data['Class'] == 0]


In [12]:
print(fraud.shape,normal.shape)

(134, 31) (43529, 31)


In [13]:
fraud['Amount'].describe()


count     134.000000
mean       93.928433
std       231.551248
min         0.000000
25%         1.000000
50%         6.455000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [14]:
normal['Amount'].describe()


count    43529.000000
mean        90.302800
std        238.465356
min          0.000000
25%          7.580000
50%         24.900000
75%         81.400000
max       7879.420000
Name: Amount, dtype: float64

In [15]:
# Take some sample of the data
data_sample= data.sample(frac = 0.1,random_state=1)
data_sample.shape

(4366, 31)

In [16]:
data.shape


(43663, 31)

In [17]:
# Determine the number of fraud and valid transactions in the dataset
Fraud = data_sample[data_sample['Class'] == 1]
Valid = data_sample[data_sample['Class'] == 0]

outlier_fraction = len(Fraud) / float(len(Valid))


In [19]:
print(outlier_fraction)
print("Fraud Cases : {}".format(len(Fraud)))
print("Valid Cases : {}".format(len(Valid)))

0.0036781609195402297
Fraud Cases : 16
Valid Cases : 4350


In [21]:
# Filter the columns to remove data we do not want
columns = [c for c in columns if c != "Class"]  # Use "Class" instead of "is_fraud"

In [22]:
# Store the variable we are predicting
target = "Class"  # Use "Class" instead of "is_fraud"

In [23]:
# Define a random state
state = np.random.RandomState(42)
X = data_sample[columns]
Y = data_sample[target]
X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))

In [24]:
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

(4366, 30)
(4366,)


In [25]:
#Define the outlier detection methods
classifiers = {
    "Isolation Forest":IsolationForest(n_estimators=100, max_samples=len(X),
                                       contamination=outlier_fraction,random_state=state, verbose=0),
    "Local Outlier Factor":LocalOutlierFactor(n_neighbors=20, algorithm='auto',
                                              leaf_size=30, metric='minkowski',
                                              p=2, metric_params=None, contamination=outlier_fraction),
}

type(classifiers)


dict

In [26]:
# We need to convert the strings to float otherwise we will get error hence we convert use labelencoder

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in X.columns:
    if X[column_name].dtype == object:
        X[column_name] = le.fit_transform(X[column_name])
    else:
        pass

In [31]:
n_outliers = len(Fraud)
for i, (clf_name,clf) in enumerate(classifiers.items()):
    #Fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)
    #Reshape the prediction values to 0 for Valid transactions , 1 for Fraud transactions
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # Run Classification Metrics
    print("{}: {}".format(clf_name,n_errors))
    print("Accuracy Score :")
    print(accuracy_score(Y,y_pred))



Isolation Forest: 19
Accuracy Score :
0.9956481905634448
Local Outlier Factor: 33
Accuracy Score :
0.9924415941365093
