In [22]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.ensemble import IsolationForest
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames rather than NumPy arrays
set_config(transform_output='pandas')

In [6]:
# Read the credit-card CSV from the Data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer='Data/credit_card.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [7]:
# Summarise the columns: dtype and non-null count for each feature
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      10000 non-null  float64
 1   V2      10000 non-null  float64
 2   V3      10000 non-null  float64
 3   V4      10000 non-null  float64
 4   V5      10000 non-null  float64
 5   V6      10000 non-null  float64
 6   V7      10000 non-null  float64
 7   V8      10000 non-null  float64
 8   V9      10000 non-null  float64
 9   V10     10000 non-null  float64
 10  V11     10000 non-null  float64
 11  V12     10000 non-null  float64
 12  V13     10000 non-null  float64
 13  V14     10000 non-null  float64
 14  V15     10000 non-null  float64
 15  V16     10000 non-null  float64
 16  V17     10000 non-null  float64
 17  V18     10000 non-null  float64
 18  V19     10000 non-null  float64
 19  V20     10000 non-null  float64
 20  V21     10000 non-null  float64
 21  V22     10000 non-null  float64
 22 

In [11]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

# 1. KMeans:

In [12]:
# Work on an independent (deep) copy so the original df stays untouched
df_kmeans = df.copy(deep=True)

In [13]:
# Standardise every feature: learn the scaling parameters first, then apply them
scaler = StandardScaler().fit(df_kmeans)
X_scaled = pd.DataFrame(scaler.transform(df_kmeans), columns=df.columns)
X_scaled.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.726092,-0.270865,1.38998,0.762227,-0.236899,0.264052,0.285346,0.131083,-0.384443,0.28612,...,0.370795,0.043066,0.689788,-0.163478,0.083851,0.099198,-0.532722,0.312929,-0.090009,0.459472
1,0.94981,-0.01302,-0.647336,0.11071,0.102482,-0.155239,-0.009963,0.120253,-0.915428,0.051569,...,-0.167706,-0.18551,-0.760673,0.268858,-0.603364,0.188884,0.033761,-0.039088,0.0435,-0.333835
2,-0.725138,-1.235035,0.733929,0.06281,-0.377373,1.293991,0.797185,0.249752,-1.995239,0.392444,...,0.830447,0.336466,1.471338,1.918734,-1.193707,-0.959876,-0.442774,-0.153601,-0.234422,1.69611
3,-0.467623,-0.356408,0.750937,-0.80803,0.042565,0.868121,0.283501,0.353112,-1.885794,0.153501,...,-0.401179,-0.056083,0.258431,-0.326493,-2.015264,1.303733,-0.591732,0.137995,0.217896,0.318445
4,-0.593701,0.452248,0.54094,0.079101,-0.295578,-0.018016,0.613062,-0.163028,0.004829,0.888749,...,0.634808,0.052846,1.513434,-0.218569,0.209441,-0.677493,0.71064,0.524976,0.791442,0.029532


In [14]:
# Build a 3-cluster KMeans model (random_state=42 for reproducibility) and fit it on the scaled features
Kmeans = KMeans(random_state=42, n_clusters=3, n_init='auto')
Kmeans.fit(X=X_scaled)

In [18]:
# Attach each row's assigned cluster label as a new 'cluster' column
df_kmeans = df_kmeans.assign(cluster=Kmeans.labels_)
df_kmeans.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,cluster
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,2
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,2
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,2
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,2
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,2


In [21]:
# Wrap the fitted cluster centers in a DataFrame labelled with the feature names
cluster_centers = pd.DataFrame(
    data=Kmeans.cluster_centers_,
    columns=X_scaled.columns,
)
cluster_centers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-0.733373,0.614894,0.258886,0.531868,0.219655,0.238605,0.128409,-0.380314,0.343987,0.452917,...,0.132644,0.097765,0.024092,0.099802,0.018579,-0.634795,-0.172748,-0.163275,-0.312725,-0.051803
1,0.350063,-0.280681,-0.018162,-0.114395,-0.090054,-0.017565,-0.214025,0.106396,0.47703,-0.264719,...,-0.092215,-0.099574,-0.1116,0.012146,-0.002878,0.258292,0.243903,-0.04499,0.0872,0.010428
2,-0.025225,0.006732,-0.109574,-0.138477,-0.009073,-0.100062,0.176078,0.071372,-0.708874,0.070121,...,0.037044,0.062828,0.113336,-0.063763,-0.006094,0.028424,-0.187398,0.132547,0.059011,0.01429


In [23]:
# Euclidean distance from every scaled data point (rows) to every cluster center (columns)
distances = cdist(X_scaled, Kmeans.cluster_centers_, metric='euclidean')

In [24]:
# Sanity check: expect one row per data point and one column per cluster center
distances.shape

(10000, 3)

In [26]:
# Put the distance matrix in a DataFrame with one labelled column per cluster
cluster_cols = [
    f"Distance (Cluster {c})"
    for c in range(Kmeans.cluster_centers_.shape[0])
]
distance_df = pd.DataFrame(data=distances, columns=cluster_cols)
distance_df.head(n=3)

Unnamed: 0,Distance (Cluster 0),Distance (Cluster 1),Distance (Cluster 2)
0,4.127849,4.695017,2.825513
1,4.287068,3.583366,2.40011
2,7.190748,7.558619,6.579811


In [29]:
# For each point, keep only the distance to its nearest cluster center
min_distances = distances.min(axis=1)

In [36]:
# Cut-off distance: the 99.6th percentile of the nearest-center distances
threshold = np.percentile(a=min_distances, q=99.6)
threshold

20.678582375061943

In [37]:
# Boolean mask: True where a point lies farther from its nearest center than the threshold
filter_anomalies = np.greater(min_distances, threshold)
# Number of points flagged as anomalies
filter_anomalies.sum()

40

In [44]:
# Row indices of the points flagged as anomalous by the distance threshold
kmeans_anomalies = X_scaled.index[filter_anomalies]
kmeans_anomalies

Int64Index([ 159, 1376, 1619, 2156, 2212, 2439, 2594, 2654, 2756, 2911, 2914,
            2917, 2923, 3443, 5303, 5412, 5413, 5529, 5674, 5704, 5764, 5977,
            6489, 6643, 6672, 7322, 7338, 7470, 7596, 7597, 8124, 8163, 8437,
            8442, 8856, 8939, 8999, 9071, 9304, 9326],
           dtype='int64')

# 2. Isolation Forest:

In [39]:
# Instantiate the model with a contamination of 0.004 (0.4% of rows will be flagged as anomalous,
# the same share as the 99.6th-percentile cut-off used in the KMeans section)
iso = IsolationForest(contamination=0.004, random_state = 42)
# Fit the model on the raw NumPy values (.values) to avoid a warning
iso.fit(df.values)

In [41]:
# Score every row with the fitted Isolation Forest
predictions = iso.predict(df.to_numpy())

In [42]:
# Re-encode the Isolation Forest output to match the KMeans convention:
#   0 = not an anomaly (model output  1)
#   1 = anomaly        (model output -1)
# The labels are derived from a fresh iso.predict call instead of overwriting
# `predictions` in place, so re-running this cell always gives the same result
# (an in-place remap would turn every label into 0 on a second run).
predictions = np.where(iso.predict(df.values) == -1, 1, 0)

In [45]:
# Row indices of the points that IsolationForest labelled as anomalies (label 1)
iso_anomalies = df.index[predictions == 1]
iso_anomalies

Int64Index([ 159, 1619, 2156, 2756, 2858, 2914, 2917, 2923, 5303, 5412, 5413,
            5704, 6311, 6489, 6581, 6595, 6634, 6643, 6672, 6757, 6761, 6798,
            6829, 7338, 7470, 7596, 7597, 8124, 8163, 8437, 8442, 8627, 8645,
            8667, 8670, 8856, 8999, 9071, 9304, 9326],
           dtype='int64')

# 3. Compare the list of anomalies from KMeans and Isolation Forest.  

In [47]:
# List the anomalies flagged by both methods, keeping the IsolationForest ordering
both = iso_anomalies[iso_anomalies.isin(kmeans_anomalies)].tolist()
both

[159,
 1619,
 2156,
 2756,
 2914,
 2917,
 2923,
 5303,
 5412,
 5413,
 5704,
 6489,
 6643,
 6672,
 7338,
 7470,
 7596,
 7597,
 8124,
 8163,
 8437,
 8442,
 8856,
 8999,
 9071,
 9304,
 9326]

In [49]:
# Number of anomalies that both methods agree on
len(both)

27

Q1- How many anomalies did the two approaches agree on?
    
    - 27 anomalies

In [51]:
# Calculate the percentage of anomalies that the two approaches agreed on.
# The counts are computed from the results instead of being typed in by hand,
# so the answer stays correct if the threshold, contamination or data change.
# Denominator: the number of anomalies flagged by KMeans (each method flags
# the same number of rows in this notebook).
anomalie_number = len(kmeans_anomalies)
# Guard against division by zero when no anomalies were flagged
percentage = (len(both) * 100) / anomalie_number if anomalie_number else 0.0
percentage

67.5

Q2- What percentage of the anomalies did the two approaches agree on?
    
    - 67.5 % (27 of the 40 anomalies flagged by each method)