## Anomaly (fraud) detection in credit cards
- A low precision score means more employee hour for investigating suspicious transactions and therefore more cost to the company.
- A low recall means that we miss fraudulent activities that need to be identified by the model.
- Minimizing false negatives (High recall score) is slightly more important to us than minimizing false positives (high precision score).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
file_path = '../Datasets/creditcard.csv'
df = pd.read_csv(file_path)
print(df.columns)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Any missing values?
df.isnull().values.any()

False

In [5]:
nNormal = df[df['Class'] == 0].shape[0]
nFraud = df[df['Class'] == 1].shape[0]
anomalyFraction = nFraud / nNormal
print('number of frauds: {}'.format(nFraud))
print('number of normal: {}'.format(nNormal))
print('Anomaly to normal ratio: {}'.format(anomalyFraction))

number of frauds: 492
number of normal: 284315
Anomaly to normal ratio: 0.0017304750013189597


In [6]:
y = df.Class.copy()
X = df.drop(['Class'], axis=1, inplace=False).copy()

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(199364, 30) (199364,)
(85443, 30) (85443,)


# Feature normalization

In [8]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Predict anomalies

- precision = TP / (TP+FP)
- recall = TP / (TP+FN)
- f-score = (2 x precision x recall) / (precision + recall)

In [9]:
print('Number of anomalies: {}'.format(sum(y_test == 1)))

Number of anomalies: 136


In [10]:
def printErrors(y_pred):
    print('Recall score', recall_score(y_test, y_pred))
    print('Precision score', precision_score(y_test, y_pred))

### OneClassSVM

In [11]:
model = OneClassSVM(kernel='linear', degree=2, gamma='scale', nu=0.1, max_iter=10)
model.fit(X_train)
y_pred = model.predict(X_test)
y_pred[y_pred == -1] = 0
printErrors(y_pred)



Recall score 0.03676470588235294
Precision score 0.0015571473061351605


### Isolation Forest

In [33]:
model = IsolationForest(n_estimators=50, max_samples=10000, contamination=anomalyFraction, max_features=0.1)
model.fit(X_train)
y_pred = model.predict(X_test)
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1
printErrors(y_pred)

Recall score 0.3088235294117647
Precision score 0.3088235294117647


### Local Outlier Factor (LOF)

In [54]:
model = LocalOutlierFactor(n_neighbors=6, algorithm='auto', leaf_size=10, 
                           metric='minkowski', p=1, metric_params=None, contamination=anomalyFraction)
# Since it is a neighborhood based model, we need to merge test and train data (my own understanding)
X_full = pd.concat([X_train, X_test], ignore_index=True)
y_pred = model.fit_predict(X_full)
y_pred = y_pred[-y_test.shape[0]:]
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1
printErrors(y_pred)

Recall score 0.1323529411764706
Precision score 0.125


# Undersampling

In [19]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=2)
X_train_small, y_train_small = ros.fit_resample(X_train, y_train)

In [20]:
y_train_small.shape

(712,)

In [62]:
# For grid search
def findBestParameters(model, train_X, train_y):
    modelName = model.__class__.__name__
    parameters = {}
    if modelName == 'SVC':
        parameters = {
            'C':[50, 25, 10, 5, 1],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto']}
        
    elif modelName == 'RandomForestClassifier':
        parameters = {
            'n_estimators': [10, 50, 100],
            'max_depth': [10, 25, None],
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [1, 2, 4]}
        
    elif modelName == 'MLPClassifier':
        parameters = {
            'hidden_layer_sizes': [(5), (10), (20), 
                        (5, 5), (10, 10), (20, 20),
                        (5, 5, 5), (10, 10, 10), (20, 20, 20)],
            'max_iter': [500, 1000]}
    elif modelName == 'LogisticRegression':
        parameters = {
            'C': [0.1, 1, 5, 10, 25, 50],
            'max_iter': [100, 500, 1000]}
    
    clf = GridSearchCV(model, param_grid=parameters, cv=5)
    clf.fit(train_X, train_y)
    print('Best parameters for', modelName, '\n', clf.best_params_)
    
    if modelName == 'SVC':
        return SVC(C=clf.best_params_['C'], 
                   gamma=clf.best_params_['gamma'], 
                   kernel=clf.best_params_['kernel'])
    elif modelName == 'RandomForestClassifier':
        return RandomForestClassifier(n_estimators=clf.best_params_['n_estimators'], 
                                      max_depth=clf.best_params_['max_depth'],
                                     max_features=clf.best_params_['max_features'], 
                                      min_samples_leaf=clf.best_params_['min_samples_leaf'])
    elif modelName == 'MLPClassifier':
        return MLPClassifier(hidden_layer_sizes=clf.best_params_['hidden_layer_sizes'], 
                             max_iter= clf.best_params_['max_iter'])
    elif modelName == 'LogisticRegression':
        return LogisticRegression(C=clf.best_params_['C'],
                                  max_iter= clf.best_params_['max_iter'])
    
    return np.nan

In [26]:
import warnings
warnings.filterwarnings('ignore')# dont show warnings
allModels = [LogisticRegression(), SVC(), MLPClassifier(), RandomForestClassifier()]
for i, m in enumerate(allModels):
    m = findBestParameters(m, X_train_small, y_train_small)
    m.probability = True # need this for changing thresholds later
    m.fit(X_train_small, y_train_small)
    allModels[i] = m
    print('')

Best parameters for LogisticRegression 
 {'C': 25}

Best parameters for SVC 
 {'C': 5, 'gamma': 'scale', 'kernel': 'poly'}

Best parameters for MLPClassifier 
 {'hidden_layer_sizes': (20, 20, 20), 'max_iter': 1000}

Best parameters for RandomForestClassifier 
 {'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}



In [23]:
for m in allModels:
    print(m.__class__.__name__)
    y_pred = m.predict(X_test)
    print('Precision score', precision_score(y_test, y_pred))
    print('Recall score', recall_score(y_test, y_pred))
    print('')

LogisticRegression
Precision score 0.09779179810725552
Recall score 0.9117647058823529

SVC
Precision score 0.0574400723654455
Recall score 0.9338235294117647

MLPClassifier
Precision score 0.045454545454545456
Recall score 0.9264705882352942

RandomForestClassifier
Precision score 0.06007568590350047
Recall score 0.9338235294117647



### Results:
It seems that the models are good at minimizing false negatives, but bad at minimizing false positives. The easy solution is to change the thresholds.

# Try oversampling

In [9]:
from imblearn.over_sampling import SMOTE, ADASYN
X_train_large, y_train_large = SMOTE().fit_resample(X_train, y_train)

In [10]:
y_train_large.shape

(398016,)

In [16]:
lr = LogisticRegression(C=1, max_iter=1000)
lr.fit(X_train_large, y_train_large)
y_pred = lr.predict(X_test)
print('Precision score', precision_score(y_test, y_pred))
print('Recall score', recall_score(y_test, y_pred))

Precision score 0.06206896551724138
Recall score 0.9264705882352942


### Results:
a slight improvement in recall at the cost of precision

# Change threshold

In [58]:
threshold = 0.99
for m in allModels:
    print(m.__class__.__name__)
    m.probability = True
    #y_pred = m.predict(X_test)
    y_pred = (m.predict_proba(X_test)[:,1] >= threshold).astype(int)
    print('Precision score', precision_score(y_test, y_pred))
    print('Recall score', recall_score(y_test, y_pred))
    print('')

LogisticRegression
Precision score 0.7441860465116279
Recall score 0.7058823529411765

SVC
Precision score 0.6298342541436464
Recall score 0.8382352941176471

MLPClassifier
Precision score 0.7394366197183099
Recall score 0.7720588235294118

RandomForestClassifier
Precision score 0.8613861386138614
Recall score 0.6397058823529411



### Results 
- The threshold can be set as a number in the range [0.80, 0.999] to get a good balance between recall and precision

# Ensembled
Use a two layer ensembled method: First layer is the four models getting part of the training set and predicting probabilities, instead of 0 and 1, of the sample belonging to a class. The second layer is a linear regression model using getting the output of those models and getting trained on the remainder of the training set.

In [60]:
X_train_layer1, X_train_layer2, y_train_layer1, y_train_layer2 = \
train_test_split(X_train_small, y_train_small, test_size=0.4, random_state=42)
print(y_train_layer1.shape)
print(y_train_layer2.shape)

(427,)
(285,)


In [65]:
warnings.filterwarnings('ignore')# dont show warnings
layer1_models = [LogisticRegression(), SVC(), MLPClassifier(), RandomForestClassifier()]
for i, m in enumerate(layer1_models):
    m = findBestParameters(m, X_train_layer1, y_train_layer1)
    m.probability = True # need this for layer 2
    m.fit(X_train_layer1, y_train_layer1)
    layer1_models[i] = m
    print('')

Best parameters for LogisticRegression 
 {'C': 50, 'max_iter': 100}

Best parameters for SVC 
 {'C': 50, 'gamma': 'scale', 'kernel': 'linear'}

Best parameters for MLPClassifier 
 {'hidden_layer_sizes': (20, 20, 20), 'max_iter': 1000}

Best parameters for RandomForestClassifier 
 {'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'n_estimators': 100}



In [71]:
df_layer2_inputs = pd.DataFrame()
for m in layer1_models:
    y_pred_temp = m.predict_proba(X_train_layer2)[:,0]
    df_layer2_inputs[m.__class__.__name__] = y_pred_temp
df_layer2_inputs.head()

Unnamed: 0,LogisticRegression,SVC,MLPClassifier,RandomForestClassifier
0,0.01114235,0.01737018,0.005716266,0.04275
1,4.662597e-05,9.284086e-09,0.0001303647,0.0
2,0.9695099,0.940683,0.9776284,0.913056
3,0.8796579,0.9170129,0.7701589,0.869274
4,1.51168e-12,3.000001e-14,3.740585e-07,0.0


In [72]:
layer2_model = LogisticRegression()
layer2_model.probability = True
layer2_model = findBestParameters(layer2_model, df_layer2_inputs, y_train_layer2)
layer2_model.fit(df_layer2_inputs, y_train_layer2)

Best parameters for LogisticRegression 
 {'C': 0.1, 'max_iter': 100}


LogisticRegression(C=0.1)

In [83]:
threshold = 0.85
df_layer2_inputs = pd.DataFrame()
for m in layer1_models:
    y_pred_temp = m.predict_proba(X_test)[:,0]
    df_layer2_inputs[m.__class__.__name__] = y_pred_temp
#y_pred = layer2_model.predict(df_layer2_inputs)
y_pred = (layer2_model.predict_proba(df_layer2_inputs)[:,1] >= threshold).astype(int)
print('Precision score', precision_score(y_test, y_pred))
print('Recall score', recall_score(y_test, y_pred))

Precision score 0.7692307692307693
Recall score 0.8088235294117647


### Results
The ensembled method showed better performance than any single method, but it still relies on the threshold to find the right balance between Recall and Precision.