# Credit-Fraud Detection (Unbalanced Dataset)

### This notebook uses the highly unbalanced Credit Card Fraud Detection dataset provided on Kaggle (https://www.kaggle.com/dalpozz/creditcardfraud)

The implementation is done by:
### Nishchal Gaba (nishgaba9@gmail.com)
(October, 2017)

#### NOTE: The implementation deliberately deviates in places from the research papers cited below, in order to test the effect of those variations for research purposes

In [None]:
# For example, 'protected division' usually returns 1 when the denominator is 0, whereas the paper on which 'Improving Fitness Functions...' is based defines protected division to return 0 when the denominator is 0

We test two approaches taken from research papers published on such highly unbalanced datasets:

### 1> Improving Fitness Functions in Genetic Programming for Classification on Unbalanced Credit Card Datasets
(Cao, V. L., Le-Khac, N. A., Nicolau, M., ONeill, M., & McDermott, J. (2017). Improving Fitness Functions in Genetic Programming for Classification on Unbalanced Credit Card Datasets. arXiv preprint arXiv:1704.03522.)

### 2> Scalable Twin Neural Networks for Classification of Unbalanced Data
(Pant, H., Soman, S., & Sharma, M. (2017). Scalable Twin Neural Networks for Classification of Unbalanced Data. arXiv preprint arXiv:1705.00347.)

In [39]:
import numpy as np
import matplotlib
import sys
import pandas as pd
import time
import math
import matplotlib.pyplot as plt
import random
import tpot as tp
from sklearn.cross_validation import train_test_split
from gplearn import genetic
import gplearn as gp
import itertools
%matplotlib inline

## Importing and checking the dataset

In [2]:
# Load the Kaggle credit-card transactions; "creditcard.csv" is read relative to the current working directory
dataSet = pd.read_csv("creditcard.csv")
dataSet.head()

# 30 features (Time, Amount and the 28 PCA components V1..V28), followed by the 'Class' label column

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Checking how many rows fall into each class
# The positive cases for our tests are the 492 frauds (Class == 1), compared to 284315 genuine transactions (Class == 0)
dataSet['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

# Implementing the Genetic Algorithm with improved fitness functions

In [4]:
# --- Genetic-programming run settings ---

# Scale of the evolutionary search
pSize = 500      # population size (individuals per generation)
numGen = 1000    # number of generations
tSize = 3        # tournament size used for selection

# Genetic-operator probabilities
pCross = 0.9     # crossover probability
pMut = 0.1       # mutation probability

In [5]:
# Creating an empty DataFrame, with the same column names as the original data, to hold the normalized values
normDataSet = pd.DataFrame(columns=dataSet.columns)

In [6]:
# Normalizing the first 30 columns (the features: Time, V1..V28, Amount) using min-max normalization
#   normalized value = (value - min(attribute)) / (max(attribute) - min(attribute))
# Series.min()/.max() are vectorised and evaluated once per column, instead of
# walking every row in Python with the builtin min()/max() three times per column.
for feature in dataSet.columns[:30]:
    column = dataSet[feature]
    lowest, highest = column.min(), column.max()
    normDataSet[feature] = (column - lowest) / (highest - lowest)

In [7]:
# The class label is copied across unchanged; only the feature columns are scaled
normDataSet['Class']=dataSet['Class']
# Preview the first rows only, rather than rendering the whole frame
normDataSet.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.000000,0.935192,0.766490,0.881365,0.313023,0.763439,0.267669,0.266815,0.786444,0.475312,...,0.561184,0.522992,0.663793,0.391253,0.585122,0.394557,0.418976,0.312697,0.005824,0
1,0.000000,0.978542,0.770067,0.840298,0.271796,0.766120,0.262192,0.264875,0.786298,0.453981,...,0.557840,0.480237,0.666938,0.336440,0.587290,0.446013,0.416345,0.313423,0.000105,0
2,0.000006,0.935217,0.753118,0.868141,0.268766,0.762329,0.281122,0.270177,0.788042,0.410603,...,0.565477,0.546030,0.678939,0.289354,0.559515,0.402727,0.415489,0.311911,0.014739,0
3,0.000006,0.941878,0.765304,0.868484,0.213661,0.765647,0.275559,0.266803,0.789434,0.414999,...,0.559734,0.510277,0.662607,0.223826,0.614245,0.389197,0.417669,0.314371,0.004807,0
4,0.000012,0.938617,0.776520,0.864251,0.269796,0.762975,0.263984,0.268968,0.782484,0.490950,...,0.561327,0.547271,0.663392,0.401270,0.566343,0.507497,0.420561,0.317490,0.002724,0
5,0.000012,0.951057,0.777393,0.857187,0.244472,0.768550,0.262721,0.268257,0.788178,0.443190,...,0.558122,0.483915,0.665042,0.332185,0.564839,0.442749,0.421196,0.314769,0.000143,0
6,0.000023,0.979184,0.768746,0.838200,0.305241,0.767008,0.265762,0.265324,0.786257,0.478797,...,0.558776,0.497402,0.663145,0.277122,0.620014,0.383429,0.417148,0.313229,0.000194,0
7,0.000041,0.947348,0.782220,0.856031,0.230111,0.772104,0.267324,0.272183,0.744539,0.483978,...,0.592809,0.462660,0.666288,0.294686,0.554597,0.417014,0.394234,0.291099,0.001588,0
8,0.000041,0.943101,0.770278,0.835452,0.239894,0.783688,0.300439,0.267610,0.794515,0.449275,...,0.560296,0.497525,0.662401,0.518546,0.598855,0.362697,0.416728,0.316014,0.003628,0
9,0.000052,0.952547,0.779072,0.855511,0.242081,0.769078,0.260539,0.269325,0.786131,0.437401,...,0.557499,0.480466,0.663640,0.330349,0.573992,0.440836,0.421056,0.314810,0.000143,0


In [56]:
# Conditional-if primitive.
# Adds a non-arithmetic operator to the function set so that the evolved
# decision boundaries are not restricted to smooth ones.
def cond_if(arg1, arg2, arg3):
    """Element-wise selection driven by the sign of ``arg1``.

    Returns the value from ``arg2`` wherever ``arg1`` is negative and the
    value from ``arg3`` everywhere else (including where ``arg1`` is 0).
    """
    negative = arg1 < 0
    chosen = np.where(negative, arg2, arg3)
    return chosen
    
# Protected division primitive: yields 0 instead of dividing by a (near-)zero denominator
def p_div(x1,x2):
    """Element-wise ``x1 / x2`` that is safe against tiny denominators.

    Wherever ``|x2|`` is greater than 0.001 the ordinary quotient is returned;
    everywhere else the result is 0.  NumPy divide/invalid warnings raised
    while evaluating the unprotected quotient are suppressed.
    """
    with np.errstate(invalid='ignore', divide='ignore'):
        usable = np.abs(x2) > 0.001
        quotient = np.divide(x1, x2)
        return np.where(usable, quotient, 0.)

In [57]:
# Wrapping the two custom primitives with gplearn's make_function so that they
# can be listed in an estimator's function_set
cif = gp.functions.make_function(
    function=cond_if,
    name='cif',
    arity=3,
)
pdiv = gp.functions.make_function(
    function=p_div,
    name='pdiv',
    arity=2,
)

In [82]:
# Creating the new Errors mean fitness function
# f_errors_mean = (TP)/(TP+FN) + TN/(TN+FP) + (1-Err_mean_min)+(1-Err_mean_maj)
# TP = True Positive, TN = True Negative, FP = False Positive, FN = False Negative, Err_mean_min = Mean Error Minority class, Err_mean_maj = Mean Error Majority Class
def _fem(y,y_pred,w):
    TP = len([x for x,z in zip(y_pred,y) if ((x<=0) and (z==0))])
    TN = len([x for x,z in zip(y_pred,y) if ((x>0) and (z==1))])
    FP = len([x for x,z in zip(y_pred,y) if ((x<=0) and (z==1))])
    FN = len([x for x,z in zip(y_pred,y) if ((x>0) and (z==0))])
    err_min = p_div(np.sum(np.abs(x) for x,z in zip(y_pred,y) if ((x>0) and (z==0))),FN)
    err_maj = p_div(np.sum(np.abs(x) for x,z in zip(y_pred,y) if ((x<=0) and (z==1))),FP)   
    f = p_div(TP,(TP + FN))+p_div(TN,(TN+FP))+(1-err_min)+(1-err_maj)
    return f


In [83]:
# Wrap _fem as a gplearn fitness metric (higher scores are better).
# Both arguments are passed by keyword so the call does not rely on positional order.
fem = gp.fitness.make_fitness(function=_fem, greater_is_better=True)

In [84]:
# Genetic-programming estimator scored with the custom errors-mean fitness.
# Only crossover and subtree mutation are enabled; hoist and point mutation are switched off.
# random_state is fixed so that repeated runs evolve the same programs.
est_gp = genetic.SymbolicRegressor(
    metric=fem,
    population_size=pSize,
    generations=numGen,
    tournament_size=tSize,
    p_crossover=pCross,
    p_subtree_mutation=pMut,
    p_hoist_mutation=0.0,
    p_point_mutation=0.0,
    function_set=['add', 'sub', 'mul', pdiv, cif],
    random_state=0,
    verbose=1,
)

In [85]:
# Sampling 70% of the dataset randomly (seeded, so the same rows are drawn on every run)
tot_train = normDataSet.sample(frac = 0.7, random_state = 0)
# Getting the gp-tree for the training data, first 30 columns as values and 31st as the Class Label
est_gp.fit(tot_train.iloc[:,0:30],tot_train['Class'])

    |    Population Average   |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


KeyboardInterrupt: 

# Implementing the Twin Neural Networks

In [9]:
# Work in progress, to be updated soon