In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [161]:
# This notebook will help to illustrate some of the transformations we studied during session 2
# We will start reading the dataset. It is quite big (6 million rows) so it will take 10 to 15 seconds to load
# Please be patient!
d = pd.read_csv('fraud_transactions.csv')

In [162]:
# Let's have a look to the contents
d

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1080168,129,CASH_IN,147637.68,C1109456046,20712.00,168349.68,C1755346629,48166.69,0.00,0,0
1080169,129,CASH_OUT,230278.62,C640550315,168349.68,0.00,C274837916,694270.90,924549.53,0,0
1080170,129,CASH_OUT,438213.13,C1149108861,21224.00,0.00,C860418160,31534.35,225458.74,0,0
1080171,129,PAYMENT,18400.57,C1908167981,0.00,0.00,M888953392,0.00,0.00,0,0


In [163]:
# And see how the main statistic for the numeric columns
d.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,1080173.0,1080173.0,1080173.0,1080173.0,1080173.0,1080173.0,1080173.0,1080173.0
mean,29.51987,158583.4,868453.6,887875.5,975309.1,1110079.0,0.001372002,0.0
std,21.42796,269405.0,2949583.0,2985993.0,2295034.0,2414481.0,0.03701515,0.0
min,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.0,11894.75,0.0,0.0,0.0,0.0,0.0,0.0
50%,21.0,74872.37,15973.78,0.0,122076.6,212563.3,0.0,0.0
75%,40.0,212912.4,134750.0,171742.9,911944.2,1142936.0,0.0,0.0
max,129.0,10000000.0,38939420.0,38946230.0,42054660.0,42169160.0,1.0,0.0


In [164]:
# We are dropping here some columns. I will drop isFlaggedFraud because it does not belong to the problem
# Why do you think I dropped 'nameOrig' and 'nameDest' ?
d = d.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])
d

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.00,160296.36,0.00,0.00,0
1,1,PAYMENT,1864.28,21249.00,19384.72,0.00,0.00,0
2,1,TRANSFER,181.00,181.00,0.00,0.00,0.00,1
3,1,CASH_OUT,181.00,181.00,0.00,21182.00,0.00,1
4,1,PAYMENT,11668.14,41554.00,29885.86,0.00,0.00,0
...,...,...,...,...,...,...,...,...
1080168,129,CASH_IN,147637.68,20712.00,168349.68,48166.69,0.00,0
1080169,129,CASH_OUT,230278.62,168349.68,0.00,694270.90,924549.53,0
1080170,129,CASH_OUT,438213.13,21224.00,0.00,31534.35,225458.74,0
1080171,129,PAYMENT,18400.57,0.00,0.00,0.00,0.00,0


In [165]:
# Check if there are null values
d.isna().sum()

step              0
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64

In [166]:
# I will encode the column 'type' assigning and arbitrary numerical value to each transaction type
# First, check the different values of column type
d['type'].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [167]:
# Now use tranform to apply the function encode_type
def encode_type(x):
    if x == 'PAYMENT':
        return 1
    if x == 'TRANSFER':
        return 2
    if x == 'CASH_OUT':
        return 3
    if x == 'DEBIT':
        return 4
    if x == 'CASH_IN':
        return 5
    
d['type'] = d['type'].transform(encode_type)
d

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,1,9839.64,170136.00,160296.36,0.00,0.00,0
1,1,1,1864.28,21249.00,19384.72,0.00,0.00,0
2,1,2,181.00,181.00,0.00,0.00,0.00,1
3,1,3,181.00,181.00,0.00,21182.00,0.00,1
4,1,1,11668.14,41554.00,29885.86,0.00,0.00,0
...,...,...,...,...,...,...,...,...
1080168,129,5,147637.68,20712.00,168349.68,48166.69,0.00,0
1080169,129,3,230278.62,168349.68,0.00,694270.90,924549.53,0
1080170,129,3,438213.13,21224.00,0.00,31534.35,225458.74,0
1080171,129,1,18400.57,0.00,0.00,0.00,0.00,0


In [151]:
# At this point you should try one hot encoding column 'type' and
# check if the model quality improves.
d = pd.get_dummies(d, prefix = ['one_hot'], columns = ['type'])
d

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,one_hot_1,one_hot_2,one_hot_3,one_hot_4,one_hot_5
0,1,9839.64,170136.00,160296.36,0.00,0.00,0,1,0,0,0,0
1,1,1864.28,21249.00,19384.72,0.00,0.00,0,1,0,0,0,0
2,1,181.00,181.00,0.00,0.00,0.00,1,0,1,0,0,0
3,1,181.00,181.00,0.00,21182.00,0.00,1,0,0,1,0,0
4,1,11668.14,41554.00,29885.86,0.00,0.00,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1080168,129,147637.68,20712.00,168349.68,48166.69,0.00,0,0,0,0,0,1
1080169,129,230278.62,168349.68,0.00,694270.90,924549.53,0,0,0,1,0,0
1080170,129,438213.13,21224.00,0.00,31534.35,225458.74,0,0,0,1,0,0
1080171,129,18400.57,0.00,0.00,0.00,0.00,0,1,0,0,0,0


In [152]:
# Insert here the code to standardize the dataest values and check
# if the model performance improves
for column in ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']:
    d[column] = (d[column] - d[column].mean()) / d[column].std()
d

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,one_hot_1,one_hot_2,one_hot_3,one_hot_4,one_hot_5
0,1,-0.552120,-0.236751,-0.243664,-0.424965,-0.459759,0,1,0,0,0,0
1,1,-0.581723,-0.287229,-0.290855,-0.424965,-0.459759,0,1,0,0,0,0
2,1,-0.587971,-0.294371,-0.297347,-0.424965,-0.459759,1,0,1,0,0,0
3,1,-0.587971,-0.294371,-0.297347,-0.415736,-0.459759,1,0,0,1,0,0
4,1,-0.545332,-0.280345,-0.287338,-0.424965,-0.459759,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1080168,129,-0.040629,-0.287411,-0.240967,-0.403978,-0.459759,0,0,0,0,0,1
1080169,129,0.266124,-0.237357,-0.297347,-0.122455,-0.076840,0,0,0,1,0,0
1080170,129,1.037953,-0.287237,-0.297347,-0.411225,-0.366381,0,0,0,1,0,0
1080171,129,-0.520342,-0.294433,-0.297347,-0.424965,-0.459759,0,1,0,0,0,0


In [153]:
# Check how balanced is the dataset
is_fraud = d[d.isFraud == 1]
is_not_fraud = d[d.isFraud == 0]

print(len(is_fraud))
print(len(is_not_fraud))

1482
1078691


In [156]:
# The dataset is very unbalanced. 
# With this code we will undersample the dataset and get balanced classes

# index property contains the indexes of the not_fraud dataset
is_not_fraud_indexes = is_not_fraud.index

# get as many fraud indexes (randomly) as non fraud occurrences
np.random.seed(42)
random_non_fraud_indexes = np.random.choice(is_not_fraud_indexes, len(is_fraud))

# Keep those entries in not fraud
is_not_fraud = d.loc[random_non_fraud_indexes]

# We will form the balanced dataset concatenating fraud and non_fraud
bln = is_fraud.append(is_not_fraud)

# Check the result
print(len(bln[bln.isFraud == 1]))
print(len(bln[bln.isFraud == 0]))

1482
1482


  bln = is_fraud.append(is_not_fraud)


In [168]:
# To see how the model performs with the unbalanced dataset 
# uncomment the next line
bln = d

In [169]:
# The regular learning process
X = bln.copy()
Y = bln['isFraud']
X = bln.drop(['isFraud'], axis=1)

trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.2, random_state=42)

#trainX = X 
#trainY = Y

#testX = d.drop(columns=['isFraud'])
#testY = d['isFraud']

clf = linear_model.LogisticRegression()
clf.fit(trainX, trainY)
predY = clf.predict(testX)

# We measure the quality of our model using two scores:
# - Accuracy: number of correct predictions divided by the number of samples
# - AUC: area under the curve. This will be explained in session 4
# The best measure for classification problems is AUC. If you try with the unbalanced dataset
# you will get an almost perfect accuracy but a rather low AUC. Can you explain how is this possible?
# (This will be explained in detail in session 4 as well)
print(accuracy_score(testY, predY))
print(roc_auc_score(testY, predY))

0.9987455736339019
0.7042735128305146


In [170]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(testY, predY)
cm

array([[215636,     86],
       [   185,    128]])