In [2]:
import pandas as pd

# Original file is to heavy to be in this repo, so here is the download link:
# https://caelum-online-public.s3.amazonaws.com/1799-xgboost/02/creditcard.csv

data = pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
print(data.isna().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [4]:
n_transactions = data['Class'].count()
n_frauds = data['Class'].sum()
n_normal = n_transactions - n_frauds

frauds_perc = n_frauds / n_transactions
normal_perc = n_normal / n_transactions

print('Transactions number: ', n_transactions)
print('Frauds number: ', n_frauds, round(frauds_perc * 100, 2))
print('Non fraudulent transactions number: ', n_normal, round(normal_perc * 100, 2))

Transactions number:  284807
Frauds number:  492 0.17
Non fraudulent transactions number:  284315 99.83


In [5]:
from sklearn.model_selection import StratifiedShuffleSplit


def execute_validator(X, y):
    validator = StratifiedShuffleSplit(n_splits = 1, test_size=0.1, random_state=0)
    for train_id, test_id in validator.split(X, y):
        X_train, X_test = X[train_id], X[test_id]
        y_train, y_test = y[train_id], y[test_id]

    return X_train, X_test, y_train, y_test

In [6]:
%%time
from sklearn import tree

def execute_classifier(classifier, X_train, X_test, y_train):    
    tree = classifier.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    return y_pred


Wall time: 112 ms


In [7]:
import matplotlib.pyplot as plt

def save_tree(classifier, name):
    plt.figure(figsize=(200,100))
    tree.plot_tree(classifier, filled = True, fontsize=14)
    plt.savefig(name)
    plt.close()

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def validate_tree(y_test, y_pred):
    print(accuracy_score(y_test, y_pred))    
    print('Precision Score:',precision_score(y_test, y_pred))
    print('Recall Score:', recall_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))    

In [9]:
# Validator execution
X =  data.drop('Class', axis = 1).values
y = data['Class'].values

X_train, X_test, y_train, y_test = execute_validator(X, y)

In [10]:
# DecisionTreeClassifier execution
decision_tree_classifier = tree.DecisionTreeClassifier()

decision_tree_y_pred = execute_classifier(decision_tree_classifier, X_train, X_test, y_train)

In [11]:
# Creation of decision tree figure

save_tree(decision_tree_classifier, 'decision_tree_1.png')


In [12]:
# Decision tree validation
### confusion matrix
# [[a, b]
#  [c, d]]

### Precision d / (b + d)
### Recall: d / (d + c)

validate_tree(y_test, decision_tree_y_pred)

0.9990871107053826
Precision Score: 0.7254901960784313
Recall Score: 0.7551020408163265
[[28418    14]
 [   12    37]]


In [13]:
print(decision_tree_classifier)
print(decision_tree_classifier.get_depth())

DecisionTreeClassifier()
21


In [14]:
# DecisionTreeClassifier execution
decision_tree_classifier = tree.DecisionTreeClassifier(max_depth=10, random_state=0)

decision_tree_y_pred = execute_classifier(decision_tree_classifier, X_train, X_test, y_train)

In [15]:
# Best result
validate_tree(y_test, decision_tree_y_pred)

0.9994733330992591
Precision Score: 0.9473684210526315
Recall Score: 0.7346938775510204
[[28430     2]
 [   13    36]]


In [16]:
# DecisionTreeClassifier execution
decision_tree_classifier = tree.DecisionTreeClassifier(max_depth=10, random_state=0, min_samples_leaf=10)

decision_tree_y_pred = execute_classifier(decision_tree_classifier, X_train, X_test, y_train)

In [17]:
validate_tree(y_test, decision_tree_y_pred)

0.9993679997191109
Precision Score: 0.8604651162790697
Recall Score: 0.7551020408163265
[[28426     6]
 [   12    37]]


In [21]:
# By reducing our depth, we got worst results

decision_tree_classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
decision_tree_y_pred = execute_classifier(decision_tree_classifier, X_train, X_test, y_train)

validate_tree(y_test, decision_tree_y_pred)

0.999403110845827
Precision Score: 0.9210526315789473
Recall Score: 0.7142857142857143
[[28429     3]
 [   14    35]]


In [23]:
%%time
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = RandomForestClassifier(n_estimators=50, random_state=0, max_depth=10)

random_forest_y_pred = execute_classifier(random_forest_classifier, X_train, X_test, y_train)

Wall time: 1min 28s


In [24]:
validate_tree(y_test, random_forest_y_pred)

0.9995435553526912
Precision Score: 0.9736842105263158
Recall Score: 0.7551020408163265
[[28431     1]
 [   12    37]]


In [25]:
%%time
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=10)

random_forest_y_pred = execute_classifier(random_forest_classifier, X_train, X_test, y_train)

Wall time: 3min 2s


In [27]:
save_tree(random_forest_classifier.estimators_[0], 'random_forest_1')
save_tree(random_forest_classifier.estimators_[1], 'random_forest_2')


In [28]:
validate_tree(y_test, random_forest_y_pred)

0.9995084442259752
Precision Score: 0.9487179487179487
Recall Score: 0.7551020408163265
[[28430     2]
 [   12    37]]


In [30]:
%%time
from sklearn.ensemble import AdaBoostClassifier

adaboost_classifier = AdaBoostClassifier(random_state=0)
adaboost_y_pred = execute_classifier(adaboost_classifier, X_train, X_test, y_train)
 

Wall time: 1min 34s


In [31]:
save_tree(adaboost_classifier.estimators_[0], 'adaboost1')
save_tree(adaboost_classifier.estimators_[2], 'adaboost2')

validate_tree(y_test, adaboost_y_pred)


0.9992626663389628
Precision Score: 0.8888888888888888
Recall Score: 0.6530612244897959
[[28428     4]
 [   17    32]]


In [33]:
%%time
from sklearn.ensemble import AdaBoostClassifier

#Increase the number of estimators from default (50) to 100
adaboost_classifier = AdaBoostClassifier(random_state=0, n_estimators=100)
adaboost_y_pred = execute_classifier(adaboost_classifier, X_train, X_test, y_train)
 
validate_tree(y_test, adaboost_y_pred)

0.999403110845827
Precision Score: 0.8636363636363636
Recall Score: 0.7755102040816326
[[28426     6]
 [   11    38]]
Wall time: 2min 54s


In [38]:
%%time
from sklearn.ensemble import AdaBoostClassifier

#Increase the number of estimators from 100 to 200
adaboost_classifier = AdaBoostClassifier(random_state=0, n_estimators=200)
adaboost_y_pred = execute_classifier(adaboost_classifier, X_train, X_test, y_train)
 
validate_tree(y_test, adaboost_y_pred)

0.9995435553526912
Precision Score: 0.9285714285714286
Recall Score: 0.7959183673469388
[[28429     3]
 [   10    39]]
Wall time: 5min 26s
