In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time
import gc

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
import tensorflow as tf
from scipy.stats import randint as sp_randint
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import time


import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras import regularizers
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from keras_tqdm import TQDMNotebookCallback
from tqdm import tqdm_notebook,trange, tqdm

# Other Libraries
# from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
# from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import random

In [None]:
# Read the credit-card transactions CSV (path is relative to the notebook) and preview the first rows.
df = pd.read_csv("../data/creditcardfraud/creditcard.csv")
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler is preferred here because it is less sensitive to outliers.
std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# Scale 'Amount' first, then 'Time'; every fit_transform call refits the
# scaler on the single column it is given.
for raw_name, scaled_name in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    as_column = df[raw_name].values.reshape(-1, 1)
    df[scaled_name] = rob_scaler.fit_transform(as_column)

# The raw columns are no longer needed once their scaled versions exist.
df.drop(['Time', 'Amount'], axis=1, inplace=True)

In [None]:

# Move the scaled columns to the front BEFORE building X.
# The undersampled training data created later is cut from the reordered `df`,
# so the hold-out split taken here must use the same column order; otherwise
# models fitted on X_train see the features of original_Xtest in the wrong positions.
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']

df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

# Amount and Time are Scaled!

X = df.drop('Class', axis=1)
y = df['Class']

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Every iteration overwrites the previous one, so only the LAST of the five
# stratified splits is kept in the original_* variables.
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# The "original_" prefix distinguishes the split of the full data set from the
# X_train / y_train variables that are later built from the undersampled data.
# original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Turn into arrays
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

df.head()

In [None]:
# The classes are highly skewed, so build a balanced (50/50) sub-sample:
# all fraud rows plus an equally sized random draw of non-fraud rows.

# Shuffle before sub-sampling so the non-fraud rows kept below are a random
# draw; the seed makes a Restart & Run All pick the same rows every time.
df = df.sample(frac=1, random_state=42)

fraud_df = df.loc[df['Class'] == 1]
# Keep exactly as many non-fraud rows as there are fraud rows
# (the original hard-coded 492, the fraud count of this data set).
non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]

# NOTE: despite its name this frame is class-balanced, not normally distributed.
normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows so the two classes are interleaved
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()


In [None]:
# Undersampling before cross validating (prone to overfit)
X = new_df.drop('Class', axis=1)
y = new_df['Class']

In [None]:
# The features are already scaled, so the balanced sample can be split directly.
from sklearn.model_selection import train_test_split

# 80/20 split used only for the undersampled data (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Convert the undersampled split to plain NumPy arrays.
# np.asarray (instead of `.values`) keeps this cell re-runnable: it also
# accepts objects that are already arrays, where `.values` would raise.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# Baseline classifiers to score with default hyper-parameters; the
# commented-out entries are alternatives that are currently disabled.
classifiers = {
#     "LogisiticRegression": LogisticRegression(),
#     "KNearest": KNeighborsClassifier(),
#     "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

In [None]:

# Fit every baseline classifier and report its mean 5-fold accuracy on the
# undersampled training data.
for classifier in classifiers.values():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    accuracy_percent = round(training_score.mean(), 2) * 100
    print("Classifiers: ", type(classifier).__name__, "Has a training score of", accuracy_percent, "% accuracy score")

In [None]:
from sklearn.model_selection import GridSearchCV

# DecisionTree Classifier: exhaustive search over the split criterion,
# the depth limit and the leaf-count limit.
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(1,100,1)),
              "max_leaf_nodes": list(range(2,100,1))}

# The parameter grid is passed positionally and therefore has to come before
# the keyword arguments (the previous order `cv=5, tree_params` was a SyntaxError).
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params, cv=5, n_jobs=-1)
grid_tree.fit(X_train, y_train)

# tree best estimator
tree_clf = grid_tree.best_estimator_

# Re-score the selected tree with a fresh 5-fold cross validation.
tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
print('DecisionTree Classifier Cross Validation Score', round(tree_score.mean() * 100, 2).astype(str) + '%')

In [None]:
# Display the estimator selected by the decision-tree grid search.
grid_tree.best_estimator_

In [None]:
fig, ax = plt.subplots()

# Wedge labels, in the same order as the counts in `data`
labels = ('Fraud', 'No Fraud')

# Row counts per class: fraud (Class == 1) first, then non-fraud (Class == 0)
data = [df['Class'].value_counts()[class_label] for class_label in (1, 0)]

print(data)

def func(pct, allvals):
    """Build the text shown on a pie wedge: percentage plus absolute count.

    Parameters
    ----------
    pct : float
        Share of the wedge in percent (0-100).
    allvals : sequence of numbers
        All values of the pie; their sum is the total the percentage refers to.

    Returns
    -------
    str
        ``"<pct with one decimal>%\\n<count> samples"``.
    """
    # Round to the nearest integer instead of truncating: a percentage that is
    # slightly imprecise (e.g. 33.3333 of 3) would otherwise be reported one
    # sample too low.
    absolute = int(round(float(pct) / 100. * float(np.sum(allvals))))
    return "{:.1f}%\n{:d} samples".format(pct, absolute)

# autopct passes each wedge's percentage to `func`, which formats the label
# together with the absolute sample count.
ax.pie(
    data,
    labels=labels,
    autopct=lambda share: func(share, data),
    startangle=60,
)

# Equal scaling on both axes
ax.axis('equal')

plt.show()

In [None]:
# Mean cross-validated score of every grid-search candidate.
# Read from `cv_results_` (as the kNN cells already do) because the older
# `grid_scores_` attribute no longer exists in current scikit-learn releases.
scores = list(grid_tree.cv_results_['mean_test_score'])

In [None]:

# Randomised search for kNN: number of neighbours drawn from sp_randint(1, 20)
# and the Minkowski power p from {1, 2}.
param_dist = {
              "n_neighbors": sp_randint(1, 20),
              "p": [1, 2]
}
start_time = time.time()
rand_kNN = RandomizedSearchCV(KNeighborsClassifier(), param_dist, cv=10, n_jobs=-1, n_iter=50)
rand_kNN.fit(X_train, y_train)
knn_clf = rand_kNN.best_estimator_

# Re-score the selected model with a fresh 10-fold cross validation.
kNN_score = cross_val_score(knn_clf, X_train, y_train, cv=10)
print('kNN Cross Validation Score', round(kNN_score.mean() * 100, 2).astype(str) + '%')
# Only the elapsed time is reported: the previous message also printed `i`
# (not defined at this point of a top-to-bottom run) and the unrelated
# decision-tree `scores` list.
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Display the estimator selected by the kNN randomised search.
rand_kNN.best_estimator_

In [None]:
# Collect, per sampled candidate: neighbour count (xs), Minkowski power (ys)
# and mean cross-validated score (zs).
xs = [candidate['n_neighbors'] for candidate in rand_kNN.cv_results_['params']]
ys = [candidate['p'] for candidate in rand_kNN.cv_results_['params']]
zs = [rand_kNN.cv_results_['mean_test_score'][pos] for pos in range(len(rand_kNN.cv_results_['params']))]

print(xs, ys, zs)

scatter_x = np.array(xs)
scatter_y = np.array(zs)
group = np.array(ys)
# One colour per Minkowski power
cdict = {1: 'red', 2: 'blue'}

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)

# Draw each p-group as its own scatter series so it gets its own legend entry.
for g in np.unique(group):
    in_group = group == g
    ax.scatter(scatter_x[in_group], scatter_y[in_group], c=cdict[g], label=g, s=100)

ax.set_xlabel('Number of Neighboors')
ax.set_ylabel('Mean 10-fold score')
ax.legend(title="Power of Minkowski Metric")
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Points for the 3-D plot of the decision-tree grid search:
# xs = max_depth, ys = max_leaf_nodes, zs = mean cross-validated score.
xs = []
ys = []
zs = []

# Number of 'entropy' candidates seen (whether or not they were kept)
num = 0

# Read the candidates from `cv_results_` (as the kNN cells already do); the
# older `grid_scores_` attribute no longer exists in current scikit-learn.
for candidate, mean_score in zip(grid_tree.cv_results_['params'],
                                 grid_tree.cv_results_['mean_test_score']):
    if candidate['criterion'] != 'entropy':
        continue

    # Thin the grid: keep depths divisible by 5 and leaf limits divisible by 19.
    if candidate['max_depth'] % 5 == 0 and candidate['max_leaf_nodes'] % 19 == 0:
        xs.append(candidate['max_depth'])
        ys.append(candidate['max_leaf_nodes'])
        zs.append(mean_score)
    num = num + 1


In [None]:
# 3-D scatter of the thinned decision-tree grid-search results.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')

ax.scatter(xs, ys, zs)

ax.set_zlabel('Mean 5-fold score')
ax.set_ylabel('Child Node Limit')
ax.set_xlabel('Depth Limit')

# Save the figure under the base name "gridSearchCredit", then display it.
fig.savefig("gridSearchCredit")

plt.show()

In [None]:


# Width of the input layer = number of feature columns in the training data.
n_inputs = X_train.shape[1]

def getModel(learning_rate, regularization, layers):
    """Build and compile a binary-classification feed-forward network.

    Architecture: one ReLU layer of width ``n_inputs`` (read from the module
    level when the function is called), followed by ``layers`` ReLU layers of
    32 units with L2 kernel regularisation, and a single sigmoid output unit.

    Parameters
    ----------
    learning_rate : float
        Learning rate handed to the Adam optimiser.
    regularization : float
        L2 factor applied to the kernels of the 32-unit hidden layers.
    layers : int
        Number of 32-unit hidden layers to add.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, accuracy metric).
    """
    undersample_model = Sequential()

    undersample_model.add(Dense(n_inputs, input_shape=(n_inputs, ), activation='relu'))
    for _ in range(layers):
        undersample_model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(regularization)))
    undersample_model.add(Dense(1, activation="sigmoid"))

    # The learning rate is passed positionally: the keyword is `lr` in older
    # Keras releases and `learning_rate` in newer ones, but it is the first
    # positional parameter of Adam in both.
    undersample_model.compile(Adam(learning_rate), loss=keras.losses.binary_crossentropy, metrics=['accuracy'])
    return undersample_model

In [None]:
# Print the layer summary of a sample network (learning rate 0.001, L2 factor 0.1, two 32-unit hidden layers).
getModel(.001, .1, 2).summary()

In [None]:
# Manual k-fold cross validation of the network for several hyper-parameter sets.
k = 10
num_epochs = 200
# Size of one validation fold; rows beyond k * num_val_samples never serve as
# validation data and always stay in the training part.
num_val_samples = len(X_train) // k
# One [param_set, History] entry per (param_set, fold), in training order
k_fold_results = []

# Each entry: [learning_rate, l2_regularization, number_of_32_unit_layers]
params = [[.001, .01, 2], [.001, .01, 1], [.001, .1, 1], [.001, .1, 2], [.0005, .1, 2], [.0005, .01, 2], [.0005, .1, 1], [.0005, .01, 1]]

for param_set in tqdm_notebook(params):
    print("using params! " + str(param_set))
    for fold in tqdm_notebook(range(k)):
        # A fresh, untrained model for every fold
        undersample_model = getModel(param_set[0], param_set[1], param_set[2])
        print("TESTING FOLD " + str(fold + 1))

        fold_start = fold * num_val_samples
        fold_stop = (fold + 1) * num_val_samples

        # Validation part: the fold-th contiguous block of rows
        val_data = X_train[fold_start:fold_stop]
        val_targets = y_train[fold_start:fold_stop]

        # Training part: every row outside that block
        partial_train_data = np.delete(X_train, np.s_[fold_start:fold_stop], axis=0)
        partial_train_targets = np.delete(y_train, np.s_[fold_start:fold_stop], axis=0)

        history = undersample_model.fit(
            partial_train_data,
            partial_train_targets,
            validation_data=(val_data, val_targets),
            epochs=num_epochs,
            batch_size=32,
            verbose=0,
        )

        k_fold_results.append([param_set, history])
        print()
    

In [None]:
# Average the per-epoch training curves over the k folds of each parameter set.
# k_fold_results holds k consecutive entries per parameter set, so `idx`
# simply walks through it in order.
means = []
idx = 0
for _ in range(len(params)):
    mean = {curve: [0] * num_epochs for curve in ('val_loss', 'val_acc', 'loss', 'acc')}
    mean['params'] = k_fold_results[idx][0]

    for _fold in range(k):
        fold_curves = k_fold_results[idx][1].history

        # Accumulate every recorded curve epoch by epoch
        for curve_name in fold_curves:
            for epoch, value in enumerate(fold_curves[curve_name]):
                mean[curve_name][epoch] += value
        idx += 1

    # Turn the sums into means. NOTE: this also divides the entries of
    # 'params' by k; the plotting cell multiplies them by 10 when building
    # its titles.
    for key in list(mean):
        mean[key] = [x / k for x in mean[key]]

    means.append(mean)
        


In [None]:
# Number of averaged result entries (one is appended per parameter set).
len(means)

In [None]:
# Best and worst mean validation accuracy reached by each parameter set
# (one value per entry of `means`, in the same order).
maxAccs = [max(mean['val_acc']) for mean in means]
minAccs = [min(mean['val_acc']) for mean in means]

print(maxAccs)
print(sorted(maxAccs))

print(minAccs)
print(sorted(minAccs))

# The following cells plot exactly three configurations from `tops`
# (a 2 x 3 subplot grid). Pick the three with the highest peak validation
# accuracy instead of hard-coding indices read off one particular run
# (previously noted as 2, 5, 7), so a fresh run always defines `tops`.
top_positions = sorted(range(len(means)), key=lambda pos: maxAccs[pos], reverse=True)[:3]
tops = [means[pos] for pos in top_positions]

In [None]:
# 0-based epoch index at which the first entry of `tops` reaches its highest 'val_acc' value.
# NOTE(review): in the visible notebook `tops` is only assigned in commented-out code — confirm it is defined before this cell runs.
tops[0]['val_acc'].index(max(tops[0]['val_acc']))

In [None]:
# Accuracy (top row) and loss (bottom row) curves for each entry of `tops`,
# one column per entry in a 2 x 3 grid.
plt.figure(figsize=(20, 10))

for column, mean in enumerate(tops, start=1):
    # The stored params were divided by k when the means were computed;
    # multiplying by 10 is what the titles have always shown.
    params_text = str([x * 10 for x in mean['params']])

    # Top row: training vs. validation accuracy
    plt.subplot(2, 3, column)
    plt.plot(mean['acc'])
    plt.plot(mean['val_acc'])
    plt.title('Model accuracy ' + params_text)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')

    # Bottom row: training vs. validation loss, clipped to [0, 3.5]
    plt.subplot(2, 3, column + 3)
    plt.plot(mean['loss'])
    plt.plot(mean['val_loss'])
    plt.title('Model loss ' + params_text)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.ylim(0, 3.5)
    plt.legend(['Train', 'Test'], loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Bare expression: only echoes the pyplot module object, nothing is drawn.
plt

In [None]:
# Boosted decision tree: AdaBoost over a depth- and leaf-limited tree.
best_dtc = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=76)
# The weak learner is passed positionally: its keyword is `base_estimator` in
# older scikit-learn releases and `estimator` in newer ones, but it is the
# first positional parameter in both.
adaboost = AdaBoostClassifier(best_dtc, n_estimators=1000)

In [None]:
# Mean 10-fold accuracy of the boosted tree on the undersampled training data.
adaboost_score = cross_val_score(adaboost, X_train, y_train, cv=10)
adaboost_percent = round(adaboost_score.mean() * 100, 2)
print('adaboost Cross Validation Score', adaboost_percent.astype(str) + '%')

In [None]:
# Scratch check: draw one random integer between 1 and 1000 (both inclusive); the value is only displayed.
random.randint(1, 1000)

In [None]:
# Random sweep over the number of AdaBoost estimators:
# xs = estimator counts tried, ys = mean 10-fold accuracy for each count.
xs = []
ys = []
zs = []
start_time = time.time()
for i in range(0, 50):
    n = random.randint(1, 1000)
    print("TESTING WITH " + str(n))
    best_dtc = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=76)
    # Weak learner passed positionally: its keyword is `base_estimator` in
    # older scikit-learn releases and `estimator` in newer ones.
    adaboost = AdaBoostClassifier(best_dtc, n_estimators=n)
    adaboost_score = cross_val_score(adaboost, X_train, y_train, cv=10, n_jobs=-1)
    ys.append(adaboost_score.mean())
    xs.append(n)
    print('adaboost Cross Validation Score', round(adaboost_score.mean() * 100, 2).astype(str) + '%')

# Report the total run time only; the previous message also printed the
# unrelated `scores` list left over from the decision-tree grid search.
print("--- %s seconds ---" % (time.time() - start_time))


In [None]:
# Mean accuracy against estimator count, with a horizontal reference line at 0.935.
plt.ylabel("10-fold Mean Accuracy")
plt.xlabel("Number of Estimators")
plt.axhline(y=.935)
plt.scatter(xs, ys) #382 max

In [None]:
# Randomised search for the SVM: kernel type and penalty C drawn from sp_randint(1, 300).
start_time = time.time()
param_dist = {
              "kernel": ["linear", "rbf", "sigmoid"],
              "C": sp_randint(1, 300),
}
svm = RandomizedSearchCV(SVC(), param_dist, cv=10, n_jobs=6, n_iter=50)
svm.fit(X_train, y_train)
svm_clf = svm.best_estimator_

# Re-score the selected model with a fresh 10-fold cross validation.
svm_score = cross_val_score(svm_clf, X_train, y_train, cv=10)
print('svm Cross Validation Score', round(svm_score.mean() * 100, 2).astype(str) + '%')
# Report the elapsed time only; the previous message also printed a loop
# index and a `scores` list that belong to other cells.
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Display the estimator selected by the SVM randomised search.
svm_clf

In [None]:
# Display the full cross-validation results of the SVM search (read again below for plotting).
svm.cv_results_

In [None]:
# Per-kernel containers for the SVM search results:
# 'xs' collects the C values, 'ys' the matching mean test scores.
kernelParams = {
    kernel_name: {'xs': [], 'ys': []}
    for kernel_name in ('linear', 'rbf', 'sigmoid')
}

In [None]:

# Sort every sampled SVM candidate into the bucket of its kernel.
for candidate, mean_score in zip(svm.cv_results_['params'], svm.cv_results_['mean_test_score']):
    bucket = kernelParams[candidate['kernel']]
    bucket['xs'].append(candidate['C'])
    bucket['ys'].append(mean_score)
    

In [None]:
# One scatter series per kernel; the legend lists the kernels in dictionary order.
legendNames = list(kernelParams.keys())
for kernel_name in legendNames:
    series = kernelParams[kernel_name]
    plt.scatter(series['xs'], series['ys'])
plt.legend(legendNames)
plt.ylabel("10-fold mean accuracy")
plt.xlabel("C penalty")

In [None]:
# Learning curves: train each of the five chosen models on 10 %, 20 %, ...,
# 100 % of the undersampled training data and score it on both test sets.
# resultsFinal[m] collects, for model m, one [auc_undersampled_test, auc_full_test]
# pair per training-set fraction.
resultsFinal = [
    [],
    [],
    [],
    [],
    []
]
print(len(X_train))
for i in range(1, 11):
    gc.collect()
    # First i tenths of the training rows
    subset_size = int((len(X_train) / 10) * i)
    y_train_subset = y_train[:subset_size]
    X_train_subset = X_train[:subset_size]
    print(str(len(X_train)) + " " + str(len(y_train)))
    print(str(len(X_train_subset)) + " " + str(len(y_train_subset)))

    # Fresh, unfitted models for every fraction. The order matters: the index
    # is the row of resultsFinal, and index 1 is the Keras network.
    bestModels = [
        DecisionTreeClassifier(max_depth=200, max_leaf_nodes=76),
        getModel(.001, .1, 1),
        KNeighborsClassifier(n_neighbors=3, p=2),
        # Weak learner passed positionally: its keyword is `base_estimator` in
        # older scikit-learn releases and `estimator` in newer ones.
        AdaBoostClassifier(DecisionTreeClassifier(max_depth=200, max_leaf_nodes=76), n_estimators=17),
        SVC(kernel="linear", C=199)
    ]

    for j, model in enumerate(bestModels):
        start_time = time.time()
        if j == 1:
            # The Keras model needs explicit training settings
            model.fit(X_train_subset, y_train_subset, epochs=25, batch_size=32, verbose=0)
        else:
            model.fit(X_train_subset, y_train_subset)

        # NOTE(review): ROC-AUC is computed from `predict` output; for the
        # scikit-learn models that is presumably hard class labels rather
        # than scores — confirm this is intended.
        predicAll = model.predict(original_Xtest)
        predicSub = model.predict(X_test)
        scores = [roc_auc_score(y_test, predicSub), roc_auc_score(original_ytest, predicAll)]
        resultsFinal[j].append(scores)
        print(str(i) + " " + str(scores) + "--- %s seconds ---" % (time.time() - start_time))
    

In [None]:
# Display the collected ROC-AUC pairs: one inner list per model, one pair per training-set fraction.
resultsFinal

In [None]:
def plotForAlgo(plt, selector, title):
    """Draw the two learning curves of one model into a 3 x 3 subplot grid.

    Reads the module-level ``resultsFinal`` and plots, for the entry at
    position ``selector``, the first and the second value of every stored
    pair against 10, 20, 30, ... on the x axis. If no entry sits at that
    position, two empty curves are drawn.

    Parameters
    ----------
    plt : pyplot-like object
        Object providing ``subplot``, ``plot``, ``title``, ``xlabel``,
        ``ylabel`` and ``legend``.
    selector : int
        Position of the model in ``resultsFinal``; the plot is placed in
        subplot ``selector + 1``.
    title : str
        Title of the subplot.
    """
    percentages = []
    first_values = []
    second_values = []

    for position, model_results in enumerate(resultsFinal):
        if position != selector:
            continue
        # The n-th stored pair is plotted at x = n * 10
        for step, pair in enumerate(model_results, start=1):
            percentages.append(step * 10)
            first_values.append(pair[0])
            second_values.append(pair[1])

    plt.subplot(3, 3, selector + 1)
    plt.plot(percentages, first_values)
    plt.plot(percentages, second_values)
    plt.title(title)
    plt.xlabel("Percent of Training Dataset")
    plt.ylabel("ROC-AUC Score")

    plt.legend(["20% undersampled subset", "Entire Dataset"])


In [None]:
# One learning-curve panel per model, in the order the models were evaluated.
plt.figure(figsize=(15, 10))

panel_titles = [
    "Decision Tree",
    "Neural Networks",
    "k Nearest Neighboors",
    "Boosted Decision Tree",
    "SVM Classifier",
]
for selector, panel_title in enumerate(panel_titles):
    plotForAlgo(plt, selector, panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Learning curve of the neural network alone (position 1 in resultsFinal).
# plotForAlgo requires the pyplot object and a title besides the position;
# the one-argument call raised a TypeError.
plotForAlgo(plt, 1, "Neural Networks")

In [None]:
# Learning curve of the kNN classifier alone (position 2 in resultsFinal).
# plotForAlgo requires the pyplot object and a title besides the position;
# the one-argument call raised a TypeError.
plotForAlgo(plt, 2, "k Nearest Neighboors")

In [None]:
# Learning curve of the boosted decision tree alone (position 3 in resultsFinal).
# plotForAlgo requires the pyplot object and a title besides the position;
# the one-argument call raised a TypeError.
plotForAlgo(plt, 3, "Boosted Decision Tree")

In [None]:
# Learning curve of the SVM classifier alone (position 4 in resultsFinal).
# plotForAlgo requires the pyplot object and a title besides the position;
# the one-argument call raised a TypeError.
plotForAlgo(plt, 4, "SVM Classifier")