# Credit Card Fraud Detection
#### By Shadi Bavar, Matthew Euliano, and Claire Parisi


##### Importing Required Libraries & Dataset

In [44]:
#Importing the libraries
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC

In [71]:
# Import Data
data = pd.read_csv('creditcard.csv')

# Get samples for each class
fraudsDF = data[data.Class == 1]
normalDF = data[data.Class == 0]
N = len(data)

print("Number of Features: ", fraudsDF.shape[1])
print("Number of Fraudulent Transactions: ",fraudsDF.shape[0], "({:.2f}%)".format(100*fraudsDF.shape[0]/N))
print("Number of Normal Transactions",normalDF.shape[0], "({:.2f}%)".format(100*normalDF.shape[0]/N))

#See the dataset
data.head()

Number of Features:  31
Number of Fraudulent Transactions:  492 (0.17%)
Number of Normal Transactions 284315 (99.83%)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### Regularize Features & Data Prep

In [76]:
#Scaling amount 
std_scale = StandardScaler()
data['Amount'] = std_scale.fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = std_scale.fit_transform(data['Time'].values.reshape(-1, 1))
# data = data.drop(['Time'], axis=1)

#Split dataset into inputs (x) and labels (y)
x = data.drop(['Class'], axis = 1)
y = data['Class']

#### Break dataset into Training and Testing Sets Representative of the Imbalanced Dataset

In [77]:
#split dataset into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y)

#check distributions of data
fraud_train_pct = y_train[y_train== 1].value_counts()/len(y_train)*100
fraud_test_pct = y_test[y_test == 1].value_counts()/len(y_test)*100

print('Training set frauds:', round(fraud_train_pct[1], 2), '%')
print('Test set frauds:', round(fraud_test_pct[1], 2), '%')


Training set frauds: 0.17 %
Test set frauds: 0.17 %


#### Undersample the Training Set to Balance the Classes

In [78]:

#find instances of fraud in training set
train_fraud_indices = np.array(y_train[y_train== 1].index)
n_train_fraud = len(train_fraud_indices)
train_nonfraud_indices = np.array(y_train[y_train== 0].index)

#Randomly select number of non-fraud transactions to match the number of fraud transactions
random_indices = np.random.choice(train_nonfraud_indices, n_train_fraud, replace = False)
undersample_indices = np.concatenate([train_fraud_indices, random_indices])

#Resample the training data
x_train_u  = x_train.loc[undersample_indices]
y_train_u = y_train.loc[undersample_indices]

#Check the new distribution of data
fraud_train_pct_u = y_train_u[y_train_u== 1].value_counts()/len(y_train_u)*100
nonfraud_train_pct_u = y_train_u[y_train_u== 0].value_counts()/len(y_train_u)*100
print('Training set frauds:', round(fraud_train_pct_u[1], 2), '%')
print('Training set non-frauds:', round(nonfraud_train_pct_u[0], 2), '%')

Training set frauds: 50.0 %
Training set non-frauds: 50.0 %


## SVM

In [79]:
# Functions to test and train different SVM Models
from sklearn.metrics import classification_report
def train_svm_model(x,y,C,kernel,class_weight):
    model = SVC(C=C,kernel=kernel,class_weight=class_weight)
    model.fit(x, y)
    return model
    
def test_svm_model(x,y,model):
    prediction = model.predict(x)
    print(confusion_matrix(y,prediction))
    # ConfusionMatrixDisplay.from_predictions(y,prediction)
    # report= classification_report(y, prediction)
    # print(report)

#### Balanced Linear Model

In [80]:
# Balanced Model
svm_base = train_svm_model(x_train_u,y_train_u,1,'linear','balanced')
print("Base SVM Confusion Matrix tested on Training Set")
test_svm_model(x_train_u,y_train_u,svm_base)
print("Base SVM Confusion Matrix tested on Test Set")
test_svm_model(x_test,y_test,svm_base)

Base SVM Confusion Matrix tested on Training Set
[[385   9]
 [ 27 367]]
Base SVM Confusion Matrix tested on Test Set
[[55084  1780]
 [   13    85]]


#### Weighted Linear Model

In [66]:
# Balanced Model
svm_base = train_svm_model(x_train_u,y_train_u,1,'linear','balanced')
print("Base SVM Confusion Matrix tested on Training Set")
test_svm_model(x_train_u,y_train_u,svm_base)
print("Base SVM Confusion Matrix tested on Test Set")
test_svm_model(x_test,y_test,svm_base)

Base SVM Confusion Matrix tested on Training Set
[[388   6]
 [ 22 372]]
Base SVM Confusion Matrix tested on Test Set
[[55063  1801]
 [   11    87]]


#### Base RBF Model

In [118]:
model = SVC(C=1,kernel='rbf',gamma='scale')
model.fit(x_train_u,y_train_u)
print("Base SVM Confusion Matrix tested on Training Set")
test_svm_model(x_train_u,y_train_u,model)
print("Base SVM Confusion Matrix tested on Test Set")
test_svm_model(x_test,y_test,model)

Base SVM Confusion Matrix tested on Training Set
[[389   5]
 [ 42 352]]
Base SVM Confusion Matrix tested on Test Set




[[55882   982]
 [   13    85]]


#### Base Poly Model

In [82]:
model = SVC(C=10,degree=1,kernel='poly',gamma='scale')
model.fit(x_train_u,y_train_u)
print("Base SVM Confusion Matrix tested on Training Set")
test_svm_model(x_train_u,y_train_u,model)
print("Base SVM Confusion Matrix tested on Test Set")
test_svm_model(x_test,y_test,model)

Base SVM Confusion Matrix tested on Training Set
[[387   7]
 [ 30 364]]
Base SVM Confusion Matrix tested on Test Set
[[55522  1342]
 [   13    85]]


#### TODO - Cross-Validation: Hyperparams we can vary = Kernel, C-param, Class Weighting

In [14]:
# from sklearn.svm import SVC
# from sklearn.model_selection import StratifiedShuffleSplit
# from sklearn.model_selection import GridSearchCV

# C_range = np.logspace(-2, 10, 13)
# gamma_range = np.logspace(-9, 3, 13)
# param_grid = dict(gamma=gamma_range, C=C_range)
# cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
# grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
# grid.fit(x_train_u, y_train_u)

# print(
#     "The best parameters are %s with a score of %0.2f"
#     % (grid.best_params_, grid.best_score_)
# )

In [15]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Loading the Digits dataset
# X, y = datasets.load_digits(return_X_y=True)

# Split the dataset in two equal parts
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [
    {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
    {"kernel": ["linear"], "C": [1, 10, 100, 1000]},
]

scores = ["precision", "recall"]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, scoring="%s_macro" % score,n_jobs=-1)
    clf.fit(x_train_u, y_train_u)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_["mean_test_score"]
    stds = clf.cv_results_["std_test_score"]
    for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_train_u, clf.predict(x_train_u)
    print(classification_report(y_true, y_pred))
    print()

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality.

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 1000, 'kernel': 'linear'}

Grid scores on development set:

0.657 (+/-0.078) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.640 (+/-0.099) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.647 (+/-0.060) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.630 (+/-0.082) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.649 (+/-0.063) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.627 (+/-0.086) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.649 (+/-0.063) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.627 (+/-0.084) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.908 (+/-0.055) for {'C': 1, 'kernel': 'linear'}
0.913 (+/-0.039) for {'C': 10, 'kernel': 'linear'}
0.907 (+/-0.040) for {'C': 100, 'kernel': 'linear'}
0.917 (+/-0.043) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full 

In [19]:
import pickle
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))
loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model.best_params_)

{'C': 1000, 'kernel': 'linear'}


In [20]:
# Test RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
tuned_parameters = [
    {"kernel": ["linear"], "C": [0.1, 1, 10,20,50, 100,200,500, 1000,2000]},
]

scores = ["precision", "recall"]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = RandomizedSearchCV(SVC(),tuned_parameters, n_iter=4, scoring="%s_macro" % score,n_jobs=-1)
    clf.fit(x_train_u, y_train_u)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_["mean_test_score"]
    stds = clf.cv_results_["std_test_score"]
    for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_train_u, clf.predict(x_train_u)
    print(classification_report(y_true, y_pred))
    print()

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality.

# Tuning hyper-parameters for precision





KeyboardInterrupt: 