  # **Credit Card Fraud Detection using Scikit-Learn and Snap ML**


In [None]:
# Install the `opendatasets` package into the kernel's environment; the next cell
# imports it to download the dataset.
%pip install opendatasets

In [None]:
import opendatasets as od

# Kaggle page of the credit card fraud dataset used throughout this notebook.
DATASET_URL = "https://www.kaggle.com/mlg-ulb/creditcardfraud"

# Fetch the dataset; later cells read it from ./creditcardfraud/creditcard.csv.
# NOTE(review): presumably this prompts for Kaggle credentials on first use — confirm.
od.download(DATASET_URL)

In [None]:
# Install Snap ML into the kernel's environment; its decision tree and SVM
# implementations are imported further down in this notebook.
%pip install snapml

In [None]:
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
import time
import warnings
warnings.filterwarnings("ignore")


# Dataset Analysis
In this section you will read the dataset into a pandas DataFrame, preview its contents, and look at some summary statistics.

In [None]:
# Load the transactions into a DataFrame and report its dimensions.
raw_data = pd.read_csv('./creditcardfraud/creditcard.csv')
n_observations, n_variables = raw_data.shape
print("There are " + str(n_observations) + " observations in the credit card fraud dataset.")
print("There are " + str(n_variables) + " variables in the dataset.")

# Preview the first rows as the cell's output.
raw_data.head()

In [None]:
# Report the dataset dimensions. This reuses the `raw_data` frame that was
# loaded above instead of parsing the CSV a second time.
# The literals now carry their surrounding spaces so the numbers no longer
# run into the neighbouring words.
print("There are " + str(len(raw_data)) + " observations in the credit card fraud dataset.")
print("There are " + str(len(raw_data.columns)) + " variables in the data set")

# Histograms of the "Time" and "V1" columns with 10 bins each.
# The trailing semicolon keeps the array-of-axes repr out of the cell output.
raw_data.hist(column=["Time", "V1"], bins=10);


  In practice, a financial institution may have access to a much larger dataset of transactions. To simulate such a case, we will inflate the original one 10 times.

In [None]:
import numpy as np
import pandas as pd

# Location of the downloaded CSV inside the 'creditcardfraud' folder.
file_path = './creditcardfraud/creditcard.csv'
raw_data = pd.read_csv(file_path)

# Simulate a larger dataset: every row is repeated n_replicas times
# (np.repeat along axis 0 keeps the copies of a row next to each other).
n_replicas = 10
replicated_rows = np.repeat(raw_data.values, n_replicas, axis=0)
big_raw_data = pd.DataFrame(replicated_rows, columns=raw_data.columns)

n_big_rows, n_big_cols = big_raw_data.shape
print("There are " + str(n_big_rows) + " observations in the dataset")
print("There are " + str(n_big_cols) + " columns in the dataset ")


- `labels`: the distinct values of the `Class` column of `big_raw_data`, i.e. the classes present in the dataset.
- `sizes`: the number of rows that belong to each class, obtained with `value_counts()` on the `Class` column.

In [None]:
# Count the rows per class once and derive both the labels and the wedge sizes
# from that single result, so every label is paired with its own count.
# (unique() returns values in order of first appearance while value_counts()
# sorts by frequency; taking labels and sizes from the two calls separately can
# attach a label to the wrong wedge.)
class_counts = big_raw_data['Class'].value_counts()
labels = class_counts.index.to_numpy()
sizes = class_counts.values

# Pie chart of the target distribution; percentages are shown with 3 decimals.
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.3f%%')
ax.set_title('Target Variable Value Counts')
plt.show()


In [None]:
# NOTE(review): re-reading the CSV rebinds `big_raw_data` to the original,
# un-replicated transactions, so the cells below operate on the original-size
# data rather than on a replicated copy — confirm this is intended.
file_path = './creditcardfraud/creditcard.csv'
big_raw_data = pd.read_csv(file_path)

# All statistics below are computed from the same array so that the histogram,
# the min/max and the percentile always describe the same data.
amounts = big_raw_data.Amount.values

# Histogram of the transaction amounts using 6 bins.
fig, ax = plt.subplots()
ax.hist(amounts, 6, histtype='bar', facecolor='g')
ax.set_title('Distribution of transaction amounts')
ax.set_xlabel('Amount')
ax.set_ylabel('Number of transactions')
plt.show()

print("Minimum amount value is ", np.min(amounts))
print("Maximum amount value is ", np.max(amounts))
print("90% of the transactions have an amount less or equal than ", np.percentile(amounts, 90))

# Dataset Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, normalize

# Column layout used below: positions 1..29 are the model features and
# position 30 is the target. (Position 0 is left out of the feature matrix.)
feature_cols = slice(1, 30)
label_col = 30

# Standardize the feature columns in place on the DataFrame.
scaler = StandardScaler()
big_raw_data.iloc[:, feature_cols] = scaler.fit_transform(big_raw_data.iloc[:, feature_cols])

# Split the underlying array into the feature matrix and the label vector,
# then scale every sample (row) to unit L1 norm.
data_matrix = big_raw_data.values
y = data_matrix[:, label_col]
x = normalize(data_matrix[:, feature_cols], norm='l1')
print("x.shape=", x.shape, ",y.shape=", y.shape)


In [None]:
# Install scikit-learn into the kernel's environment.
# NOTE(review): earlier cells already import from sklearn, so on a fresh
# environment this install would have to run before them — consider moving it up.
%pip install scikit-learn

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Report the shape of each of the four resulting arrays.
for _caption, _part in (
    ("x_train=", x_train),
    ("x_test=", x_test),
    ("y_train=", y_train),
    ("y_test=", y_test),
):
    print(_caption, _part.shape)


<div id="dt_sklearn">
    <h2>Build a Decision Tree Classifier model with Scikit-Learn</h2>
</div>



In [None]:
# Per-sample weights computed with the 'balanced' strategy, i.e. inversely
# proportional to the class frequencies in y_train, so the rare class is not
# drowned out during training.
w_train = compute_sample_weight('balanced', y_train)

# Fraud detection is a classification task (and this section builds a
# "Decision Tree Classifier"), so use the classifier rather than a regressor.
# A fixed random_state gives reproducible output across runs.
sklearn_dt = DecisionTreeClassifier(max_depth=4, random_state=35)

# Train the model and measure the wall-clock training time.
t0 = time.time()
sklearn_dt.fit(x_train, y_train, sample_weight=w_train)
sklearn_time = time.time() - t0
print("[Scikit-Learn] Training time (s):  {0:.5f}".format(sklearn_time))


<div id="dt_snapml">
    <h2>Build a Decision Tree Classifier model with Snap ML</h2>
</div>


In [None]:
# Requires x_train, y_train and the sample weights w_train from the cells above.

# Import the Decision Tree Classifier from Snap ML. It is aliased so that it
# does not shadow scikit-learn's DecisionTreeClassifier in the notebook namespace.
from snapml import DecisionTreeClassifier as SnapDecisionTreeClassifier

# Snap ML offers multi-threaded CPU/GPU training of decision trees, unlike scikit-learn.

# To train on a GPU instead, use:
# snapml_dt = SnapDecisionTreeClassifier(max_depth=4, random_state=45, use_gpu=True, n_jobs=4)

# CPU training; n_jobs sets the number of threads used at training time.
# Fraud detection is a classification task, so a classifier is used here
# (same max_depth as the scikit-learn tree, to keep the comparison fair).
snapml_dt = SnapDecisionTreeClassifier(max_depth=4, random_state=45, n_jobs=4)

# Train the model and measure the wall-clock training time.
t0 = time.time()
snapml_dt.fit(x_train, y_train, sample_weight=w_train)
snapml_time = time.time() - t0
print("[Snap ML] Training time (s):  {0:.5f}".format(snapml_time))


<div id="dt_sklearn_snapml">
    <h2>Evaluate the Scikit-Learn and Snap ML Decision Tree Classifier Models</h2>
</div>


In [None]:
def _fraud_scores(model, features):
    """Return one fraud score per row of `features`.

    Uses the probability of the second class (column 1 of `predict_proba`)
    when the model exposes `predict_proba`, and falls back to `predict`
    for models that do not (e.g. regressors).
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(features)[:, 1]
    return model.predict(features)


# Evaluate the two trees that were trained above. Nothing is retrained here,
# so both models are compared exactly as they were fitted.

# Snap ML vs Scikit-Learn training speedup
training_speedup = sklearn_time / snapml_time
print('[Decision Tree] Snap ML vs. Scikit-Learn training speedup : {0:.2f}x '.format(training_speedup))

# ROC-AUC of the Scikit-Learn model on the held-out test set
sklearn_pred = _fraud_scores(sklearn_dt, x_test)
sklearn_roc_auc = roc_auc_score(y_test, sklearn_pred)
print('[Scikit-Learn] ROC-AUC score : {0:.3f}'.format(sklearn_roc_auc))

# ROC-AUC of the Snap ML model on the held-out test set
snapml_pred = _fraud_scores(snapml_dt, x_test)
snapml_roc_auc = roc_auc_score(y_test, snapml_pred)
print('[Snap ML] ROC-AUC score : {0:.3f}'.format(snapml_roc_auc))

<div id="svm_sklearn">
    <h2>Build a Support Vector Machine model with Scikit-Learn</h2>
</div>


In [None]:
# import the linear Support Vector Machine (SVM) classifier from Scikit-Learn
from sklearn.svm import LinearSVC

# Features and labels must stay aligned row for row. Fail loudly on a mismatch
# instead of trimming the labels to a hard-coded length, which would silently
# pair samples with the wrong labels whenever the dataset size changes.
assert len(x_train) == len(y_train), "x_train and y_train must have the same number of rows"

# instantiate a scikit-learn linear SVM classifier
# - class_weight='balanced' matches the Snap ML model this is compared against
# - for reproducible output across multiple function calls, random_state is fixed
sklearn_svm = LinearSVC(class_weight='balanced', random_state=31, loss="hinge", fit_intercept=False)

# train the linear SVM using Scikit-Learn and measure the wall-clock training time
t0 = time.time()
sklearn_svm.fit(x_train, y_train)
sklearn_time = time.time() - t0
print("[Scikit-Learn] Training time (s):  {0:.5f}".format(sklearn_time))


<div id="svm_snap">
    <h2>Build a Support Vector Machine model with Snap ML</h2>
</div>


In [None]:
# Snap ML's Support Vector Machine (SVM) implementation
import time
from snapml import SupportVectorMachine

# Snap ML can train SVMs multi-threaded on the CPU or on a GPU.
# GPU variant (set use_gpu=True):
# snapml_svm = SupportVectorMachine(class_weight='balanced', random_state=25, use_gpu=True, fit_intercept=False)

# CPU variant: n_jobs controls how many threads are used during training.
snapml_svm = SupportVectorMachine(
    class_weight='balanced',
    random_state=25,
    n_jobs=4,
    fit_intercept=False,
)
# print(snapml_svm.get_params())

# Fit the model and record how long training took (wall-clock seconds).
t0 = time.time()
model = snapml_svm.fit(x_train, y_train)
snapml_time = time.time() - t0
print("[Snap ML] Training time (s):  {0:.2f}".format(snapml_time))

<div id="svm_sklearn_snap">
    <h2>Evaluate the Scikit-Learn and Snap ML Support Vector Machine Models</h2>
</div>


In [None]:
def _confidence_scores(model, features):
    """Return one confidence score per row of `features`.

    Uses `decision_function` when the model provides it, so that ROC-AUC is
    computed from continuous scores rather than hard labels, and falls back
    to `predict` for models without a `decision_function`.
    """
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return model.predict(features)


# compute the Snap ML vs Scikit-Learn training speedup
training_speedup = sklearn_time / snapml_time
print('[Support Vector Machine] Snap ML vs. Scikit-Learn training speedup : {0:.2f}x '.format(training_speedup))

# run inference using the Scikit-Learn model
# get the confidence scores for the test samples
sklearn_pred = _confidence_scores(sklearn_svm, x_test)

# ROC-AUC on the test set (the acc_* names are kept for compatibility; the
# values are ROC-AUC scores, not accuracies)
acc_sklearn = roc_auc_score(y_test, sklearn_pred)
print("[Scikit-Learn] ROC-AUC score:   {0:.3f}".format(acc_sklearn))

# run inference using the Snap ML model
# get the confidence scores for the test samples
snapml_pred = _confidence_scores(snapml_svm, x_test)

# ROC-AUC on the test set
acc_snapml = roc_auc_score(y_test, snapml_pred)
print("[Snap ML] ROC-AUC score:   {0:.3f}".format(acc_snapml))


# Author

**Muizz.**