Get the dataset from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud and place `creditcard.csv` next to this notebook.

In [None]:
import numpy as np
import pandas as pd
from IPython.display import display

# Load the credit-card transactions and preview the first rows.
data = pd.read_csv('creditcard.csv')
display(data.head())

# Tally the rows per 'Class' label; label 1 is reported as fraud, label 0 as non-fraud.
value_counts = data.value_counts('Class')
fraud_total, non_fraud_total = value_counts[1], value_counts[0]
print(
    "There are {} frauds and {} non-frauds in this dataset.".format(
        fraud_total, non_fraud_total
    )
)


In [48]:
from sklearn.model_selection import train_test_split

# Drop the 'Time' column.
# errors='ignore' keeps this cell idempotent: re-running it after 'Time' has
# already been removed from `data` no longer raises a KeyError.
data = data.drop(columns='Time', errors='ignore')

# Features are every remaining column except the label; the label is 'Class'.
X = data.drop(columns='Class')
y = data['Class']


In [None]:

# Split the data into training and testing sets (20% held out, fixed seed).
# NOTE(review): the split is not stratified; for an imbalanced label consider
# passing stratify=y so both splits keep the same fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("There are {} training samples and {} test samples.".format(X_train.shape[0], X_test.shape[0]))

# y_train / y_test are Series, and Series.value_counts() takes `normalize` as its
# first positional argument. Passing 'Class' there is truthy, so the previous
# calls printed class proportions instead of class counts.
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a seeded decision tree trained on the untouched training split.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is still displayed.
clf

In [None]:
from sklearn.metrics import classification_report

# Score the baseline tree on the held-out test split.
y_pred = clf.predict(X_test)

print("Baseline Scores For Decision Tree Classifier")
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

Here I shall experiment with different techniques that might improve the performance of the model (or several models) and determine which of them really works.

The techniques I want to try are as follows:

1. Scaling features that were not already scaled.
2. Balancing training data by oversampling with SMOTE.
3. Balancing training data with random undersampling.
4. Removing outliers from the training data.

In [50]:
# Standardise the 'Amount' column.

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse the fitted scaler for
# both splits so the test data never influences the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train[['Amount']])

# assign() works on a copy, so X_train / X_test themselves stay unscaled.
X_train_std = X_train.assign(Amount=scaler.transform(X_train[['Amount']]))
X_test_std = X_test.assign(Amount=scaler.transform(X_test[['Amount']]))

In [None]:
# Refit the same seeded decision tree on the scaled features and score it on
# the scaled test split.
clf_std = DecisionTreeClassifier(random_state=42).fit(X_train_std, y_train)
clf_std_pred = clf_std.predict(X_test_std)

print("Scores For Decision Tree Classifier After Scaling The Amount")
print(classification_report(y_test, clf_std_pred))

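We don't get any improvement after standardizing the Amount column. That is expected for a decision tree: it splits on thresholds, so monotonically rescaling a feature does not change which splits are available.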

Let's continue with oversampling.

In [None]:
# Create a new training dataset with SMOTE.

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# Only the training split is resampled; the test split is left as it is.
X_train_smote, y_train_smote = smote.fit_resample(X_train_std, y_train)

# Series.value_counts() takes `normalize` as its first positional argument, so
# the previous value_counts('Class') printed proportions. Print the counts.
print(y_train_smote.value_counts())

print("There are {} training samples and {} test samples.".format(X_train_smote.shape[0], X_test_std.shape[0]))

# Test the performance after using SMOTE.
clf_smote = DecisionTreeClassifier(random_state=42)
clf_smote.fit(X_train_smote, y_train_smote)

clf_smote_pred = clf_smote.predict(X_test_std)

print("Scores For Decision Tree Classifier After Using SMOTE")
print(classification_report(y_test, clf_smote_pred))

Oversampling with SMOTE produced even worse results. That's unexpected. I thought balancing training classes would improve the score.

Let's try undersampling the majority class with NearMiss.

In [None]:
# Create a new training dataset with NearMiss.

from imblearn.under_sampling import NearMiss

nearmiss = NearMiss()

# Only the training split is resampled; the test split is left as it is.
X_train_nearmiss, y_train_nearmiss = nearmiss.fit_resample(X_train_std, y_train)

# Series.value_counts() takes `normalize` as its first positional argument, so
# the previous value_counts('Class') printed proportions. Print the counts.
print(y_train_nearmiss.value_counts())

print("There are {} training samples and {} test samples.".format(X_train_nearmiss.shape[0], X_test_std.shape[0]))

# Test the performance after using NearMiss.
clf_nearmiss = DecisionTreeClassifier(random_state=42)
clf_nearmiss.fit(X_train_nearmiss, y_train_nearmiss)

clf_nearmiss_pred = clf_nearmiss.predict(X_test_std)

# Evaluate the classifier on the test data
print("Scores For Decision Tree Classifier After Using NearMiss")
print(classification_report(y_test, clf_nearmiss_pred))

Undersampling with NearMiss wrecks the scores. Let's try training multiple decision tree classifiers, each on a different random balanced subsample, and taking a majority vote to make a prediction.

In [None]:
# Put the labels back next to the scaled training features so the rows can be
# filtered by class.
training_data = pd.concat([X_train_std, y_train], axis=1)

class_label = training_data['Class']
frauds = training_data.loc[class_label == 1]
non_frauds = training_data.loc[class_label == 0]

# Size of the fraud class.
fraud_count = frauds.shape[0]

def get_random_subsample(frauds, non_frauds, fraud_count, random_state=None):
    """Build a shuffled training set from all frauds plus a random draw of non-frauds.

    Parameters
    ----------
    frauds : pandas.DataFrame
        Rows to keep in full; must contain a 'Class' column.
    non_frauds : pandas.DataFrame
        Rows to draw from; must contain a 'Class' column.
    fraud_count : int
        Number of rows to draw (without replacement) from ``non_frauds``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for both the draw and the shuffle.
        The default ``None`` keeps the previous behaviour of using NumPy's
        global random state; pass an int to make the subsample reproducible.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The features (every column except 'Class') and the integer labels,
        both re-indexed from 0 in the shuffled order.
    """
    non_frauds_sample = non_frauds.sample(n=fraud_count, random_state=random_state)

    # Shuffle so the two classes are interleaved, then renumber the rows once
    # on a fresh frame instead of mutating the two outputs in place.
    balanced_training_data = (
        pd.concat([frauds, non_frauds_sample])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    X_train_balanced = balanced_training_data.drop(columns='Class')
    y_train_balanced = balanced_training_data['Class'].astype('int')

    return X_train_balanced, y_train_balanced

# DataFrame.sample() draws from NumPy's global random state when it is given no
# random_state, so seed that state before drawing the subsamples. Every other
# stochastic step in this notebook is pinned to 42; without this the 100
# subsamples, and therefore the ensemble's scores, changed on every run.
np.random.seed(42)

tree_ensemble = []

for i in range(100):
    # Each tree gets its own balanced subsample: all frauds plus a fresh random
    # draw of equally many non-frauds.
    X_train_balanced, y_train_balanced = get_random_subsample(frauds, non_frauds, fraud_count)

    clf_balanced = DecisionTreeClassifier(random_state=42)
    clf_balanced.fit(X_train_balanced, y_train_balanced)

    tree_ensemble.append(clf_balanced)

def predict(X, ensemble=None):
    """Return, per sample, the mean of the ensemble members' predictions.

    Parameters
    ----------
    X : array-like
        Samples passed unchanged to each member's ``predict``.
    ensemble : sequence of fitted estimators, optional
        Members to average over. Defaults to the notebook-level
        ``tree_ensemble`` so existing ``predict(X)`` calls keep working.

    Returns
    -------
    The element-wise average of the members' predictions. For 0/1 labels this
    is the share of members that voted 1.
    """
    members = tree_ensemble if ensemble is None else ensemble
    # A distinct loop name avoids shadowing the notebook-level `clf_balanced`.
    return sum(member.predict(X) for member in members) / len(members)

# Average the trees' votes, then flag a sample as fraud only when strictly more
# than half of the ensemble voted for it.
y_pred_balanced = (predict(X_test_std) > 0.5).astype(int)

print("Scores For Decision Tree Classifier After Using Balanced Training Data")
print(classification_report(y_test, y_pred_balanced))


Training an ensemble of trees on different subsets of the data also doesn't work.

Let's see how removing outliers affects the performance.

In [None]:
# Remove outliers from the training data.

# First and third quartiles of the scaled 'Amount' column, and the
# interquartile range between them.
Q1, Q3 = (X_train_std['Amount'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# The fences sit 1.5 * IQR below Q1 and 1.5 * IQR above Q3.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the training rows whose amount lies strictly between the fences,
# then pick the matching labels by index. The test split is not filtered.
amount = X_train_std['Amount']
X_train_no_outliers = X_train_std[(amount > lower_bound) & (amount < upper_bound)]
y_train_no_outliers = y_train.loc[X_train_no_outliers.index]

print("There are {} training samples and {} test samples.".format(X_train_no_outliers.shape[0], X_test_std.shape[0]))

# Test the performance after removing the outliers.
clf_no_outliers = DecisionTreeClassifier(random_state=42).fit(X_train_no_outliers, y_train_no_outliers)
clf_no_outliers_pred = clf_no_outliers.predict(X_test_std)

print("Scores For Decision Tree Classifier After Removing Outliers")
print(classification_report(y_test, clf_no_outliers_pred))

Removing the outliers from the Amount column slightly improves the model. I wonder if removing the outliers from other columns would help.

In [None]:
def remove_outliers_from_column(X, y, column):
    """Drop the rows of ``X`` (and the matching entries of ``y``) whose value
    in ``column`` is not strictly inside the 1.5 * IQR fences.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing ``column``.
    y : pandas.Series
        Labels; looked up with ``.loc`` using the index labels kept from ``X``.
    column : str
        Name of the column whose quartiles define the fences.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The surviving rows of ``X`` and the labels selected by their index.
    """
    values = X[column]

    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    # Both comparisons are strict: a value exactly on a fence is dropped.
    inside_fences = (values > low_fence) & (values < high_fence)
    kept_rows = X[inside_fences]

    return kept_rows, y.loc[kept_rows.index]

def remove_outliers_from_columns(X, y, columns):
    """Apply ``remove_outliers_from_column`` once per entry of ``columns``, in order.

    Each pass works on the rows left over from the previous pass, so the
    quartiles of later columns are computed on already-filtered data. With an
    empty ``columns`` the inputs are returned unchanged.

    Returns the filtered ``(X, y)`` pair.
    """
    filtered_X, filtered_y = X, y

    for name in columns:
        filtered_X, filtered_y = remove_outliers_from_column(filtered_X, filtered_y, name)

    return filtered_X, filtered_y

# Apply the IQR filter through the helper. Only 'Amount' is filtered here; to
# try other columns, extend the list (for example ['Amount', 'V17']).
X_train_no_outliers, y_train_no_outliers = remove_outliers_from_columns(
    X_train_std, y_train, ['Amount']
)

print("There are {} training samples and {} test samples.".format(X_train_no_outliers.shape[0], X_test_std.shape[0]))

# Test the performance after removing the outliers from the 'Amount' column.
clf_no_outliers = DecisionTreeClassifier(random_state=42).fit(X_train_no_outliers, y_train_no_outliers)
clf_no_outliers_pred = clf_no_outliers.predict(X_test_std)

print("Scores For Decision Tree Classifier After Removing Outliers From The 'Amount' Column")
print(classification_report(y_test, clf_no_outliers_pred))

Removing outliers in the Amount column looks promising; other columns, not so much. Why? When does it help to remove outliers from a column? Anyway, let's train a random forest on this new data without outliers in the Amount column.

In [None]:
# Test the performance after removing the outliers from the 'Amount' on a random forest.

from sklearn.ensemble import RandomForestClassifier

# Same filtered training data as the decision tree above; the test split is
# the scaled, unfiltered one.
clf_no_outliers_forest = RandomForestClassifier(random_state=42).fit(
    X_train_no_outliers, y_train_no_outliers
)
clf_no_outliers_forest_pred = clf_no_outliers_forest.predict(X_test_std)

print("Scores For Random Forest Classifier After Removing Outliers From The 'Amount' Column")
print(classification_report(y_test, clf_no_outliers_forest_pred))

In [None]:
# Compare with a random forest trained on the original (unscaled, unfiltered) split.
# NOTE(review): this rebinds `clf`, which earlier held the baseline decision tree.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
clf_pred = clf.predict(X_test)

print("Scores For Random Forest Classifier With Original Dataset")
print(classification_report(y_test, clf_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrices, in order: forest on the original data, then forest on the
# outlier-filtered data.
for forest_predictions in (clf_pred, clf_no_outliers_forest_pred):
    print(confusion_matrix(y_test, forest_predictions))

Removing the outliers doesn't improve the random forest classifier all that much.

Results from the notebook:

Scaling the data, oversampling, and undersampling don't really help model performance. Only removing outliers helps the decision tree classifier a little.