In [1]:
#IMPORT LIBRARIES 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
#LOADING DATA
data = pd.read_csv('credit_card_transactions.csv')
data = data.dropna()

x = data.drop(columns=['is_fraud'])
y = data['is_fraud']

# Set a custom sampling strategy
# Example: Maintain 2:1 ratio of majority to minority samples
# For instance, if minority class has 100 samples, we want majority class to have 200 samples
majority_count = y.value_counts().max()  # Total samples in the majority class
minority_count = y.value_counts().min()  # Total samples in the minority class

# Define sampling_strategy based on the desired ratio
desired_majority_count = 10 * minority_count  # Desired majority samples (2:1 ratio)
sampling_strategy = {0: desired_majority_count, 1: minority_count}  # Map class to desired counts

# Apply RandomUnderSampler with the custom strategy
rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
x, y = rus.fit_resample(x, y)

In [3]:
#FEATURE ENGINEERING AND PREPROCESSING
categorical_features = ['merchant', 'category', 'gender', 'city', 'state', 'job']
numeric_features = ['amt', 'city_pop']

# Preprocessor: OneHotEncoding categorical variables and scaling numeric ones
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features), 
        ('num', StandardScaler(), numeric_features)
    ])

In [4]:
# Preprocessing pipeline (without SMOTE)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

#TRAIN AND TEST SLIP
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

# Preprocess the data
x_train_preprocessed = pipeline.fit_transform(x_train)
x_test_preprocessed = pipeline.transform(x_test)

# Apply SMOTE separately after preprocessing
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train_preprocessed, y_train)

In [5]:
# DECISION TREE CLASSIFIER
tree = DecisionTreeClassifier(random_state=42)

# Train the model on the resampled data
tree.fit(x_train_resampled, y_train_resampled)

# Make predictions
y_pred = tree.predict(x_test_preprocessed)

In [6]:
acc_score = accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f}".format(acc_score))
rec_score = recall_score(y_test, y_pred)
print("Recall: {:.3f}".format(rec_score))
prec_score = precision_score(y_test, y_pred)
print("Precision Score: {:.3f}".format(prec_score))
F1_score = f1_score(y_test, y_pred)
print("F1-Score: {:.3f}".format(F1_score))
kappa = cohen_kappa_score(y_test, y_pred)
print("Kappa Score: {:.3f}".format(kappa))

Accuracy: 0.978
Recall: 0.893
Precision Score: 0.870
F1-Score: 0.881
Kappa Score: 0.869
