In [1]:
import pandas as pd

# Replace 'path_to_your_csv.csv' with the actual path to your CSV file
df = pd.read_csv('data.csv')

In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['FLAG'] = le.fit_transform(df['FLAG'])  # Converts 'Fraud' to 1 and 'Non-Fraud' to 0

In [3]:
from sklearn.preprocessing import StandardScaler

features = df.drop('FLAG', axis=1)  # Exclude the target variable
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [4]:
X = features_scaled
y = df['FLAG'].values

In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority', k_neighbors=5)
X_res, y_res = smote.fit_resample(X, y)

In [7]:
from sklearn.feature_selection import mutual_info_classif

info_gain = mutual_info_classif(X_res, y_res)
top_features_indices = info_gain.argsort()[-10:][::-1]  # Get indices of top 10 features
X_top_features = X_res[:, top_features_indices]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_top_features, y_res, test_size=0.2, random_state=42)

In [12]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

In [13]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2991
           1       0.98      0.98      0.98      3129

    accuracy                           0.98      6120
   macro avg       0.98      0.98      0.98      6120
weighted avg       0.98      0.98      0.98      6120

Accuracy: 0.9784313725490196


In [None]:
import shap
# SHAP Values - Explain the model's predictions
explainer = shap.KernelExplainer(mlp.predict, X_train)
shap_values = explainer.shap_values(X_test, nsamples=10)  # nsamples is a parameter you can tweak

Using 24478 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/6120 [00:00<?, ?it/s]

Regressors in active set degenerate. Dropping a regressor, after 5 iterations, i.e. alpha=7.862e-03, with an active set of 5 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 6 iterations, i.e. alpha=3.931e-03, with an active set of 6 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 6 iterations, i.e. alpha=3.931e-03, with an active set of 6 regressors, and the smallest cholesky pivot element being 5.960e-08. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 3 iterations, i.e. alpha=4.790e-04, with an active set of 3 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a 

In [None]:
# Plot SHAP values
shap.summary_plot(shap_values, X_test, feature_names=feature_names)  # Replace feature_names with your actual features

# Visualize the first prediction's explanation
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0], X_test_sample[0])