<a href="https://colab.research.google.com/github/matidesalegn/Improved-detection-of-fraud-cases-in-e-commerce-and-bank-transactions/blob/task-3/notebooks/model%20explainability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the processed datasets

In [10]:
import os

import joblib
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Read the two pre-processed CSV exports (paths are relative to the notebook's
# working directory) into DataFrames.
fraud_csv_path = 'processed/processed_fraud_data_with_country.csv'
credit_csv_path = 'processed/processed_credit_card_data.csv'

# low_memory=False is passed only for the fraud file, as in the original cell.
fraud_data = pd.read_csv(fraud_csv_path, low_memory=False)
credit_card_data = pd.read_csv(credit_csv_path)


## Convert or Drop Non-numeric Columns and Split the Data into Features and Target

In [11]:
def _to_epoch_seconds(column):
    """Return ``column`` as float seconds since the Unix epoch.

    A column that is already numeric is returned unchanged, so re-running this
    cell is a no-op instead of re-parsing epoch seconds as if they were
    nanosecond timestamps. Otherwise the values are parsed with
    ``pd.to_datetime`` and divided by a one-second ``Timedelta``, which does
    not depend on the datetime resolution pandas picked or on the platform's
    ``int`` width (unlike ``.astype(int) / 10**9``).

    Assumes the timestamps are timezone-naive — TODO confirm against the CSV.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return (pd.to_datetime(column) - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)


# Convert date-time columns to numeric (seconds since epoch).
# fraud_data is still updated in place so the frame keeps the same columns as before.
for time_column in ('signup_time', 'purchase_time'):
    fraud_data[time_column] = _to_epoch_seconds(fraud_data[time_column])

# Separate the target ('class') from the candidate features
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

# Keep only numeric feature columns; any remaining non-numeric columns are dropped
X_fraud = X_fraud.select_dtypes(include=['number'])

# Same treatment for the credit card data (target column is 'Class')
X_credit = credit_card_data.drop(columns=['Class'])
y_credit = credit_card_data['Class']

# Keep only numeric feature columns for the credit card data
X_credit = X_credit.select_dtypes(include=['number'])

# Split each dataset into 80% train / 20% test with a fixed seed.
# NOTE(review): the split is not stratified; if the targets are imbalanced,
# consider passing stratify=y_fraud / stratify=y_credit.
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42
)
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42
)


## Train and Save the Models

In [None]:
# Locations where the fitted models are persisted (unchanged from before).
FRAUD_MODEL_PATH = 'models/fraud_data/random_forest_model.joblib'
CREDIT_MODEL_PATH = 'models/credit_card_data/random_forest_model.joblib'

# joblib.dump does not create missing parent directories, so make sure they
# exist first; otherwise saving fails on a fresh checkout.
for model_path in (FRAUD_MODEL_PATH, CREDIT_MODEL_PATH):
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Train the model for fraud detection (fixed seed for reproducibility)
fraud_model = RandomForestClassifier(random_state=42)
fraud_model.fit(X_train_fraud, y_train_fraud)

# Save the fraud model
joblib.dump(fraud_model, FRAUD_MODEL_PATH)

# Train the model on the credit card data (target column 'Class')
credit_model = RandomForestClassifier(random_state=42)
credit_model.fit(X_train_credit, y_train_credit)

# Save the credit card model
joblib.dump(credit_model, CREDIT_MODEL_PATH)


## Model Explainability with SHAP and LIME

In [None]:
def _per_class_shap(values):
    """Normalise TreeExplainer output to a list with one 2-D array per class.

    Older SHAP releases return a list of (n_samples, n_features) arrays for a
    classifier, while newer ones return a single array shaped
    (n_samples, n_features, n_classes). Indexing ``values[1]`` only means
    "class 1" in the list form, so the array form is split along its last axis.

    Raises:
        ValueError: if ``values`` is neither a list nor a 3-D array.
    """
    if isinstance(values, list):
        return values
    if getattr(values, 'ndim', None) == 3:
        return [values[:, :, class_idx] for class_idx in range(values.shape[2])]
    raise ValueError('Expected per-class SHAP values (a list or a 3-D array).')


def _explain_with_shap(model, X_test):
    """Draw the SHAP summary, force and dependence plots for ``model``.

    Args:
        model: fitted tree-based classifier with (at least) two classes.
        X_test: numeric DataFrame of the rows to explain.

    Returns:
        (explainer, per_class_values): the ``shap.TreeExplainer`` and the SHAP
        values as a list indexed by class.
    """
    # NOTE(review): SHAP values are computed for every row of X_test, which can
    # be slow for large forests/test sets — consider explaining a sample.
    explainer = shap.TreeExplainer(model)
    per_class_values = _per_class_shap(explainer.shap_values(X_test))

    # Summary plot across all classes (shown immediately; show=True is the default)
    shap.summary_plot(per_class_values, X_test)

    # Force plot for the first test row, class 1. matplotlib=True renders the
    # figure right away; the default JS object is only displayed when it is the
    # last expression of a cell (and needs shap.initjs()), so it was never shown.
    shap.force_plot(
        explainer.expected_value[1],
        per_class_values[1][0],
        X_test.iloc[0],
        matplotlib=True,
    )

    # Dependence plot for the most important feature (by mean |SHAP|) of class 1.
    # 'rank(0)' replaces the placeholder column name 'feature_name', which is
    # not a real feature and made the plot raise.
    shap.dependence_plot('rank(0)', per_class_values[1], X_test)

    return explainer, per_class_values


def _explain_with_lime(model, X_train, X_test, class_names, row=0, num_features=10):
    """Explain one test row with LIME and display the explanation.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        X_train: numeric training DataFrame used to fit LIME's statistics.
        X_test: numeric test DataFrame containing the row to explain.
        class_names: display names for the classes, in label order.
        row: positional index of the test row to explain.
        num_features: number of features to include in the explanation.

    Returns:
        (explainer, explanation): the ``LimeTabularExplainer`` and the
        explanation object for the chosen row.
    """
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,
        feature_names=X_train.columns.tolist(),
        class_names=class_names,
        discretize_continuous=True,
        random_state=42,  # LIME samples perturbations; seed it so reruns match
    )

    def predict_proba(rows):
        # LIME passes bare arrays; restore the column names the model was
        # fitted with so scikit-learn does not warn about missing feature names.
        return model.predict_proba(pd.DataFrame(rows, columns=X_train.columns))

    explanation = explainer.explain_instance(
        X_test.iloc[row].values, predict_proba, num_features=num_features
    )

    # Show the explanation in the notebook, then as a feature-importance figure
    explanation.show_in_notebook(show_all=False)
    explanation.as_pyplot_figure()
    plt.show()

    return explainer, explanation


# Load both trained models from disk so this cell also runs on a fresh kernel
# without re-training (previously only the fraud model was reloaded).
fraud_model = joblib.load('models/fraud_data/random_forest_model.joblib')
credit_model = joblib.load('models/credit_card_data/random_forest_model.joblib')

# Ensure test data is only numeric for SHAP explainability
X_test_fraud_numeric = X_test_fraud.select_dtypes(include=['number'])
X_test_credit_numeric = X_test_credit.select_dtypes(include=['number'])

# SHAP explainability for the fraud model
explainer_shap_fraud, shap_values_fraud = _explain_with_shap(fraud_model, X_test_fraud_numeric)

# SHAP explainability for the credit card model
explainer_shap_credit, shap_values_credit = _explain_with_shap(credit_model, X_test_credit_numeric)

# Index of the test instance explained with LIME
i = 0

# LIME explainability for the fraud model
explainer_lime_fraud, exp_fraud = _explain_with_lime(
    fraud_model, X_train_fraud, X_test_fraud_numeric, ['Not Fraud', 'Fraud'], row=i
)

# LIME explainability for the credit card model
explainer_lime_credit, exp_credit = _explain_with_lime(
    credit_model, X_train_credit, X_test_credit_numeric, ['Class 0', 'Class 1'], row=i
)