<a href="https://colab.research.google.com/github/matidesalegn/Improved-detection-of-fraud-cases-in-e-commerce-and-bank-transactions/blob/task-3/models/model%20explainability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install shap
!pip install lime
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import shap
import lime
import lime.lime_tabular



Let's outline a step-by-step approach to explainability using both SHAP and LIME:

Step 1: Load the Trained Model and Data
Assume you have already trained your fraud detection models (Decision Tree, Random Forest, MLP, etc.) and have a dataset ready for explanations.

In [6]:
# Load the data with specified dtypes to avoid warnings and inspect the data
data = pd.read_csv('processed_fraud_data_with_country.csv', low_memory=False)

# Print the first few rows to inspect the DataFrame structure
print(data.head())

# Print the column names to verify the presence of the 'fraud' column
print(data.columns)

# If the 'fraud' column is present, proceed with the next steps
if 'fraud' in data.columns:
    X = data.drop(columns=['fraud'])
    y = data['fraud']
else:
    print("The 'fraud' column is not found in the dataset.")


   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11       -0.160204   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54       -1.142592   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45       -1.197169   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50        0.385567   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53        0.112681   

       device_id       age    ip_address  class  ip_address_int  \
0  QVPSPJUOCKZAR  0.679914  7.327584e+08      0       732758368   
1  EOGFQPIZPYXFZ  2.304476  3.503114e+08      0       350311387   
2  YSSKYOSJHPPLJ  2.304476  2.621474e+09      1      2621473820   
3  ATGTXKYKUDUQN  0.911994  3.840542e+09      0      3840542443   
4  NAUITBZFJKHWW  1.376155  4.155831e+08      0       415583117   

   transaction_count  ...  country_Unknown  country_Uruguay  \
0                0.0  ...            False            False   
1                0.0  ...            F

In [11]:
class ModelExplainability:
    def __init__(self, model_path, data_path):
        self.model_path = model_path
        self.data_path = data_path

        # Load and prepare data
        self.data = pd.read_csv(data_path, low_memory=False)
        if 'class' not in self.data.columns:
            raise KeyError("The 'class' column is not found in the dataset.")

        # Convert datetime columns to numerical features
        self.data['signup_time'] = pd.to_datetime(self.data['signup_time'])
        self.data['purchase_time'] = pd.to_datetime(self.data['purchase_time'])
        self.data['signup_time'] = (self.data['signup_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
        self.data['purchase_time'] = (self.data['purchase_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

        # Separate features into numeric and categorical
        self.X = self.data.drop(columns=['class'])
        self.y = self.data['class']
        self.numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        self.categorical_features = self.X.select_dtypes(include=['object']).columns.tolist()

        # Define preprocessing for numeric and categorical features
        numeric_transformer = Pipeline(steps=[
            ('imputer', 'passthrough')  # No transformation, but could add scalers or imputers
        ])

        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Bundle preprocessing for numeric and categorical features
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

        # Define the model pipeline
        self.model_pipeline = Pipeline(steps=[
            ('preprocessor', self.preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
        ])

        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=42)

        # Load or train model
        try:
            self.model = joblib.load(model_path)
        except FileNotFoundError:
            self.model_pipeline.fit(self.X_train, self.y_train)
            joblib.dump(self.model_pipeline, model_path)
            self.model = self.model_pipeline

        # Create explainers
        self.explainer_shap = shap.TreeExplainer(self.model.named_steps['classifier'])
        self.shap_values = self.explainer_shap.shap_values(self.model.named_steps['preprocessor'].transform(self.X_test))
        self.explainer_lime = lime.lime_tabular.LimeTabularExplainer(self.X_test.values, feature_names=self.X_test.columns, class_names=['Non-Fraud', 'Fraud'], discretize_continuous=True)

    def shap_summary_plot(self):
        shap.summary_plot(self.shap_values, self.X_test)

    def shap_force_plot(self, index):
        shap.force_plot(self.explainer_shap.expected_value[1], self.shap_values[1][index], self.X_test.iloc[index], matplotlib=True)

    def shap_dependence_plot(self, feature):
        shap.dependence_plot(feature, self.shap_values[1], self.X_test)

    def lime_explanation(self, index):
        exp = self.explainer_lime.explain_instance(self.X_test.iloc[index].values, self.model.predict_proba, num_features=10)
        exp.show_in_notebook(show_table=True)

if __name__ == "__main__":
    model_path = 'random_forest_model.pkl'
    data_path = 'processed_fraud_data_with_country.csv'
    explainability = ModelExplainability(model_path, data_path)

    # Generate plots
    explainability.shap_summary_plot()
    explainability.shap_force_plot(0)
    explainability.shap_dependence_plot('purchase_value')  # Example feature
    explainability.lime_explanation(0)

UFuncTypeError: Cannot cast ufunc 'isnan' input from dtype('O') to dtype('bool') with casting rule 'same_kind'