<a href="https://colab.research.google.com/github/mahditeymori/classification_with-search-Grid/blob/main/shap_and_lime_explainer_fraud_creditcart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'creditcardfraud:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F310%2F23498%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240821%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240821T122601Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D46c3e4e76aaa26ee134603094a16b2059675c055813d44119bfaf354b86813a4a6de6630a4563d4477e630dbb329c9752e47c60e207d3043169ffb3c21c05db93966a00fb7530d9d1e48ed501147e2d67a56bb4a866c6b8818a6ccd12fe0ae1d2ffee247897877369f85be1cb7dbfecde5073caffe93e3b621bf3409d00c9e0ddf7445e00ddd58f1a5cbcbbb8b09a87909f2cc940c8381e31aa4b5a39f829a01acf0520d02d6cb24e00a91b0dd5f76bd04a32ca04bfee37c80e8d4b7c24cf3056ff7d4f2feab0483eda6e1debae06ec2f4966783a9bbdde4305572d0435b87d8a2a05e7aaf07b4249dceb62a8468ccd3cc8d4b5e6d39c2fb4ea522574b955a2f'

# Directories that reproduce Kaggle's filesystem layout inside this runtime.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'  # NOTE(review): not referenced in the visible part of this notebook

# Start from a clean input directory: try to unmount it (shell errors are
# discarded), delete whatever is left, then (re)create both directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the input and working directories as ../input and ../working.
# A link that already exists (e.g. from an earlier run) is left untouched.
for target_dir, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(target_dir, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING into a temporary file and unpack it into
# KAGGLE_INPUT_PATH/<directory>. A source that fails to download is reported
# and skipped so the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be missing; in that case the byte counter is
            # still printed but the progress bar stays empty instead of
            # crashing on int(None).
            expected_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which ends the iteration.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would replace the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before OSError because it is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries for data processing, modeling, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.tree import plot_tree
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
# Load the credit-card fraud CSV from the Kaggle input directory into a DataFrame.
data=pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
data.info()

In [None]:
# Show descriptive statistics for the dataset's columns.
data.describe()

In [None]:
# Pairwise correlations between all non-object columns.
numeric_columns = data.select_dtypes(exclude=['object'])
corr = numeric_columns.corr()

# Render the matrix as an annotated heatmap on a large canvas; the colour bar
# is switched off because every cell already carries its value.
plt.subplots(nrows=1, ncols=1, figsize=(25, 25))
heatmap_options = {'cmap': 'Oranges', 'annot': corr, 'cbar': None}
sns.heatmap(corr, **heatmap_options)
plt.title('Correlation between features and target')
plt.show()

In [None]:
# Count how many rows belong to each value of the 'Class' target column.
data.Class.value_counts()

In [None]:
# Boxplot of the 'V1' feature, drawn separately for each value of 'Class'
sns.boxplot(x='Class', y='V1', data=data)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except the target.
X = data.drop(columns='Class')

# Target: the 'Class' column encoded as integer labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified; if the classes are as imbalanced
# as the class weights used later suggest, consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Hyper-parameter grids explored by the grid search, one per algorithm.
logreg_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear'],
    'random_state': [42],
}
tree_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
}
xgb_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
}

# Candidate estimators paired with their grids, keyed by display name.
# Currently disabled candidates:
#   SVM (SVC): C in [0.1, 1, 10, 100], kernel in ['linear', 'rbf']
#   KNN (KNeighborsClassifier): n_neighbors in [3, 5, 7, 9],
#                               weights in ['uniform', 'distance']
models = {
    'Logistic Regression': {'model': LogisticRegression(), 'params': logreg_grid},
    'Decision Tree': {'model': DecisionTreeClassifier(), 'params': tree_grid},
    'xgboost': {'model': XGBClassifier(), 'params': xgb_grid},
}

In [None]:
# Tune every candidate with a 6-fold, accuracy-scored grid search and keep the
# winning estimator of each search under the candidate's display name.
best_models = {}
for model_name, model_info in models.items():
    grid_search = GridSearchCV(
        estimator=model_info['model'],
        param_grid=model_info['params'],
        scoring='accuracy',
        cv=6,
    )
    # fit() returns the search object itself, so the best estimator can be
    # read straight off the call.
    best_models[model_name] = grid_search.fit(X_train, y_train).best_estimator_

In [None]:
# Show which estimator (with its chosen hyper-parameters) won each search.
for model_name in best_models:
    model = best_models[model_name]
    print(f"Best {model_name}: {model}")

In [None]:
# Score each tuned model on the held-out test set and print, in order: its
# name, confusion matrix, per-class report and overall accuracy.
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(
        f"Model: {model_name}",
        confusion_matrix(y_test, y_pred),
        classification_report(y_test, y_pred),
        f"Accuracy Score: {accuracy_score(y_test, y_pred) * 100:.2f}%",
        sep="\n",
    )

In [None]:
# Test-set positions that the tuned XGBoost model misclassifies.
# The predictions are recomputed here on purpose: the `y_pred` left behind by
# the evaluation loop belongs to XGBoost only while 'xgboost' happens to be the
# last entry of `best_models`, and the LIME/SHAP cells explain XGBoost.
y_pred = best_models['xgboost'].predict(X_test)
# Vectorised comparison; tolist() yields plain Python ints for later indexing.
incorrect_indices = np.flatnonzero(np.asarray(y_test) != np.asarray(y_pred)).tolist()

In [None]:
# Display the test-set positions of the misclassified samples.
incorrect_indices

In [None]:
import lime.lime_tabular

# LIME explainer fitted on the scaled training matrix. The feature names are
# every dataset column except the last one.
# NOTE(review): this assumes the target column is the last column of `data`.
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=data.columns[:-1].tolist(),
    class_names=[0, 1],
)

In [None]:
# True label of the misclassified sample explained by LIME in the next cell.
# The index must match the one passed to explain_instance (position 1).
y_test[incorrect_indices[1]]

In [None]:
# Explain the XGBoost prediction for one misclassified test sample (position 1
# of incorrect_indices) and render the explanation inline.
exp = explainer.explain_instance(
    data_row=X_test[incorrect_indices[1]],
    predict_fn=best_models['xgboost'].predict_proba,
)
exp.show_in_notebook()

In [None]:
import shap

# Tree-based SHAP explainer for the tuned XGBoost model.
# NOTE(review): this rebinds `explainer`, replacing the LIME explainer of the
# same name; re-run the LIME setup cell before reusing LIME.
explainer = shap.TreeExplainer(
    model=best_models['xgboost'],
    feature_names=data.columns[:-1],
)

# SHAP values for every row of the scaled test matrix.
shap_values = explainer(X_test)

In [None]:
# True label of the first misclassified test sample.
y_test[incorrect_indices[0]]

In [None]:
from shap import waterfall_plot

# Waterfall plot of the SHAP values for the first misclassified test sample.
first_error_explanation = shap_values[incorrect_indices[0]]
waterfall_plot(first_error_explanation)

In [None]:
# Summary plot of the SHAP values computed for the test set.
shap.summary_plot(shap_values, X_test)

In [None]:
# NOTE(review): this import rebinds `plot_tree`, hiding the
# sklearn.tree.plot_tree imported at the top of the notebook.
from xgboost import plot_tree

# Draw the tree at index 4 of the tuned XGBoost model on a very large canvas
# and save the current figure as a PDF.
canvas_size = (150, 512)
fig, ax = plt.subplots(figsize=canvas_size)
xgb_model = best_models['xgboost']
plot_tree(xgb_model, num_trees=4, ax=ax)
plt.savefig("temp.pdf")

In [None]:
# Display the full dataset.
data

In [None]:
# Copy of the dataset without the 'Time' column.
data_reduce = data.drop(columns=['Time'])

In [None]:
# Re-train the tuned XGBoost configuration on the feature set without 'Time'
# and report its performance on the held-out test set.
from sklearn.base import clone

# Feature matrix without the target column.
X_reduce = data_reduce.drop('Class', axis=1)

# Same test fraction and seed as the full-feature split.
X_train_reduce, X_test_reduce, y_train, y_test = train_test_split(X_reduce, y, test_size=0.2, random_state=42)

# Scale with statistics learned from the reduced training split only.
scaler = StandardScaler()
X_train_reduce = scaler.fit_transform(X_train_reduce)
X_test_reduce = scaler.transform(X_test_reduce)

# Fit an unfitted copy with the tuned hyper-parameters. Calling fit() on
# best_models['xgboost'] itself would overwrite the stored full-feature model
# with one that expects one feature fewer.
reduced_model_name = 'xgboost'
model = clone(best_models[reduced_model_name])
model.fit(X_train_reduce, y_train)
y_pred = model.predict(X_test_reduce)

# The name is stated explicitly instead of read from a loop variable left over
# from an earlier cell.
print(f"Model: {reduced_model_name}")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy Score: {accuracy_score(y_test, y_pred) * 100:.2f}%")

In [None]:
from tensorflow.keras.optimizers import Adam

# Small fully connected binary classifier trained on the reduced feature set:
# two ReLU hidden layers (32 and 16 units), dropout, one sigmoid output unit.
model = Sequential()
# First hidden layer; input_dim is the number of feature columns.
model.add(Dense(32, activation='relu', input_dim=(X_train_reduce.shape[1])))
model.add(Dense(16,activation='relu'))
# Dropout layer for regularization
model.add(Dropout(0.3))
# Single sigmoid unit: the output is the predicted probability of class 1.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
# Early stopping is currently disabled:
#early_stopping = EarlyStopping(patience=3)
# Errors on class 1 are weighted 30 times as heavily as errors on class 0.
class_weights = {0: 1.0, 1: 30.0}
# Train, holding back 20% of the training rows for validation.
history = model.fit(X_train_reduce, y_train, validation_split=0.2, epochs=100, batch_size=32
                    ,class_weight=class_weights)
# Evaluate on the held-out test set.
loss, accuracy=model.evaluate(X_test_reduce, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

y_pred_prob = model.predict(X_test_reduce)

# Convert the probabilities to class predictions. The network has a single
# output column, so argmax over axis 1 would always return 0; threshold the
# probability at 0.5 instead and flatten to a 1-D label vector.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
