In [18]:
# Shapley Feature Explanations 

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset and gather the four measurements plus the integer
# class label (column "target") in a single DataFrame. The class names that
# correspond to the labels are available as iris["target_names"].
iris = load_iris()
df = pd.DataFrame(iris["data"], columns=iris["feature_names"]).assign(
    target=iris["target"]
)

In [19]:
def log_reg_accuracy(x, y, test_size=0.5, random_state=3):
    """Fit a logistic regression on a train split and return its test accuracy.

    Parameters
    ----------
    x : array-like
        Feature matrix handed to ``train_test_split``.
    y : array-like
        Labels aligned row-by-row with ``x``.
    test_size : float, default 0.5
        Forwarded to ``train_test_split``; the default keeps the 50/50 split
        this notebook has always used.
    random_state : int, default 3
        Seed for the split. The reported accuracies depend on it, so it is
        exposed to make that sensitivity easy to explore.

    Returns
    -------
    float
        ``accuracy_score`` of the fitted classifier on the held-out split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    return accuracy_score(y_test, classifier.predict(x_test))
# random_state can affect the results

In [20]:
# One-vs-rest, leave-one-feature-out experiment:
# for each iris class, code that class as 0 and every other class as 1, then
# measure the test accuracy obtained with each single feature removed in turn.
target_class = [0, 1, 2]
all_features = iris["feature_names"]
flower_features = []  # one row per class, one accuracy per removed feature

for i in target_class:
    # Binary label: 0 for the class of interest, 1 for the rest.
    df["recode"] = (df["target"] != i).astype("int64")
    y = df["recode"].values
    features_acc = []
    for feature in all_features:
        # Train on every feature except the one being evaluated.
        kept_features = [name for name in all_features if name != feature]
        x = df[kept_features].values
        features_acc.append(log_reg_accuracy(x, y))
    flower_features.append(features_acc)

# Rows follow target_class, columns follow all_features.
selected_features_accuracy = np.array(flower_features)

# The backslash is escaped so the column label keeps its literal "\" without
# relying on the invalid "\F" escape sequence.
table1 = pd.DataFrame({
    "Features\\Flowers": iris["feature_names"],
    **{iris["target_names"][k]: selected_features_accuracy[k] for k in target_class},
})
table1.style.set_caption("Removal Accuracy")

Unnamed: 0,Features\Flowers,setosa,versicolor,virginica
0,sepal length (cm),1.0,0.786667,0.946667
1,sepal width (cm),1.0,0.693333,0.96
2,petal length (cm),1.0,0.773333,0.96
3,petal width (cm),1.0,0.76,0.96


In [21]:
# Baseline: one-vs-rest test accuracy for each class using every feature.
# The feature matrix is identical for all classes, so it is built once.
x_feature = df[all_features].values
flower_all_features = []
for i in target_class:
    # Binary label: 0 for the class of interest, 1 for the rest.
    df["recode"] = (df["target"] != i).astype("int64")
    y_label = df["recode"].values
    flower_all_features.append(log_reg_accuracy(x_feature, y_label))

# One accuracy per entry of target_class, in the same order.
all_features_accuracy = np.array(flower_all_features)
all_features_accuracy

array([1.        , 0.77333333, 0.94666667])

In [22]:
# Marginal contribution of a feature = accuracy with all features minus the
# accuracy with that feature removed. A positive value means removing the
# feature lowered the accuracy; a negative value means removing it raised it.
# Broadcasting subtracts each class's baseline from its row of removal
# accuracies in one step.
marginal_contributions = all_features_accuracy[:, None] - selected_features_accuracy
(
    setosa_contribution_accuracy,
    versicolor_contribution_accuracy,
    virginica_contribution_accuracy,
) = marginal_contributions

# The backslash is escaped so the column label keeps its literal "\" without
# relying on the invalid "\F" escape sequence.
table2 = pd.DataFrame({
    "Features\\Flowers": iris["feature_names"],
    iris["target_names"][0]: setosa_contribution_accuracy,
    iris["target_names"][1]: versicolor_contribution_accuracy,
    iris["target_names"][2]: virginica_contribution_accuracy,
})
table2.style.set_caption("Marginal Contributions")

Unnamed: 0,Features\Flowers,setosa,versicolor,virginica
0,sepal length (cm),0.0,-0.013333,0.0
1,sepal width (cm),0.0,0.08,-0.013333
2,petal length (cm),0.0,0.0,-0.013333
3,petal width (cm),0.0,0.013333,-0.013333


For setosa, removing any single feature gives the same accuracy as using all of them, so no individual feature stands out as more important for differentiating setosa from the other flowers.\
For versicolor, sepal width is the most important feature for distinguishing versicolor from the other flowers.\
For virginica, removing any single feature other than sepal length slightly increases the accuracy.