Solving the mushroom classification problem from https://github.com/pbiecek/InterpretableMachineLearning2020/issues/5

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from ceteris_paribus import explainer, profiles
from ceteris_paribus.plots import plots
%matplotlib inline

In [2]:
# Load the mushroom dataset and strip apostrophes from every value.
raw = pd.read_csv("dataset_24_mushroom.csv")
data = raw.apply(lambda column: column.str.replace("'", ""))
data.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises%3F,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


The target class is in the column "class": "p" means poisonous and "e" means edible. Let's preprocess the data, i.e. encode the classes.

In [3]:
# Split the frame into the feature columns and the target column.
X = data.drop(columns=["class"])

# Encode the target labels as integers; class_enc maps them back to names.
class_enc = preprocessing.LabelEncoder()
class_enc.fit(data["class"])
y = class_enc.transform(data["class"])

# Integer-encode every feature with its own LabelEncoder, keeping the encoder
# per column name and the category names per column position.
encoders = {}
categorical_names = {}
for position, col in enumerate(X.columns):
    feature_enc = preprocessing.LabelEncoder().fit(X[col])
    encoders[col] = feature_enc
    categorical_names[position] = feature_enc.classes_
    X[col] = feature_enc.transform(X[col])

print("Number of samples: %d" % len(y))

Number of samples: 8124


Let's train and evaluate a Random Forest model, using 20% of the data as the test set.

In [4]:
# One-hot encode the integer-coded categories. The encoder is fitted on the
# full X before splitting, so categories that occur only in the test split are
# still known to it when the test rows are transformed.
encoder = preprocessing.OneHotEncoder().fit(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3234)
X_train_enc = encoder.transform(X_train)

# Seed the forest: without random_state the fitted trees (and every
# explanation derived from them) change on each run of the notebook.
classifier = RandomForestClassifier(random_state=3234).fit(X_train_enc, y_train)

# Accuracy = fraction of test samples whose predicted label matches the true one.
pred = classifier.predict(encoder.transform(X_test))
acc = np.mean(pred == y_test)
print("Accuracy: %f" % acc)

Accuracy: 1.000000


We have perfect accuracy. Now let's check the Ceteris Paribus explanations, focusing on the 4 features that were most important in the LIME explanations.

In [5]:
def predict(data):
    """Return the Random Forest probability of class index 1 for each row of `data`.

    `data` is one-hot encoded with the fitted `encoder` and passed to
    `classifier.predict_proba`; the second column of the result is returned.
    """
    probabilities = classifier.predict_proba(encoder.transform(data))
    return probabilities[:, 1]
# Build the Random Forest explainer on the training split, with `predict` as
# the prediction function.
expl = explainer.explain(
    classifier,
    data=X_train,
    y=y_train,
    label='Random Forest',
    predict_function=predict,
)

# Case 1: profiles for observation 32, restricted to the four plotted features.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]
print("Case 1 (result: %s)" % class_enc.classes_[y[32]])
p1 = profiles.individual_variable_profile(expl, X.iloc[32], y[32], variables=case_variables)
plots.plot_notebook(p1, selected_variables=case_variables, print_observations=False)

Case 1 (result: e)


In [6]:
# Case 2: Random Forest profiles for observation 123 on the four plotted features.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]
print("Case 2 (result: %s)" % class_enc.classes_[y[123]])
p2 = profiles.individual_variable_profile(
    expl, X.iloc[123], y[123], variables=case_variables
)
plots.plot_notebook(p2, selected_variables=case_variables, print_observations=False)

Case 2 (result: e)


In [7]:
# Case 3: Random Forest profiles for observation 675 on the four plotted features.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]
print("Case 3 (result: %s)" % class_enc.classes_[y[675]])
p3 = profiles.individual_variable_profile(
    expl, X.iloc[675], y[675], variables=case_variables
)
plots.plot_notebook(p3, selected_variables=case_variables, print_observations=False)

Case 3 (result: e)


We see that the CP profiles for different samples can differ. The shape of the odor profile in the first example does not resemble the shapes in the second and third examples. Also, the dependency on gill-size is opposite in the second and third examples.

Let's train another model (Logistic Regression) and compare explanations.

In [8]:
# Fit a logistic regression on the same one-hot encoded training data and
# report the fraction of test samples it classifies correctly.
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train_enc, y_train)
X_test_enc = encoder.transform(X_test)
pred = logistic_classifier.predict(X_test_enc)
acc = (pred == y_test).mean()
print("Accuracy: %f" % acc)
def predict_l(data):
    """Return the logistic regression probability of class index 1 for each row of `data`.

    `data` is one-hot encoded with the fitted `encoder` and passed to
    `logistic_classifier.predict_proba`; the second column of the result is returned.
    """
    probabilities = logistic_classifier.predict_proba(encoder.transform(data))
    return probabilities[:, 1]

Accuracy: 1.000000


In [9]:
# Build the Logistic Regression explainer on the training split, the same data
# the Random Forest explainer is built on, so the profiles of the two models
# are computed from the same data.
expl_l = explainer.explain(
    logistic_classifier,
    data=X_train,
    y=y_train,
    label='Logistic Regression',
    predict_function=predict_l,
)

# Case 1: compute profiles only for the four plotted features instead of for
# every column, matching how the Random Forest profiles are computed.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]
print("Case 1 (result: %s)" % class_enc.classes_[y[32]])
p1_l = profiles.individual_variable_profile(expl_l, X.iloc[32], y[32], variables=case_variables)
plots.plot_notebook(p1_l, selected_variables=case_variables, print_observations=False)

Case 1 (result: e)


In [10]:
# Case 2: Logistic Regression profiles for observation 123. The profile is
# computed only for the four plotted features rather than for every column.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]
print("Case 2 (result: %s)" % class_enc.classes_[y[123]])
p2_l = profiles.individual_variable_profile(expl_l, X.iloc[123], y[123], variables=case_variables)
plots.plot_notebook(p2_l, selected_variables=case_variables, print_observations=False)

Case 2 (result: e)


In [11]:
# Case 3: Logistic Regression profiles for observation 675. The profile is
# computed only for the four plotted features rather than for every column.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]
print("Case 3 (result: %s)" % class_enc.classes_[y[675]])
p3_l = profiles.individual_variable_profile(expl_l, X.iloc[675], y[675], variables=case_variables)
plots.plot_notebook(p3_l, selected_variables=case_variables, print_observations=False)

Case 3 (result: e)


CP profiles also differ significantly between models. Logistic Regression has very different profiles for odor (though similar across samples) and almost no dependence on gill-size and gill-spacing. Profiles for stalk-surface-above-ring have a similar shape, but show more variability in Random Forest.

In conclusion, the analysis of Ceteris Paribus profiles shows the different structure of these two models. Both of them achieve 100% accuracy, so they give the same (and correct) predictions, but they depend on the data in different ways. Logistic Regression uses mostly odor for predicting results, whereas Random Forest has a different dependency on odor and also depends on other parameters, like gill-size or stalk-surface-above-ring.

## Appendix: code

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from ceteris_paribus import explainer, profiles
from ceteris_paribus.plots import plots

# Load the mushroom dataset and remove apostrophes from every value.
data = pd.read_csv("dataset_24_mushroom.csv")
for col in data.columns:
    data[col] = data[col].str.replace("'", "")
data.head()

# Split into features and target, then encode the target labels as integers.
X = data.drop(columns=["class"])
class_enc = preprocessing.LabelEncoder().fit(data["class"])
y = class_enc.transform(data["class"])

# Integer-encode every feature with its own LabelEncoder, keeping the encoder
# per column name and the category names per column position.
encoders = {}
categorical_names = {}
for col in X.columns:
    encoders[col] = preprocessing.LabelEncoder().fit(X[col])
    categorical_names[X.columns.get_loc(col)] = encoders[col].classes_
    X[col] = encoders[col].transform(X[col])

print("Number of samples: %d" % len(y))

# One-hot encode the integer-coded categories. The encoder is fitted on the
# full X before splitting, so categories that occur only in the test split are
# still known to it when the test rows are transformed.
encoder = preprocessing.OneHotEncoder().fit(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3234)
X_train_enc = encoder.transform(X_train)

# Seed the forest: without random_state the fitted trees (and every
# explanation derived from them) change on each run.
classifier = RandomForestClassifier(random_state=3234).fit(X_train_enc, y_train)

# Accuracy = fraction of test samples whose predicted label matches the true one.
pred = classifier.predict(encoder.transform(X_test))
acc = np.mean(pred == y_test)
print("Accuracy: %f" % acc)

def predict(data):
    """Return the Random Forest probability of class index 1 for each row of `data`.

    `data` is one-hot encoded with the fitted `encoder` and passed to
    `classifier.predict_proba`; the second column of the result is returned.
    """
    probabilities = classifier.predict_proba(encoder.transform(data))
    return probabilities[:, 1]
# Build the Random Forest explainer on the training split, with `predict` as
# the prediction function.
expl = explainer.explain(
    classifier,
    data=X_train,
    y=y_train,
    label='Random Forest',
    predict_function=predict,
)

# Profiles for observations 32, 123 and 675, restricted to the four plotted features.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]

p1 = profiles.individual_variable_profile(expl, X.iloc[32], y[32], variables=case_variables)
plots.plot_notebook(p1, selected_variables=case_variables, print_observations=False)

p2 = profiles.individual_variable_profile(expl, X.iloc[123], y[123], variables=case_variables)
plots.plot_notebook(p2, selected_variables=case_variables, print_observations=False)

p3 = profiles.individual_variable_profile(expl, X.iloc[675], y[675], variables=case_variables)
plots.plot_notebook(p3, selected_variables=case_variables, print_observations=False)

# Fit a logistic regression on the same one-hot encoded training data and
# report the fraction of test samples it classifies correctly.
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train_enc, y_train)
pred = logistic_classifier.predict(encoder.transform(X_test))
acc = (pred == y_test).mean()
print("Accuracy: %f" % acc)
def predict_l(data):
    """Return the logistic regression probability of class index 1 for each row of `data`.

    `data` is one-hot encoded with the fitted `encoder` and passed to
    `logistic_classifier.predict_proba`; the second column of the result is returned.
    """
    probabilities = logistic_classifier.predict_proba(encoder.transform(data))
    return probabilities[:, 1]

# Build the Logistic Regression explainer on the training split, the same data
# the Random Forest explainer is built on, so the profiles of the two models
# are computed from the same data.
expl_l = explainer.explain(
    logistic_classifier,
    data=X_train,
    y=y_train,
    label='Logistic Regression',
    predict_function=predict_l,
)

# Compute profiles only for the four plotted features instead of for every
# column, matching how the Random Forest profiles are computed.
case_variables = ["odor", "gill-size", "gill-spacing", "stalk-surface-above-ring"]

p1_l = profiles.individual_variable_profile(expl_l, X.iloc[32], y[32], variables=case_variables)
plots.plot_notebook(p1_l, selected_variables=case_variables, print_observations=False)

p2_l = profiles.individual_variable_profile(expl_l, X.iloc[123], y[123], variables=case_variables)
plots.plot_notebook(p2_l, selected_variables=case_variables, print_observations=False)

p3_l = profiles.individual_variable_profile(expl_l, X.iloc[675], y[675], variables=case_variables)
plots.plot_notebook(p3_l, selected_variables=case_variables, print_observations=False)