In [1]:
from geplearn.counterfactual_explanations import CounterfactualExplainer
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Load the Titanic passenger table and discard the columns this notebook does
# not use. `drop` returns a new frame, so re-running the cell is safe.
data = pd.read_csv("./datasets/titanic.csv").drop(
    columns=["PassengerId", "Name", "Parch", "Ticket", "Cabin"]
)

# Remove incomplete rows once, then split the same frame into features and
# target so X and y are guaranteed to share the same rows.
complete_rows = data.dropna()
X = complete_rows.drop(columns="Survived")
y = complete_rows["Survived"]

In [3]:
# Ordinal-encode the categorical columns and record, for each kind of feature,
# the positions of its columns in X.columns.
encoder = OrdinalEncoder()

categorical_cols = ["Pclass", "Sex", "Embarked"]
categorical_features_indexes = [X.columns.get_loc(col) for col in categorical_cols]

count_cols = ["SibSp"]
count_features_indexes = [X.columns.get_loc(col) for col in count_cols]

continuous_cols = ["Fare", "Age"]
continuous_features_indexes = [X.columns.get_loc(col) for col in continuous_cols]

# Assign with X[cols] rather than X.loc[:, cols]: column assignment replaces the
# columns with the encoder's numeric output, whereas `.loc[:, cols] = ...` tries
# to write into the existing columns (pandas >= 2.0) and can leave the original
# string columns as object dtype.
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked
0,2.0,1.0,22.0,1,7.25,2.0
1,0.0,0.0,38.0,1,71.2833,0.0
2,2.0,0.0,26.0,0,7.925,2.0
3,0.0,0.0,35.0,1,53.1,2.0
4,2.0,1.0,35.0,0,8.05,2.0


In [4]:
# Per-feature search domains, each keyed by the column's position in X.

# Categorical features: the distinct values present in the column.
categorical_features_values = dict(
    (position, X.iloc[:, position].unique().tolist())
    for position in categorical_features_indexes
)

# Count features: (min, max) observed in the column.
count_features_range = {
    position: (column.min(), column.max())
    for position in count_features_indexes
    for column in [X.iloc[:, position]]  # bind the column once per position
}

# Continuous features: (min, max) observed in the column.
continuous_features_range = {
    position: (column.min(), column.max())
    for position in continuous_features_indexes
    for column in [X.iloc[:, position]]  # bind the column once per position
}

In [5]:
# Fix the seed so the fitted forest, and every score and explanation derived
# from it, is the same on each run of the notebook.
rf = RandomForestClassifier(random_state=42)
# Fit on the underlying arrays (no column names attached to the model).
rf.fit(X.values, y.values)

RandomForestClassifier()

In [6]:
# NOTE: these scores are computed on the same rows the model was fitted on, so
# they describe how well the forest fits the data, not how it generalises.
# Predict once, on the same array form used for fitting, and reuse the result.
train_predictions = rf.predict(X.values)
print(accuracy_score(y.values, train_predictions), f1_score(y.values, train_predictions))

0.9859550561797753 0.9823943661971831


In [12]:
# Build the explainer for the first row of X, using the random forest's
# class-probability function as the model to explain.
explainer = CounterfactualExplainer(
    # Model and the observation to explain
    predict_proba_fn=rf.predict_proba,
    X_obs=X.iloc[0].values,
    threshold=0.5,
    # Feature schema: names, then column positions and domain for each kind
    features_names=list(X.columns),
    categorical_features_indexes=categorical_features_indexes,
    categorical_features_values=categorical_features_values,
    count_features_indexes=count_features_indexes,
    count_features_range=count_features_range,
    continuous_features_indexes=continuous_features_indexes,
    continuous_features_range=continuous_features_range,
    # Search settings
    distance="ned",
    pop_size=500,
    n_hall_of_fame=50,
    early_stopping_threshold=0.9,
)

In [13]:
# Run the explainer and keep what it returns; the result is handed to
# `explainer.feature_importance` in a later cell. The recorded output shows a
# progress bar plus the original and best synthetic observation.
best_individuals = explainer.explain()

100%|██████████| 5/5 [00:11<00:00,  2.39s/it]

Original Observation:
Pclass       2.00
Sex          1.00
Age         22.00
SibSp        1.00
Fare         7.25
Embarked     2.00
dtype: float64
Best synthetic observation:
Pclass      2.000000
Sex         1.000000
Age         9.570776
SibSp       1.000000
Fare        2.967629
Embarked    2.000000
fitness     0.990660
Name: 200, dtype: float64
Model prediction on synthetic observation: [0.49 0.51]





In [14]:
# Display the explainer's feature-importance table for the results returned by
# `explain()`, with the encoded feature frame X as the second argument. The
# recorded output has one row per feature (`column`, `feature_importance`).
explainer.feature_importance(best_individuals, X)

Unnamed: 0,column,feature_importance
4,Fare,13.052632
2,Age,12.4
0,Pclass,6.333333
1,Sex,5.0
3,SibSp,0.666667
5,Embarked,0.0
