Students: Marina SANGINETO JUCA ; Marilyn CHAHINE


### 1. Instalation

In [86]:
!pip install dice-ml



In [2]:
# imports
import math
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons, load_breast_cancer, load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
import dice_ml

## 2. Données
Crée les deux halfMoons et ajouter de bruit gaussien.

In [3]:
X, y = make_moons(n_samples=1000, shuffle=True, noise=None, random_state=42)

In [4]:
# divide in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [5]:
train_data = np.column_stack((X_train,y_train.T))

In [6]:
df_train = pd.DataFrame(data=train_data, columns=['x_axis','y_axis','class'])

In [7]:
data_train = dice_ml.Data(dataframe=df_train, continuous_features=['x_axis','y_axis'], outcome_name='class')

## 3. Classifiers

In [8]:
clf = RandomForestClassifier()

In [9]:
trained_classifier = clf.fit(df_train[['x_axis','y_axis']],df_train['class'])

In [10]:
model = dice_ml.Model(model=trained_classifier, backend='sklearn')

## 4. Géneration d'exemples contrefactuels

In [96]:
explainer = dice_ml.Dice(data_train, model, method='random')

In [97]:
query_instances = pd.DataFrame(data=X_test, columns = ['x_axis','y_axis'])

In [None]:
generated_cf = explainer.generate_counterfactuals(query_instances,
                                                  2,
                                                  desired_class="opposite",
                                                  # proximity_weight=0.5,
                                                  # diversity_weight=1.0,
                                                  features_to_vary="all",
                                                  permitted_range=None,
                                                  posthoc_sparsity_param=0.1)

 92%|█████████▏| 92/100 [19:41<00:01,  4.98it/s]

In [None]:
generated_cf

## 5. Récupération des exemples contre-factuels générés

In [None]:
generated_cf.visualize_as_dataframe()

In [None]:
generated_cf.cf_examples_list[0].final_cfs_df

### Evaluating

In [None]:
cf_df = generated_cf.cf_examples_list[0].final_cfs_df
idx = 1
while idx < query_instances.shape[0]:
  cf_df = pd.concat([cf_df,generated_cf.cf_examples_list[idx].final_cfs_df])
  idx+=1

In [None]:
def plotResults_2Features(X_train, y_train, clf, x, y, e):
    X_feature1 = []
    X_feature2 = []

    for elem in X_train:
        X_feature1.append(elem[0])
        X_feature2.append(elem[1])

    colors = np.where(np.asarray(y_train) == 1, "red", "blue")

    plt.figure(figsize=(6, 5))


    # Données
    plt.scatter(X_feature1, X_feature2, c=colors, s=1, edgecolor="none")


    # Frontiere de décision
       # 1. Create a grid of points
    xx, yy = np.meshgrid(
        np.linspace(X_train[:,0].min()-1, X_train[:,0].max()+1, 200),
        np.linspace(X_train[:,1].min()-1, X_train[:,1].max()+1, 200)
    )

       # 2. Evaluate decision function
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

       # 3. Plot decision boundary (level set where Z = 0)
    plt.contour(xx, yy, Z, levels=[0], linewidths=0.5, colors="k")
    plt.contourf(xx, yy, Z, levels=[-np.inf,0,np.inf], colors=["blue","red"], alpha=0.2)


    # Donnée à expliquer
    colors = np.where(np.asarray(y) == 1, "red", "blue")
    plt.scatter(x[:,0], x[:,1], c=colors, s=2, edgecolors="k")


    # Exemple contrefactuel
    plt.scatter(e['x_axis'], e['y_axis'], c="green", s=20, edgecolors="none")


    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.title("Scatter Plot with: \n X = Feature 1, Y = Feature 2, Red/Blue = class1/class2, \n separating line = decision border, circled point is the data point \n to be explained by a counterfactual example, and green point is the counterfactual")
    plt.show()

In [None]:
plotResults_2Features(X_train, y_train, clf, X_test, y_test, cf_df)

In [None]:
def compute_validity(cf_df, clf, desired_class):
  preds = clf.predict(cf_df[cf_df.columns[:-1]])
  valid = np.mean(preds == 1 - preds[0])
  return valid


In [None]:
def compute_proximity(query_instances, generated_cf):
  """
  L2 distance
  """
  columns = query_instances.columns
  distances = []
  for i, cf_example in enumerate(generated_cf.cf_examples_list):
      x = cf_example.test_instance_df[columns].to_numpy()
      cfs = cf_example.final_cfs_df[columns].to_numpy()
      d = pairwise_distances(cfs, x).mean()
      distances.append(d)
  return np.mean(distances)


In [None]:
def compute_sparsity(query_instances, generated_cf):
  columns = query_instances.columns
  changes = []
  for i, cf_example in enumerate(generated_cf.cf_examples_list):
      x = cf_example.test_instance_df[columns].to_numpy()
      cfs = cf_example.final_cfs_df[columns].to_numpy()
      diff = np.abs(cfs - x) > 1e-6
      changes.append(diff.sum(axis=1).mean())
  return np.mean(changes)


In [None]:
def compute_diversity(generated_cf):
  columns = query_instances.columns
  diversities = []
  for cf_example in generated_cf.cf_examples_list:
      cfs = cf_example.final_cfs_df[columns].to_numpy()
      if cfs.shape[0] > 1:
          dists = pairwise_distances(cfs)
          diversities.append(np.mean(dists))
  return np.mean(diversities)


In [None]:
def evaluate_counterfactuals(clf, query_instances, generated_cf, desired_class="opposite"):
  val = compute_validity(
      pd.concat([cf.final_cfs_df for cf in generated_cf.cf_examples_list]), clf, desired_class)
  prox = compute_proximity(query_instances, generated_cf)
  spars = compute_sparsity(query_instances, generated_cf)
  div = compute_diversity(generated_cf)

  print(f"Validity   : {val:.3f}")
  print(f"Proximity  : {prox:.3f}")
  print(f"Sparsity   : {spars:.3f}")
  print(f"Diversity  : {div:.3f}")

  return {"validity": val, "proximity": prox, "sparsity": spars, "diversity": div}


In [None]:
results = evaluate_counterfactuals(clf, query_instances, generated_cf)


## 6. Etudes expérimentales

#### Different classification model

#### Logistic Regression

In [None]:
clf = LogisticRegression()

In [None]:
trained_classifier = clf.fit(df_train[['x_axis','y_axis']],df_train['class'])

In [None]:
model = dice_ml.Model(model=trained_classifier, backend='sklearn')

In [None]:
explainer = dice_ml.Dice(data_train, model, method='random')

In [None]:
generated_cf = explainer.generate_counterfactuals(query_instances,
                                                  2,
                                                  desired_class="opposite",
                                                  # proximity_weight=0.5,
                                                  # diversity_weight=1.0,
                                                  features_to_vary="all",
                                                  permitted_range=None,
                                                  posthoc_sparsity_param=0.1)

In [None]:
results = evaluate_counterfactuals(clf, query_instances, generated_cf)

### Different dataset

In [None]:
df_wine = load_wine(as_frame=True).frame

In [None]:
df_wine

In [None]:
# verify the continuous values
df_wine.dtypes

In [None]:
continuous_features = df_wine.columns[:-1] # all are continuous

In [None]:
df_wine_train, df_wine_test = train_test_split(df_wine, test_size=0.1, random_state=42)

In [None]:
data_train = dice_ml.Data(dataframe=df_train, continuous_features=continuous_features, outcome_name='class')

In [None]:
clf = RandomForestClassifier()

In [None]:
trained_classifier = clf.fit(df_train[['x_axis','y_axis']],df_train['class'])

In [None]:
model = dice_ml.Model(model=trained_classifier, backend='sklearn')

In [None]:
explainer = dice_ml.Dice(data_train, model, method='random')

In [None]:
query_instances = df_wine_test[continuous_features]

In [None]:
generated_cf = explainer.generate_counterfactuals(query_instances,
                                                  2,
                                                  desired_class="opposite",
                                                  features_to_vary="all",
                                                  permitted_range=None,
                                                  posthoc_sparsity_param=0.1)

In [None]:
results = evaluate_counterfactuals(clf, query_instances, generated_cf)

### Varying the parameters

In [None]:
generated_cf = explainer.generate_counterfactuals(query_instances,
                                                  2,
                                                  desired_class="opposite",
                                                  features_to_vary=["alcohol","hue"],
                                                  posthoc_sparsity_param=0.1)

In [None]:
results = evaluate_counterfactuals(clf, query_instances, generated_cf)

In [None]:
generated_cf = explainer.generate_counterfactuals(query_instances,
                                                  2,
                                                  desired_class="opposite",
                                                  permitted_range={'magnesium':[70,100], 'proline':[500,1000]},
                                                  posthoc_sparsity_param=0.1)

In [None]:
results = evaluate_counterfactuals(clf, query_instances, generated_cf)

## 7. Extension de Growing Spheres

<br> ADDED:
<br> Parameter **excludedFeatures**, a list of features not to change

In [11]:
def uniformGenSL(x, a0, a1, n, excludedFeatures = []):
    """
    x : array-like, shape (d,)
    a0, a1 : scalars with 0 <= a0 < a1
    n : number of points to generate
    excludedFeatures : list of features not to change referenced by their indices
    returns: array shape (n, d)
    """
    x = np.asarray(x)
    d = x.shape[0]

    # generate n standard normal vectors 
    Y = np.random.randn(n, d)             

    # normalize to unit vectors
    norms = np.linalg.norm(Y, axis=1, keepdims=True)
    # avoid division by zero (very unlikely)
    norms[norms == 0] = 1.0
    U = Y / norms                          # shape (n, d), unit directions
    # sample radii so volume is uniform in layer
    # sample uniformly in [a0^d, a1^d]
    low = a0**d
    high = a1**d
    V = np.random.uniform(low, high, size=(n,))
    r = V**(1.0/d)                         # radii

    # form points
    points = x + (r[:, None] * U)          # shape (n, d)

    """
    for i in range(len(points)):
        for j in excludedFeatures:
            points[i][j] = x[j]
    """
    
    return points

ADDED: 
<br>Parameter **ne** the number of counterfactual point to generate \n
<br>The function will first divide the sphere by 2 until it has less than ne points left in it
<br>Then expand it until we have exactly ne points in the sphere
<br><br>Parameter **excludedFeatures**, a list of features not to change, included in order to pass it to uniformGenSL()


In [12]:
def growingSpheresGen(clf, x, rad, n, ne, excludedFeatures = []):
    """
    clf : classifier used to generate x
    x : array-like, shape (d,)
    rad : radius of the sphere to generate points in
    n : number of points to generate
    ne : number of counterfactual points to generate
    excludedFeatures : list of features not to change referenced by their indices
    returns: array of size (ne, d), list of counterfactual points
    """
    
    a0 = 0
    a1 = rad
    Z = uniformGenSL(x, a0, a1, n, excludedFeatures)
    
    flag = True
    
    while (flag == True):
        #print("block1")
        found = 0
        for e in Z:
            if (clf.predict([e]) != clf.predict([x])):
                #print("block2")
                found = found + 1
                if (found >= ne):
                    rad = rad/2
                    Z = uniformGenSL(x, 0, rad, n, excludedFeatures)
            elif (np.array_equal(e, Z[-1])):
                #print("block3")
                if (found < ne):
                    flag = False
            
    a0 = rad
    a1 = 2*rad

    e_final_list = []
    found = 0
    while (found < ne):
        for e in Z:
            if (clf.predict([e]) != clf.predict([x])):
                #print("block4")
                e_final_list.append(e)
                found = found + 1
                if (found == ne):
                    break
            elif (np.array_equal(e, Z[-1])):
                #print("block5")
                Z = uniformGenSL(x, a0, a1, n, excludedFeatures)
                a0 = a1
                a1 = a1 + rad
                
    return e_final_list

ADDED: <br>
Feature selection done for each point generated in growingSpheresGen
<br> Parameter **excludedFeatures**, a list of features not to change

In [13]:
def growingSpheresFeatureSelec(clf, x, e_list, excludedFeatures = []):
    """
    clf : classifier used to generate x
    x : array-like, shape (d,)
    e_list : list of counterfactual points 
    excludedFeatures : list of features not to change referenced by their indices
    returns: list array of size (ne, d) with minimum features modified
    """
    e_opt_list = []
    
    for e in e_list:
        e_temp = e.copy()
        while ( clf.predict([e_temp]) != clf.predict([x])):
            e_opt = e_temp.copy()
            dist_list = []
            for j in range(len(x)):
                if j not in excludedFeatures:
                    if (e_temp[j] == x[j]):
                        dist_list.append(np.inf)
                    else:
                        dist_list.append(abs(e_temp[j] - x[j]))
            i = np.argmin(dist_list)
            e_temp[i] = x[i]
        e_opt_list.append(e_opt)
        
    return e_opt_list

excludedFeatures is not functional on uniformGenSL, **yet :)**

In [14]:
# specify the point we're describing 
x = X_test[0]
# specify starting radius
rad = 0.1
# specify the number of points to generate in the uniform generation 
# of points in the Spherical Layer 
n = 100
# specify the number of counterfactual points you want to generate
ne = 4
# specify the indices that shouldn't be modified
excludedFeatures = [0]


print("Data point we're studying: ", x)

e = growingSpheresGen(clf, x, rad, n, ne, excludedFeatures)
print("The ", str(len(e)), " counterfactual points before feature selection: \n", e)

e_opt_list = growingSpheresFeatureSelec(clf, x, e, excludedFeatures)
print("The ", str(len(e_opt_list)), " counterfactual points after feature selection: \n", e_opt_list)

Data point we're studying:  [ 0.49818367 -0.3649742 ]




The  4  counterfactual points before feature selection: 
 [array([0.59884609, 0.03091521]), array([0.57106886, 0.08561777]), array([0.57957414, 0.08453361]), array([0.77476811, 0.00716991])]
The  4  counterfactual points after feature selection: 
 [array([0.59884609, 0.03091521]), array([0.57106886, 0.08561777]), array([0.57957414, 0.08453361]), array([0.77476811, 0.00716991])]


