# Reproduction of Experiments on Dataset D8

> **Note:** The paper calls this dataset/split `Yeast-0-3-5-9-vs-7–8`, but the majority and minority ratio & counts indicate they actually meant `Yeast-2-3-5-9-vs-7–8` (e.g. class 2 instead of 0).

## Read data

In [1]:
import sys 
import os
sys.path.insert(0, os.path.join("..", "src"))
import pandas as pd
%matplotlib inline

names = ["Sequence Name", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "class"]

path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data'
df = pd.read_csv(path, header=None, names=names, delim_whitespace=True)
df

Unnamed: 0,Sequence Name,mcg,gvh,alm,mit,erl,pox,vac,nuc,class
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...,...
1479,YUR1_YEAST,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,ZIP1_YEAST,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,ZNRP_YEAST,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,ZUO1_YEAST,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


In [2]:
df = df.drop("Sequence Name", axis=1)
df

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,class
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


In [3]:
df["class"].value_counts()

CYT    463
NUC    429
MIT    244
ME3    163
ME2     51
ME1     44
EXC     35
VAC     30
POX     20
ERL      5
Name: class, dtype: int64

In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit_transform(df["class"])
le.classes_ = df["class"].value_counts().index.tolist()
df["class"] = le.transform(df["class"])
df

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,class
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,2
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,2
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,2
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,1
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,2
...,...,...,...,...,...,...,...,...,...
1479,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,4
1480,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,1
1481,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,4
1482,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,1


In [5]:
label_column = "class"

minority_labels = [7, 8]
majority_labels = [2, 3, 5, 9]

df_filtered = df[df[label_column].isin(majority_labels + minority_labels)].copy()

df_filtered.loc[df_filtered[label_column].isin(majority_labels), label_column] = 1
df_filtered.loc[df_filtered[label_column].isin(minority_labels), label_column] = 0

X = df_filtered.drop(label_column, axis=1)
y = df_filtered[label_column]

In [6]:
y.value_counts()

1    456
0     50
Name: class, dtype: int64

# Classification

In [7]:
from sklearn.model_selection import ParameterGrid, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np

> **Note:** In the following, we use a custom piece of code for the grid search and cross-validation. We do this to enforce that the augmentation/oversampling step is only applied to training data and the test data is actually kept separate. For the baseline model, this means that we could also simply make use of the functions provided by sklearn instead of our custom code. However, in order to make the results comparable, we use the same code in each case.

In [8]:
kf = KFold(n_splits=5, shuffle=True, random_state=7018321)

## Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
rf_param_grid = {
    'n_estimators': [50, 100, 200, 400, 600],
    'max_depth': [None, 4, 6, 10, 20, 30, 50, 80, 100],
}

# compute all combinations of parameters
combination_dicts = list(ParameterGrid(rf_param_grid))

### Baseline (no augmentation)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train, y_train)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [23]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 6, 'n_estimators': 50}
Best AUC: 0.8233432489618057


To verify that our cross-validation code works as expected, we can compare to a regular sklearn grid search here:

In [24]:
baseline_search = GridSearchCV(RandomForestClassifier(random_state=19231823), rf_param_grid, scoring="roc_auc")
baseline_search.fit(X, y)
baseline_search.best_score_

0.7863449593884376

We can see that the best model found by a sklearn grid search performs similarly well to our best model, which indicates that the code works as expected.

### With CFA

In [11]:
from cfa import Iterative_CFA

For CFA, we try two different approaches, since it is not entirely clear from the paper what the authors actually did.
1. The first approach strictly follows the pseudo-code and description given in section 3.2 of the paper. 
2. In an earlier section of the paper, the authors say that _"the class of [a] new [synthetic counterfactual] instance needs to be verified by the underlying ML model."_ This means that we use some ML model trained on the data (without CFA) to assign a class to a new synthetic counterfactual, and only keep those that were classified to be a minority instance. This seems to lead to much more reasonable synthetic counterfactuals (see visualizations of the algorithm in exp001) but, at the same time, often means that the algorithm is unable to produce a fully balanced dataset (since it may terminate early).

> **Note:** We also use a different tolerance level of 50% here, since the 10%-threshold proposed by the authors does not yield any "good" native counterfactuals, which makes the algorithm unusable.

#### Approach 1: No verification 

In [13]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=False,
                                                         visualize_with_pca=False)
    
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Data distribution before CFA:
	Majority (1): 359, Minority (label 0): 45
Number of 'good' native counterfactuals in data: 195
Data distribution after iteration 0:
	Majority (1): 359, Minority (label 0): 198
Number of 'good' native counterfactuals in data: 270
Data distribution after iteration 1:
	Majority (1): 359, Minority (label 0): 241
Number of 'good' native counterfactuals in data: 285
Data distribution after iteration 2:
	Majority (1): 359, Minority (label 0): 261
Number of 'good' native counterfactuals in data: 290
Data distribution after iteration 3:
	Majority (1): 359, Minority (label 0): 274
Number of 'good' native counterfactuals in data: 290
Data distribution after iteration 4:
	Majority (1): 359, Minority (label 0): 287
Number of 'good' native counterfactuals in data: 291
Data distribution after iteration 5:
	Majority (1): 359, Minority (label 0): 298
Number of 'good' native counterfactuals in data: 291
Data distribution after iteration 6:
	Majority (1): 359, Minor

Number of 'good' native counterfactuals in data: 287
Data distribution after iteration 6:
	Majority (1): 365, Minority (label 0): 350
Number of 'good' native counterfactuals in data: 287
Data distribution after iteration 7:
	Majority (1): 365, Minority (label 0): 362
Data distribution after CFA:
	Majority (1): 365, Minority (label 0): 362


In [27]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 10, 'n_estimators': 100}
Best AUC: 0.7419417180757387


#### Approach 2: Verification of new synthetic counterfactuals with baseline model

In [14]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    print("Fitting baseline model needed for verification... ")
    baseline_search = GridSearchCV(RandomForestClassifier(random_state=19231823), rf_param_grid, scoring="roc_auc")
    baseline_search.fit(X_train, y_train)
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=True, 
                                                         baseline_model=baseline_search, 
                                                         visualize_with_pca=False)
    
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Fitting baseline model needed for verification... 
Data distribution before CFA:
	Majority (1): 359, Minority (label 0): 45
Number of 'good' native counterfactuals in data: 195
Data distribution after iteration 0:
	Majority (1): 359, Minority (label 0): 46
Number of 'good' native counterfactuals in data: 196
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 359, Minority (label 0): 46
Fold 1:
Fitting baseline model needed for verification... 
Data distribution before CFA:
	Majority (1): 366, Minority (label 0): 39
Number of 'good' native counterfactuals in data: 167
Data distribution after iteration 0:
	Majority (1): 366, Minority (label 0): 41
Number of 'good' native counterfactuals in data: 169
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	

Data distribution after iteration 51:
	Majority (1): 366, Minority (label 0): 197
Number of 'good' native counterfactuals in data: 161
Data distribution after iteration 52:
	Majority (1): 366, Minority (label 0): 200
Number of 'good' native counterfactuals in data: 161
Data distribution after iteration 53:
	Majority (1): 366, Minority (label 0): 203
Number of 'good' native counterfactuals in data: 161
Data distribution after iteration 54:
	Majority (1): 366, Minority (label 0): 206
Number of 'good' native counterfactuals in data: 161
Data distribution after iteration 55:
	Majority (1): 366, Minority (label 0): 209
Number of 'good' native counterfactuals in data: 161
Data distribution after iteration 56:
	Majority (1): 366, Minority (label 0): 212
Number of 'good' native counterfactuals in data: 161
Data distribution after iteration 57:
	Majority (1): 366, Minority (label 0): 215
Number of 'good' native counterfactuals in data: 161
Data distribution after iteration 58:
	Majority (1): 36

Data distribution after iteration 112:
	Majority (1): 366, Minority (label 0): 272
Number of 'good' native counterfactuals in data: 140
Data distribution after iteration 113:
	Majority (1): 366, Minority (label 0): 273
Number of 'good' native counterfactuals in data: 140
Data distribution after iteration 114:
	Majority (1): 366, Minority (label 0): 274
Number of 'good' native counterfactuals in data: 140
Data distribution after iteration 115:
	Majority (1): 366, Minority (label 0): 275
Number of 'good' native counterfactuals in data: 140
Data distribution after iteration 116:
	Majority (1): 366, Minority (label 0): 276
Number of 'good' native counterfactuals in data: 140
Data distribution after iteration 117:
	Majority (1): 366, Minority (label 0): 277
Number of 'good' native counterfactuals in data: 140
Data distribution after iteration 118:
	Majority (1): 366, Minority (label 0): 278
Number of 'good' native counterfactuals in data: 140
Data distribution after iteration 119:
	Majority

Data distribution after iteration 173:
	Majority (1): 366, Minority (label 0): 335
Number of 'good' native counterfactuals in data: 118
Data distribution after iteration 174:
	Majority (1): 366, Minority (label 0): 336
Number of 'good' native counterfactuals in data: 118
Data distribution after iteration 175:
	Majority (1): 366, Minority (label 0): 337
Number of 'good' native counterfactuals in data: 109
Data distribution after iteration 176:
	Majority (1): 366, Minority (label 0): 338
Number of 'good' native counterfactuals in data: 109
Data distribution after iteration 177:
	Majority (1): 366, Minority (label 0): 339
Number of 'good' native counterfactuals in data: 109
Data distribution after iteration 178:
	Majority (1): 366, Minority (label 0): 340
Number of 'good' native counterfactuals in data: 109
Data distribution after iteration 179:
	Majority (1): 366, Minority (label 0): 341
Number of 'good' native counterfactuals in data: 109
Data distribution after iteration 180:
	Majority

In [29]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 10, 'n_estimators': 600}
Best AUC: 0.8200915579266095


### ADASYN instead of CFA

In [30]:
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=9317231)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [32]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 4, 'n_estimators': 200}
Best AUC: 0.7789210331478372


### Random Oversampling instead of CFA

In [33]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=52012318)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [35]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': None, 'n_estimators': 600}
Best AUC: 0.811675861366583


## Logistic Regression 

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
lr_param_grid = {"max_iter": [100, 200, 1000], 
                 "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                 "solver": ["newton-cg", "lbfgs", "liblinear", "sag"]}

# compute all combinations of parameters
combination_dicts = list(ParameterGrid(lr_param_grid))

### Baseline (no augmentation)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train, y_train)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [39]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 10, 'max_iter': 100, 'solver': 'newton-cg'}
Best AUC: 0.8111887196423279


Again, we can compare to an out-of-the-box grid search to make sure:

In [40]:
baseline_search = GridSearchCV(LogisticRegression(random_state=19231823), lr_param_grid, scoring="roc_auc")
baseline_search.fit(X, y)
baseline_search.best_score_



0.8081055900621118

### With CFA

#### Approach 1: No verification 

In [17]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=False, 
                                                         visualize_with_pca=False)

    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Data distribution before CFA:
	Majority (1): 359, Minority (label 0): 45
Number of 'good' native counterfactuals in data: 195
Data distribution after iteration 0:
	Majority (1): 359, Minority (label 0): 198
Number of 'good' native counterfactuals in data: 270
Data distribution after iteration 1:
	Majority (1): 359, Minority (label 0): 241
Number of 'good' native counterfactuals in data: 285
Data distribution after iteration 2:
	Majority (1): 359, Minority (label 0): 261
Number of 'good' native counterfactuals in data: 290
Data distribution after iteration 3:
	Majority (1): 359, Minority (label 0): 274
Number of 'good' native counterfactuals in data: 290
Data distribution after iteration 4:
	Majority (1): 359, Minority (label 0): 287
Number of 'good' native counterfactuals in data: 291
Data distribution after iteration 5:
	Majority (1): 359, Minority (label 0): 298
Number of 'good' native counterfactuals in data: 291
Data distribution after iteration 6:
	Majority (1): 359, Minor



Fold 1:
Data distribution before CFA:
	Majority (1): 366, Minority (label 0): 39
Number of 'good' native counterfactuals in data: 167
Data distribution after iteration 0:
	Majority (1): 366, Minority (label 0): 226
Number of 'good' native counterfactuals in data: 279
Data distribution after iteration 1:
	Majority (1): 366, Minority (label 0): 274
Number of 'good' native counterfactuals in data: 303
Data distribution after iteration 2:
	Majority (1): 366, Minority (label 0): 294
Number of 'good' native counterfactuals in data: 307
Data distribution after iteration 3:
	Majority (1): 366, Minority (label 0): 308
Number of 'good' native counterfactuals in data: 307
Data distribution after iteration 4:
	Majority (1): 366, Minority (label 0): 322
Number of 'good' native counterfactuals in data: 307
Data distribution after iteration 5:
	Majority (1): 366, Minority (label 0): 336
Number of 'good' native counterfactuals in data: 307
Data distribution after iteration 6:
	Majority (1): 366, Minor



Fold 2:
Data distribution before CFA:
	Majority (1): 366, Minority (label 0): 39
Number of 'good' native counterfactuals in data: 181
Data distribution after iteration 0:
	Majority (1): 366, Minority (label 0): 213
Number of 'good' native counterfactuals in data: 271
Data distribution after iteration 1:
	Majority (1): 366, Minority (label 0): 266
Number of 'good' native counterfactuals in data: 291
Data distribution after iteration 2:
	Majority (1): 366, Minority (label 0): 281
Number of 'good' native counterfactuals in data: 294
Data distribution after iteration 3:
	Majority (1): 366, Minority (label 0): 292
Number of 'good' native counterfactuals in data: 295
Data distribution after iteration 4:
	Majority (1): 366, Minority (label 0): 302
Number of 'good' native counterfactuals in data: 295
Data distribution after iteration 5:
	Majority (1): 366, Minority (label 0): 312
Number of 'good' native counterfactuals in data: 297
Data distribution after iteration 6:
	Majority (1): 366, Minor



Fold 3:
Data distribution before CFA:
	Majority (1): 368, Minority (label 0): 37
Number of 'good' native counterfactuals in data: 207
Data distribution after iteration 0:
	Majority (1): 368, Minority (label 0): 187
Number of 'good' native counterfactuals in data: 272
Data distribution after iteration 1:
	Majority (1): 368, Minority (label 0): 233
Number of 'good' native counterfactuals in data: 289
Data distribution after iteration 2:
	Majority (1): 368, Minority (label 0): 253
Number of 'good' native counterfactuals in data: 298
Data distribution after iteration 3:
	Majority (1): 368, Minority (label 0): 262
Number of 'good' native counterfactuals in data: 299
Data distribution after iteration 4:
	Majority (1): 368, Minority (label 0): 268
Number of 'good' native counterfactuals in data: 299
Data distribution after iteration 5:
	Majority (1): 368, Minority (label 0): 274
Number of 'good' native counterfactuals in data: 299
Data distribution after iteration 6:
	Majority (1): 368, Minor



Fold 4:
Data distribution before CFA:
	Majority (1): 365, Minority (label 0): 40
Number of 'good' native counterfactuals in data: 184
Data distribution after iteration 0:
	Majority (1): 365, Minority (label 0): 211
Number of 'good' native counterfactuals in data: 269
Data distribution after iteration 1:
	Majority (1): 365, Minority (label 0): 267
Number of 'good' native counterfactuals in data: 278
Data distribution after iteration 2:
	Majority (1): 365, Minority (label 0): 297
Number of 'good' native counterfactuals in data: 287
Data distribution after iteration 3:
	Majority (1): 365, Minority (label 0): 312
Number of 'good' native counterfactuals in data: 285
Data distribution after iteration 4:
	Majority (1): 365, Minority (label 0): 326
Number of 'good' native counterfactuals in data: 287
Data distribution after iteration 5:
	Majority (1): 365, Minority (label 0): 338
Number of 'good' native counterfactuals in data: 287
Data distribution after iteration 6:
	Majority (1): 365, Minor



In [42]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
Best AUC: 0.6949766155951723


#### Approach 2: Verification of new synthetic counterfactuals with baseline model

In [18]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    print("Fitting baseline model needed for verification... ")
    baseline_search = GridSearchCV(LogisticRegression(random_state=19231823), lr_param_grid, scoring="roc_auc")
    baseline_search.fit(X_train, y_train)
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=True, 
                                                         baseline_model=baseline_search,
                                                         visualize_with_pca=False)

    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Fitting baseline model needed for verification... 




Data distribution before CFA:
	Majority (1): 359, Minority (label 0): 45
Number of 'good' native counterfactuals in data: 195
Data distribution after iteration 0:
	Majority (1): 359, Minority (label 0): 47
Number of 'good' native counterfactuals in data: 197
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 359, Minority (label 0): 47




Fold 1:
Fitting baseline model needed for verification... 




Data distribution before CFA:
	Majority (1): 366, Minority (label 0): 39
Number of 'good' native counterfactuals in data: 167
Data distribution after iteration 0:
	Majority (1): 366, Minority (label 0): 41
Number of 'good' native counterfactuals in data: 169
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 366, Minority (label 0): 41




Fold 2:
Fitting baseline model needed for verification... 




Data distribution before CFA:
	Majority (1): 366, Minority (label 0): 39
Number of 'good' native counterfactuals in data: 181
Data distribution after iteration 0:
	Majority (1): 366, Minority (label 0): 41
Number of 'good' native counterfactuals in data: 181
Data distribution after iteration 1:
	Majority (1): 366, Minority (label 0): 43
Number of 'good' native counterfactuals in data: 181
Data distribution after iteration 2:
	Majority (1): 366, Minority (label 0): 45
Number of 'good' native counterfactuals in data: 174
Data distribution after iteration 3:
	Majority (1): 366, Minority (label 0): 47
Number of 'good' native counterfactuals in data: 174
Data distribution after iteration 4:
	Majority (1): 366, Minority (label 0): 49
Number of 'good' native counterfactuals in data: 174
Data distribution after iteration 5:
	Majority (1): 366, Minority (label 0): 51
Number of 'good' native counterfactuals in data: 174
Data distribution after iteration 6:
	Majority (1): 366, Minority (label 0):

Data distribution after iteration 64:
	Majority (1): 366, Minority (label 0): 169
Number of 'good' native counterfactuals in data: 158
Data distribution after iteration 65:
	Majority (1): 366, Minority (label 0): 171
Number of 'good' native counterfactuals in data: 158
Data distribution after iteration 66:
	Majority (1): 366, Minority (label 0): 173
Number of 'good' native counterfactuals in data: 158
Data distribution after iteration 67:
	Majority (1): 366, Minority (label 0): 175
Number of 'good' native counterfactuals in data: 158
Data distribution after iteration 68:
	Majority (1): 366, Minority (label 0): 177
Number of 'good' native counterfactuals in data: 158
Data distribution after iteration 69:
	Majority (1): 366, Minority (label 0): 179
Number of 'good' native counterfactuals in data: 158
Data distribution after iteration 70:
	Majority (1): 366, Minority (label 0): 181
Number of 'good' native counterfactuals in data: 158
Data distribution after iteration 71:
	Majority (1): 36



Fold 3:
Fitting baseline model needed for verification... 




Data distribution before CFA:
	Majority (1): 368, Minority (label 0): 37
Number of 'good' native counterfactuals in data: 207
Data distribution after iteration 0:
	Majority (1): 368, Minority (label 0): 39
Number of 'good' native counterfactuals in data: 199
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 368, Minority (label 0): 39




Fold 4:
Fitting baseline model needed for verification... 




Data distribution before CFA:
	Majority (1): 365, Minority (label 0): 40
Number of 'good' native counterfactuals in data: 184
Data distribution after iteration 0:
	Majority (1): 365, Minority (label 0): 41
Number of 'good' native counterfactuals in data: 185
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 365, Minority (label 0): 41




In [44]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 10, 'max_iter': 100, 'solver': 'newton-cg'}
Best AUC: 0.8121044705580788


### ADASYN

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [46]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
Best AUC: 0.8061193388512976


### Random Oversampling

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [48]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
Best AUC: 0.7988203892327604
