# Reproduction of Experiments on Dataset D14

## Read data

In [1]:
import sys 
import os
sys.path.insert(0, os.path.join("..", "src"))
import pandas as pd
%matplotlib inline

names = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings"] 

path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
df = pd.read_csv(path, header=None, names=names)
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [2]:
# preprocess categorical column
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df["Sex"] = label_encoder.fit_transform(df["Sex"])
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,2,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,2,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,2,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,1,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,2,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,2,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [3]:
label_column = "Rings"

minority_labels = [13]
majority_labels = df[label_column].value_counts().drop(minority_labels).index.tolist() # all except 13

df_filtered = df[df[label_column].isin(majority_labels + minority_labels)].copy()

df_filtered.loc[df_filtered[label_column].isin(majority_labels), label_column] = 1
df_filtered.loc[df_filtered[label_column].isin(minority_labels), label_column] = 0

X = df_filtered.drop(label_column, axis=1)
y = df_filtered[label_column]

In [4]:
y.value_counts()

1    3974
0     203
Name: Rings, dtype: int64

# Classification

In [5]:
from sklearn.model_selection import ParameterGrid, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np

> **Note:** In the following, we use a custom piece of code for the grid search and cross-validation. We do this to enforce that the augmentation/oversampling step is only applied to training data and the test data is actually kept separate. For the baseline model, this means that we could also simply make use of the functions provided by sklearn instead of our custom code. However, in order to make the results comparable, we use the same code in each case.

In [6]:
kf = KFold(n_splits=5, shuffle=True, random_state=7018321)

## Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
rf_param_grid = {
    'n_estimators': [50, 100, 200, 400, 600],
    'max_depth': [None, 4, 6, 10, 20, 30, 50, 80, 100],
}

# compute all combinations of parameters
combination_dicts = list(ParameterGrid(rf_param_grid))

### Baseline (no augmentation)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train, y_train)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [10]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 6, 'n_estimators': 600}
Best AUC: 0.7395650202089131


To verify that our cross-validation code works as expected, we can compare to a regular sklearn grid search here:

In [11]:
baseline_search = GridSearchCV(RandomForestClassifier(random_state=19231823), rf_param_grid, scoring="roc_auc")
baseline_search.fit(X, y)
baseline_search.best_score_

0.7195691823899371

We can see that the best model found by a sklearn grid search performs similarly well to our best model, which indicates that the code works as expected.

### With CFA

In [9]:
from cfa import Iterative_CFA

For CFA, we try two different approaches, since it is not entirely clear from the paper what the authors actually did.
1. The first approach strictly follows the pseudo-code and description given in section 3.2 of the paper. 
2. In an earlier section of the paper, the authors say that _"the class of [a] new [synthetic counterfactual] instance needs to be verified by the underlying ML model."_ This means that we use some ML model trained on the data (without CFA) to assign a class to a new synthetic counterfactual, and only keep those that were classified to be a minority instance. This seems to lead to much more reasonable synthetic counterfactuals (see visualizations of the algorithm in exp001) but, at the same time, often means that the algorithm is unable to produce a fully balanced dataset (since it may terminate early).

> **Note:** We also use a different tolerance level of 50% here, since the 10%-threshold proposed by the authors does not yield any "good" native counterfactuals, which makes the algorithm unusable.

#### Approach 1: No verification 

In [10]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=False,
                                                         visualize_with_pca=False)
    
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Data distribution before CFA:
	Majority (1): 3183, Minority (label 0): 158
Number of 'good' native counterfactuals in data: 1378
Data distribution after iteration 0:
	Majority (1): 3183, Minority (label 0): 310
Number of 'good' native counterfactuals in data: 1125
Data distribution after iteration 1:
	Majority (1): 3183, Minority (label 0): 347
Number of 'good' native counterfactuals in data: 1136
Data distribution after iteration 2:
	Majority (1): 3183, Minority (label 0): 353
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 3:
	Majority (1): 3183, Minority (label 0): 354
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 4:
	Majority (1): 3183, Minority (label 0): 355
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 5:
	Majority (1): 3183, Minority (label 0): 356
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 6:
	Majority 

Data distribution after iteration 61:
	Majority (1): 3183, Minority (label 0): 412
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 62:
	Majority (1): 3183, Minority (label 0): 413
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 63:
	Majority (1): 3183, Minority (label 0): 414
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 64:
	Majority (1): 3183, Minority (label 0): 415
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 65:
	Majority (1): 3183, Minority (label 0): 416
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 66:
	Majority (1): 3183, Minority (label 0): 417
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 67:
	Majority (1): 3183, Minority (label 0): 418
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 68:
	Ma

Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 122:
	Majority (1): 3183, Minority (label 0): 473
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 123:
	Majority (1): 3183, Minority (label 0): 474
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 124:
	Majority (1): 3183, Minority (label 0): 475
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 125:
	Majority (1): 3183, Minority (label 0): 476
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 126:
	Majority (1): 3183, Minority (label 0): 477
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 127:
	Majority (1): 3183, Minority (label 0): 478
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 128:
	Majority (1): 3183, Minority (label 0): 479
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 182:
	Majority (1): 3183, Minority (label 0): 533
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 183:
	Majority (1): 3183, Minority (label 0): 534
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 184:
	Majority (1): 3183, Minority (label 0): 535
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 185:
	Majority (1): 3183, Minority (label 0): 536
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 186:
	Majority (1): 3183, Minority (label 0): 537
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 187:
	Majority (1): 3183, Minority (label 0): 538
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 188:
	Majority (1): 3183, Minority (label 0): 539
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 1093
Data distribution after iteration 242:
	Majority (1): 3183, Minority (label 0): 593
Number of 'good' native counterfactuals in data: 1093
Data distribution after iteration 243:
	Majority (1): 3183, Minority (label 0): 594
Number of 'good' native counterfactuals in data: 1093
Data distribution after iteration 244:
	Majority (1): 3183, Minority (label 0): 595
Number of 'good' native counterfactuals in data: 1093
Data distribution after iteration 245:
	Majority (1): 3183, Minority (label 0): 596
Number of 'good' native counterfactuals in data: 1093
Data distribution after iteration 246:
	Majority (1): 3183, Minority (label 0): 597
Number of 'good' native counterfactuals in data: 1092
Data distribution after iteration 247:
	Majority (1): 3183, Minority (label 0): 598
Number of 'good' native counterfactuals in data: 1092
Data distribution after iteration 248:
	Majority (1): 3183, Minority (label 0): 599
Number of 'good' native counterfac

Data distribution after iteration 45:
	Majority (1): 3185, Minority (label 0): 669
Number of 'good' native counterfactuals in data: 1355
Data distribution after iteration 46:
	Majority (1): 3185, Minority (label 0): 671
Number of 'good' native counterfactuals in data: 1355
Data distribution after iteration 47:
	Majority (1): 3185, Minority (label 0): 673
Number of 'good' native counterfactuals in data: 1355
Data distribution after iteration 48:
	Majority (1): 3185, Minority (label 0): 675
Number of 'good' native counterfactuals in data: 1355
Data distribution after iteration 49:
	Majority (1): 3185, Minority (label 0): 677
Number of 'good' native counterfactuals in data: 1355
Data distribution after iteration 50:
	Majority (1): 3185, Minority (label 0): 679
Number of 'good' native counterfactuals in data: 1355
Data distribution after iteration 51:
	Majority (1): 3185, Minority (label 0): 681
Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 52:
	Ma

Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 106:
	Majority (1): 3185, Minority (label 0): 791
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 107:
	Majority (1): 3185, Minority (label 0): 793
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 108:
	Majority (1): 3185, Minority (label 0): 795
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 109:
	Majority (1): 3185, Minority (label 0): 797
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 110:
	Majority (1): 3185, Minority (label 0): 799
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 111:
	Majority (1): 3185, Minority (label 0): 801
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 112:
	Majority (1): 3185, Minority (label 0): 803
Number of 'good' native counterfac

Data distribution after iteration 166:
	Majority (1): 3185, Minority (label 0): 911
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 167:
	Majority (1): 3185, Minority (label 0): 913
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 168:
	Majority (1): 3185, Minority (label 0): 915
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 169:
	Majority (1): 3185, Minority (label 0): 917
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 170:
	Majority (1): 3185, Minority (label 0): 919
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 171:
	Majority (1): 3185, Minority (label 0): 921
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 172:
	Majority (1): 3185, Minority (label 0): 923
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 

Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 227:
	Majority (1): 3185, Minority (label 0): 1033
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 228:
	Majority (1): 3185, Minority (label 0): 1035
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 229:
	Majority (1): 3185, Minority (label 0): 1037
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 230:
	Majority (1): 3185, Minority (label 0): 1039
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 231:
	Majority (1): 3185, Minority (label 0): 1041
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 232:
	Majority (1): 3185, Minority (label 0): 1043
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 233:
	Majority (1): 3185, Minority (label 0): 1045
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 286:
	Majority (1): 3185, Minority (label 0): 1154
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 287:
	Majority (1): 3185, Minority (label 0): 1156
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 288:
	Majority (1): 3185, Minority (label 0): 1158
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 289:
	Majority (1): 3185, Minority (label 0): 1160
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 290:
	Majority (1): 3185, Minority (label 0): 1162
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 291:
	Majority (1): 3185, Minority (label 0): 1164
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 292:
	Majority (1): 3185, Minority (label 0): 1166
Number of 'good' native cou

Data distribution after iteration 348:
	Majority (1): 3185, Minority (label 0): 1278
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 349:
	Majority (1): 3185, Minority (label 0): 1280
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 350:
	Majority (1): 3185, Minority (label 0): 1282
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 351:
	Majority (1): 3185, Minority (label 0): 1284
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 352:
	Majority (1): 3185, Minority (label 0): 1286
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 353:
	Majority (1): 3185, Minority (label 0): 1288
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 354:
	Majority (1): 3185, Minority (label 0): 1290
Number of 'good' native counterfactuals in data: 1377
Data distribution after ite

Data distribution after iteration 407:
	Majority (1): 3185, Minority (label 0): 1396
Number of 'good' native counterfactuals in data: 1378
Data distribution after iteration 408:
	Majority (1): 3185, Minority (label 0): 1398
Number of 'good' native counterfactuals in data: 1378
Data distribution after iteration 409:
	Majority (1): 3185, Minority (label 0): 1400
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 410:
	Majority (1): 3185, Minority (label 0): 1402
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 411:
	Majority (1): 3185, Minority (label 0): 1404
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 412:
	Majority (1): 3185, Minority (label 0): 1406
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 413:
	Majority (1): 3185, Minority (label 0): 1408
Number of 'good' native counterfactuals in data: 1382
Data distribution after ite

Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 470:
	Majority (1): 3185, Minority (label 0): 1522
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 471:
	Majority (1): 3185, Minority (label 0): 1524
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 472:
	Majority (1): 3185, Minority (label 0): 1526
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 473:
	Majority (1): 3185, Minority (label 0): 1528
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 474:
	Majority (1): 3185, Minority (label 0): 1530
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 475:
	Majority (1): 3185, Minority (label 0): 1532
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 476:
	Majority (1): 3185, Minority (label 0): 1534
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1426
Data distribution after iteration 530:
	Majority (1): 3185, Minority (label 0): 1642
Number of 'good' native counterfactuals in data: 1426
Data distribution after iteration 531:
	Majority (1): 3185, Minority (label 0): 1644
Number of 'good' native counterfactuals in data: 1426
Data distribution after iteration 532:
	Majority (1): 3185, Minority (label 0): 1646
Number of 'good' native counterfactuals in data: 1426
Data distribution after iteration 533:
	Majority (1): 3185, Minority (label 0): 1648
Number of 'good' native counterfactuals in data: 1426
Data distribution after iteration 534:
	Majority (1): 3185, Minority (label 0): 1650
Number of 'good' native counterfactuals in data: 1426
Data distribution after iteration 535:
	Majority (1): 3185, Minority (label 0): 1652
Number of 'good' native counterfactuals in data: 1426
Data distribution after iteration 536:
	Majority (1): 3185, Minority (label 0): 1654
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 592:
	Majority (1): 3185, Minority (label 0): 1766
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 593:
	Majority (1): 3185, Minority (label 0): 1768
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 594:
	Majority (1): 3185, Minority (label 0): 1770
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 595:
	Majority (1): 3185, Minority (label 0): 1772
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 596:
	Majority (1): 3185, Minority (label 0): 1774
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 597:
	Majority (1): 3185, Minority (label 0): 1776
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 598:
	Majority (1): 3185, Minority (label 0): 1778
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 652:
	Majority (1): 3185, Minority (label 0): 1886
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 653:
	Majority (1): 3185, Minority (label 0): 1888
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 654:
	Majority (1): 3185, Minority (label 0): 1890
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 655:
	Majority (1): 3185, Minority (label 0): 1892
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 656:
	Majority (1): 3185, Minority (label 0): 1894
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 657:
	Majority (1): 3185, Minority (label 0): 1896
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 658:
	Majority (1): 3185, Minority (label 0): 1898
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1433
Data distribution after iteration 713:
	Majority (1): 3185, Minority (label 0): 2008
Number of 'good' native counterfactuals in data: 1433
Data distribution after iteration 714:
	Majority (1): 3185, Minority (label 0): 2010
Number of 'good' native counterfactuals in data: 1433
Data distribution after iteration 715:
	Majority (1): 3185, Minority (label 0): 2012
Number of 'good' native counterfactuals in data: 1433
Data distribution after iteration 716:
	Majority (1): 3185, Minority (label 0): 2014
Number of 'good' native counterfactuals in data: 1433
Data distribution after iteration 717:
	Majority (1): 3185, Minority (label 0): 2016
Number of 'good' native counterfactuals in data: 1433
Data distribution after iteration 718:
	Majority (1): 3185, Minority (label 0): 2018
Number of 'good' native counterfactuals in data: 1433
Data distribution after iteration 719:
	Majority (1): 3185, Minority (label 0): 2020
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 773:
	Majority (1): 3185, Minority (label 0): 2128
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 774:
	Majority (1): 3185, Minority (label 0): 2130
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 775:
	Majority (1): 3185, Minority (label 0): 2132
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 776:
	Majority (1): 3185, Minority (label 0): 2134
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 777:
	Majority (1): 3185, Minority (label 0): 2136
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 778:
	Majority (1): 3185, Minority (label 0): 2138
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 779:
	Majority (1): 3185, Minority (label 0): 2140
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 833:
	Majority (1): 3185, Minority (label 0): 2248
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 834:
	Majority (1): 3185, Minority (label 0): 2250
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 835:
	Majority (1): 3185, Minority (label 0): 2252
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 836:
	Majority (1): 3185, Minority (label 0): 2254
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 837:
	Majority (1): 3185, Minority (label 0): 2256
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 838:
	Majority (1): 3185, Minority (label 0): 2258
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 839:
	Majority (1): 3185, Minority (label 0): 2260
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 894:
	Majority (1): 3185, Minority (label 0): 2370
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 895:
	Majority (1): 3185, Minority (label 0): 2372
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 896:
	Majority (1): 3185, Minority (label 0): 2374
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 897:
	Majority (1): 3185, Minority (label 0): 2376
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 898:
	Majority (1): 3185, Minority (label 0): 2378
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 899:
	Majority (1): 3185, Minority (label 0): 2380
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 900:
	Majority (1): 3185, Minority (label 0): 2382
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 953:
	Majority (1): 3185, Minority (label 0): 2488
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 954:
	Majority (1): 3185, Minority (label 0): 2490
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 955:
	Majority (1): 3185, Minority (label 0): 2492
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 956:
	Majority (1): 3185, Minority (label 0): 2494
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 957:
	Majority (1): 3185, Minority (label 0): 2496
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 958:
	Majority (1): 3185, Minority (label 0): 2498
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 959:
	Majority (1): 3185, Minority (label 0): 2500
Number of 'good' native cou

Data distribution after iteration 1013:
	Majority (1): 3185, Minority (label 0): 2608
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1014:
	Majority (1): 3185, Minority (label 0): 2610
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1015:
	Majority (1): 3185, Minority (label 0): 2612
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1016:
	Majority (1): 3185, Minority (label 0): 2614
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1017:
	Majority (1): 3185, Minority (label 0): 2616
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1018:
	Majority (1): 3185, Minority (label 0): 2618
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1019:
	Majority (1): 3185, Minority (label 0): 2620
Number of 'good' native counterfactuals in data: 1445
Data distribution af

Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1072:
	Majority (1): 3185, Minority (label 0): 2726
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1073:
	Majority (1): 3185, Minority (label 0): 2728
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1074:
	Majority (1): 3185, Minority (label 0): 2730
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1075:
	Majority (1): 3185, Minority (label 0): 2732
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1076:
	Majority (1): 3185, Minority (label 0): 2734
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1077:
	Majority (1): 3185, Minority (label 0): 2736
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1078:
	Majority (1): 3185, Minority (label 0): 2738
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1132:
	Majority (1): 3185, Minority (label 0): 2846
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1133:
	Majority (1): 3185, Minority (label 0): 2848
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1134:
	Majority (1): 3185, Minority (label 0): 2850
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1135:
	Majority (1): 3185, Minority (label 0): 2852
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1136:
	Majority (1): 3185, Minority (label 0): 2854
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1137:
	Majority (1): 3185, Minority (label 0): 2856
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1138:
	Majority (1): 3185, Minority (label 0): 2858
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1194:
	Majority (1): 3185, Minority (label 0): 2970
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1195:
	Majority (1): 3185, Minority (label 0): 2972
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1196:
	Majority (1): 3185, Minority (label 0): 2974
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1197:
	Majority (1): 3185, Minority (label 0): 2976
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1198:
	Majority (1): 3185, Minority (label 0): 2978
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1199:
	Majority (1): 3185, Minority (label 0): 2980
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1200:
	Majority (1): 3185, Minority (label 0): 2982
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1254:
	Majority (1): 3185, Minority (label 0): 3090
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1255:
	Majority (1): 3185, Minority (label 0): 3092
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1256:
	Majority (1): 3185, Minority (label 0): 3094
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1257:
	Majority (1): 3185, Minority (label 0): 3096
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1258:
	Majority (1): 3185, Minority (label 0): 3098
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1259:
	Majority (1): 3185, Minority (label 0): 3100
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1260:
	Majority (1): 3185, Minority (label 0): 3102
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1075
Data distribution after iteration 26:
	Majority (1): 3173, Minority (label 0): 486
Number of 'good' native counterfactuals in data: 1075
Data distribution after iteration 27:
	Majority (1): 3173, Minority (label 0): 490
Number of 'good' native counterfactuals in data: 1075
Data distribution after iteration 28:
	Majority (1): 3173, Minority (label 0): 494
Number of 'good' native counterfactuals in data: 1074
Data distribution after iteration 29:
	Majority (1): 3173, Minority (label 0): 498
Number of 'good' native counterfactuals in data: 1074
Data distribution after iteration 30:
	Majority (1): 3173, Minority (label 0): 502
Number of 'good' native counterfactuals in data: 1074
Data distribution after iteration 31:
	Majority (1): 3173, Minority (label 0): 506
Number of 'good' native counterfactuals in data: 1074
Data distribution after iteration 32:
	Majority (1): 3173, Minority (label 0): 510
Number of 'good' native counterfactuals i

Data distribution after iteration 88:
	Majority (1): 3173, Minority (label 0): 729
Number of 'good' native counterfactuals in data: 1007
Data distribution after iteration 89:
	Majority (1): 3173, Minority (label 0): 732
Number of 'good' native counterfactuals in data: 1007
Data distribution after iteration 90:
	Majority (1): 3173, Minority (label 0): 735
Number of 'good' native counterfactuals in data: 1007
Data distribution after iteration 91:
	Majority (1): 3173, Minority (label 0): 738
Number of 'good' native counterfactuals in data: 1007
Data distribution after iteration 92:
	Majority (1): 3173, Minority (label 0): 741
Number of 'good' native counterfactuals in data: 1007
Data distribution after iteration 93:
	Majority (1): 3173, Minority (label 0): 744
Number of 'good' native counterfactuals in data: 1006
Data distribution after iteration 94:
	Majority (1): 3173, Minority (label 0): 747
Number of 'good' native counterfactuals in data: 1006
Data distribution after iteration 95:
	Ma

Number of 'good' native counterfactuals in data: 999
Data distribution after iteration 148:
	Majority (1): 3173, Minority (label 0): 909
Number of 'good' native counterfactuals in data: 999
Data distribution after iteration 149:
	Majority (1): 3173, Minority (label 0): 912
Number of 'good' native counterfactuals in data: 999
Data distribution after iteration 150:
	Majority (1): 3173, Minority (label 0): 915
Number of 'good' native counterfactuals in data: 999
Data distribution after iteration 151:
	Majority (1): 3173, Minority (label 0): 918
Number of 'good' native counterfactuals in data: 999
Data distribution after iteration 152:
	Majority (1): 3173, Minority (label 0): 921
Number of 'good' native counterfactuals in data: 999
Data distribution after iteration 153:
	Majority (1): 3173, Minority (label 0): 924
Number of 'good' native counterfactuals in data: 999
Data distribution after iteration 154:
	Majority (1): 3173, Minority (label 0): 927
Number of 'good' native counterfactuals i

Number of 'good' native counterfactuals in data: 987
Data distribution after iteration 211:
	Majority (1): 3173, Minority (label 0): 1098
Number of 'good' native counterfactuals in data: 987
Data distribution after iteration 212:
	Majority (1): 3173, Minority (label 0): 1101
Number of 'good' native counterfactuals in data: 987
Data distribution after iteration 213:
	Majority (1): 3173, Minority (label 0): 1104
Number of 'good' native counterfactuals in data: 987
Data distribution after iteration 214:
	Majority (1): 3173, Minority (label 0): 1107
Number of 'good' native counterfactuals in data: 987
Data distribution after iteration 215:
	Majority (1): 3173, Minority (label 0): 1110
Number of 'good' native counterfactuals in data: 987
Data distribution after iteration 216:
	Majority (1): 3173, Minority (label 0): 1113
Number of 'good' native counterfactuals in data: 987
Data distribution after iteration 217:
	Majority (1): 3173, Minority (label 0): 1116
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 982
Data distribution after iteration 271:
	Majority (1): 3173, Minority (label 0): 1278
Number of 'good' native counterfactuals in data: 982
Data distribution after iteration 272:
	Majority (1): 3173, Minority (label 0): 1281
Number of 'good' native counterfactuals in data: 981
Data distribution after iteration 273:
	Majority (1): 3173, Minority (label 0): 1284
Number of 'good' native counterfactuals in data: 981
Data distribution after iteration 274:
	Majority (1): 3173, Minority (label 0): 1287
Number of 'good' native counterfactuals in data: 981
Data distribution after iteration 275:
	Majority (1): 3173, Minority (label 0): 1290
Number of 'good' native counterfactuals in data: 981
Data distribution after iteration 276:
	Majority (1): 3173, Minority (label 0): 1293
Number of 'good' native counterfactuals in data: 981
Data distribution after iteration 277:
	Majority (1): 3173, Minority (label 0): 1296
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 331:
	Majority (1): 3173, Minority (label 0): 1458
Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 332:
	Majority (1): 3173, Minority (label 0): 1461
Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 333:
	Majority (1): 3173, Minority (label 0): 1464
Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 334:
	Majority (1): 3173, Minority (label 0): 1467
Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 335:
	Majority (1): 3173, Minority (label 0): 1470
Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 336:
	Majority (1): 3173, Minority (label 0): 1473
Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 337:
	Majority (1): 3173, Minority (label 0): 1476
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 391:
	Majority (1): 3173, Minority (label 0): 1638
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 392:
	Majority (1): 3173, Minority (label 0): 1641
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 393:
	Majority (1): 3173, Minority (label 0): 1644
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 394:
	Majority (1): 3173, Minority (label 0): 1647
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 395:
	Majority (1): 3173, Minority (label 0): 1650
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 396:
	Majority (1): 3173, Minority (label 0): 1653
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 397:
	Majority (1): 3173, Minority (label 0): 1656
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 451:
	Majority (1): 3173, Minority (label 0): 1818
Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 452:
	Majority (1): 3173, Minority (label 0): 1821
Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 453:
	Majority (1): 3173, Minority (label 0): 1824
Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 454:
	Majority (1): 3173, Minority (label 0): 1827
Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 455:
	Majority (1): 3173, Minority (label 0): 1830
Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 456:
	Majority (1): 3173, Minority (label 0): 1833
Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 457:
	Majority (1): 3173, Minority (label 0): 1836
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 511:
	Majority (1): 3173, Minority (label 0): 1998
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 512:
	Majority (1): 3173, Minority (label 0): 2001
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 513:
	Majority (1): 3173, Minority (label 0): 2004
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 514:
	Majority (1): 3173, Minority (label 0): 2007
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 515:
	Majority (1): 3173, Minority (label 0): 2010
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 516:
	Majority (1): 3173, Minority (label 0): 2013
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 517:
	Majority (1): 3173, Minority (label 0): 2016
Number of 'good' native counterfac

Data distribution after iteration 574:
	Majority (1): 3173, Minority (label 0): 2187
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 575:
	Majority (1): 3173, Minority (label 0): 2190
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 576:
	Majority (1): 3173, Minority (label 0): 2193
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 577:
	Majority (1): 3173, Minority (label 0): 2196
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 578:
	Majority (1): 3173, Minority (label 0): 2199
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 579:
	Majority (1): 3173, Minority (label 0): 2202
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 580:
	Majority (1): 3173, Minority (label 0): 2205
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 

Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 636:
	Majority (1): 3173, Minority (label 0): 2373
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 637:
	Majority (1): 3173, Minority (label 0): 2376
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 638:
	Majority (1): 3173, Minority (label 0): 2379
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 639:
	Majority (1): 3173, Minority (label 0): 2382
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 640:
	Majority (1): 3173, Minority (label 0): 2385
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 641:
	Majority (1): 3173, Minority (label 0): 2388
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 642:
	Majority (1): 3173, Minority (label 0): 2391
Number of 'good' native counterfac

Data distribution after iteration 697:
	Majority (1): 3173, Minority (label 0): 2556
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 698:
	Majority (1): 3173, Minority (label 0): 2559
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 699:
	Majority (1): 3173, Minority (label 0): 2562
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 700:
	Majority (1): 3173, Minority (label 0): 2565
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 701:
	Majority (1): 3173, Minority (label 0): 2568
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 702:
	Majority (1): 3173, Minority (label 0): 2571
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 703:
	Majority (1): 3173, Minority (label 0): 2574
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 

Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 760:
	Majority (1): 3173, Minority (label 0): 2745
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 761:
	Majority (1): 3173, Minority (label 0): 2748
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 762:
	Majority (1): 3173, Minority (label 0): 2751
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 763:
	Majority (1): 3173, Minority (label 0): 2754
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 764:
	Majority (1): 3173, Minority (label 0): 2757
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 765:
	Majority (1): 3173, Minority (label 0): 2760
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 766:
	Majority (1): 3173, Minority (label 0): 2763
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 820:
	Majority (1): 3173, Minority (label 0): 2925
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 821:
	Majority (1): 3173, Minority (label 0): 2928
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 822:
	Majority (1): 3173, Minority (label 0): 2931
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 823:
	Majority (1): 3173, Minority (label 0): 2934
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 824:
	Majority (1): 3173, Minority (label 0): 2937
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 825:
	Majority (1): 3173, Minority (label 0): 2940
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 826:
	Majority (1): 3173, Minority (label 0): 2943
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 880:
	Majority (1): 3173, Minority (label 0): 3105
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 881:
	Majority (1): 3173, Minority (label 0): 3108
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 882:
	Majority (1): 3173, Minority (label 0): 3111
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 883:
	Majority (1): 3173, Minority (label 0): 3114
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 884:
	Majority (1): 3173, Minority (label 0): 3117
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 885:
	Majority (1): 3173, Minority (label 0): 3120
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 886:
	Majority (1): 3173, Minority (label 0): 3123
Number of 'good' native counterfac

Data distribution after iteration 47:
	Majority (1): 3172, Minority (label 0): 387
Number of 'good' native counterfactuals in data: 995
Data distribution after iteration 48:
	Majority (1): 3172, Minority (label 0): 389
Number of 'good' native counterfactuals in data: 991
Data distribution after iteration 49:
	Majority (1): 3172, Minority (label 0): 391
Number of 'good' native counterfactuals in data: 991
Data distribution after iteration 50:
	Majority (1): 3172, Minority (label 0): 393
Number of 'good' native counterfactuals in data: 991
Data distribution after iteration 51:
	Majority (1): 3172, Minority (label 0): 395
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 52:
	Majority (1): 3172, Minority (label 0): 397
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 53:
	Majority (1): 3172, Minority (label 0): 399
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 54:
	Majority 

In [14]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 10, 'n_estimators': 600}
Best AUC: 0.7317163918516486


#### Approach 2: Verification of new synthetic counterfactuals with baseline model

In [11]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    print("Fitting baseline model needed for verification... ")
    baseline_search = GridSearchCV(RandomForestClassifier(random_state=19231823), rf_param_grid, scoring="roc_auc")
    baseline_search.fit(X_train, y_train)
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=True, 
                                                         baseline_model=baseline_search,
                                                         visualize_with_pca=False)
    
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Fitting baseline model needed for verification... 
Data distribution before CFA:
	Majority (1): 3183, Minority (label 0): 158
Number of 'good' native counterfactuals in data: 1378
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 3183, Minority (label 0): 158
Fold 1:
Fitting baseline model needed for verification... 
Data distribution before CFA:
	Majority (1): 3185, Minority (label 0): 156
Number of 'good' native counterfactuals in data: 1056
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 3185, Minority (label 0): 156
Fold 2:
Fitting baseline model needed for verification... 
Data distribution before CFA:
	Majority (1): 3173, Minority (label 0): 169
Number of 'good' native counterfactuals in data: 1029
Classifier predicted all n

In [16]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 6, 'n_estimators': 600}
Best AUC: 0.7395650202089131


### ADASYN instead of CFA

In [17]:
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=9317231)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [19]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 10, 'n_estimators': 400}
Best AUC: 0.7172285092627817


### Random Oversampling

In [20]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=52012318)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [22]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 6, 'n_estimators': 100}
Best AUC: 0.7175582926156095


## Logistic Regression 

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
lr_param_grid = {"max_iter": [100, 200, 1000], 
                 "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                 "solver": ["newton-cg", "lbfgs", "liblinear", "sag"]}

# compute all combinations of parameters
combination_dicts = list(ParameterGrid(lr_param_grid))

### Baseline (no augmentation)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train, y_train)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [26]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Best AUC: 0.7342425375116226


Again, we can compare to an out-of-the-box grid search to make sure:

In [27]:
baseline_search = GridSearchCV(LogisticRegression(random_state=19231823), lr_param_grid, scoring="roc_auc")
baseline_search.fit(X, y)
baseline_search.best_score_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.7366341749344969

### With CFA

#### Approach 1: No verification 

In [14]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=False, 
                                                         visualize_with_pca=False)

    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Data distribution before CFA:
	Majority (1): 3183, Minority (label 0): 158
Number of 'good' native counterfactuals in data: 1378
Data distribution after iteration 0:
	Majority (1): 3183, Minority (label 0): 310
Number of 'good' native counterfactuals in data: 1125
Data distribution after iteration 1:
	Majority (1): 3183, Minority (label 0): 347
Number of 'good' native counterfactuals in data: 1136
Data distribution after iteration 2:
	Majority (1): 3183, Minority (label 0): 353
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 3:
	Majority (1): 3183, Minority (label 0): 354
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 4:
	Majority (1): 3183, Minority (label 0): 355
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 5:
	Majority (1): 3183, Minority (label 0): 356
Number of 'good' native counterfactuals in data: 1141
Data distribution after iteration 6:
	Majority 

Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 65:
	Majority (1): 3183, Minority (label 0): 416
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 66:
	Majority (1): 3183, Minority (label 0): 417
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 67:
	Majority (1): 3183, Minority (label 0): 418
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 68:
	Majority (1): 3183, Minority (label 0): 419
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 69:
	Majority (1): 3183, Minority (label 0): 420
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 70:
	Majority (1): 3183, Minority (label 0): 421
Number of 'good' native counterfactuals in data: 1137
Data distribution after iteration 71:
	Majority (1): 3183, Minority (label 0): 422
Number of 'good' native counterfactuals i

Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 131:
	Majority (1): 3183, Minority (label 0): 482
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 132:
	Majority (1): 3183, Minority (label 0): 483
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 133:
	Majority (1): 3183, Minority (label 0): 484
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 134:
	Majority (1): 3183, Minority (label 0): 485
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 135:
	Majority (1): 3183, Minority (label 0): 486
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 136:
	Majority (1): 3183, Minority (label 0): 487
Number of 'good' native counterfactuals in data: 1133
Data distribution after iteration 137:
	Majority (1): 3183, Minority (label 0): 488
Number of 'good' native counterfac

Data distribution after iteration 190:
	Majority (1): 3183, Minority (label 0): 541
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 191:
	Majority (1): 3183, Minority (label 0): 542
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 192:
	Majority (1): 3183, Minority (label 0): 543
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 193:
	Majority (1): 3183, Minority (label 0): 544
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 194:
	Majority (1): 3183, Minority (label 0): 545
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 195:
	Majority (1): 3183, Minority (label 0): 546
Number of 'good' native counterfactuals in data: 1124
Data distribution after iteration 196:
	Majority (1): 3183, Minority (label 0): 547
Number of 'good' native counterfactuals in data: 1099
Data distribution after iteration 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1:
Data distribution before CFA:
	Majority (1): 3185, Minority (label 0): 156
Number of 'good' native counterfactuals in data: 1056
Data distribution after iteration 0:
	Majority (1): 3185, Minority (label 0): 467
Number of 'good' native counterfactuals in data: 1264
Data distribution after iteration 1:
	Majority (1): 3185, Minority (label 0): 568
Number of 'good' native counterfactuals in data: 1344
Data distribution after iteration 2:
	Majority (1): 3185, Minority (label 0): 581
Number of 'good' native counterfactuals in data: 1353
Data distribution after iteration 3:
	Majority (1): 3185, Minority (label 0): 585
Number of 'good' native counterfactuals in data: 1354
Data distribution after iteration 4:
	Majority (1): 3185, Minority (label 0): 587
Number of 'good' native counterfactuals in data: 1354
Data distribution after iteration 5:
	Majority (1): 3185, Minority (label 0): 589
Number of 'good' native counterfactuals in data: 1354
Data distribution after iteration 6:
	Majority 

Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 61:
	Majority (1): 3185, Minority (label 0): 701
Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 62:
	Majority (1): 3185, Minority (label 0): 703
Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 63:
	Majority (1): 3185, Minority (label 0): 705
Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 64:
	Majority (1): 3185, Minority (label 0): 707
Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 65:
	Majority (1): 3185, Minority (label 0): 709
Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 66:
	Majority (1): 3185, Minority (label 0): 711
Number of 'good' native counterfactuals in data: 1356
Data distribution after iteration 67:
	Majority (1): 3185, Minority (label 0): 713
Number of 'good' native counterfactuals i

Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 121:
	Majority (1): 3185, Minority (label 0): 821
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 122:
	Majority (1): 3185, Minority (label 0): 823
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 123:
	Majority (1): 3185, Minority (label 0): 825
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 124:
	Majority (1): 3185, Minority (label 0): 827
Number of 'good' native counterfactuals in data: 1357
Data distribution after iteration 125:
	Majority (1): 3185, Minority (label 0): 829
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 126:
	Majority (1): 3185, Minority (label 0): 831
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 127:
	Majority (1): 3185, Minority (label 0): 833
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 182:
	Majority (1): 3185, Minority (label 0): 943
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 183:
	Majority (1): 3185, Minority (label 0): 945
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 184:
	Majority (1): 3185, Minority (label 0): 947
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 185:
	Majority (1): 3185, Minority (label 0): 949
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 186:
	Majority (1): 3185, Minority (label 0): 951
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 187:
	Majority (1): 3185, Minority (label 0): 953
Number of 'good' native counterfactuals in data: 1358
Data distribution after iteration 188:
	Majority (1): 3185, Minority (label 0): 955
Number of 'good' native counterfac

Data distribution after iteration 245:
	Majority (1): 3185, Minority (label 0): 1069
Number of 'good' native counterfactuals in data: 1362
Data distribution after iteration 246:
	Majority (1): 3185, Minority (label 0): 1071
Number of 'good' native counterfactuals in data: 1362
Data distribution after iteration 247:
	Majority (1): 3185, Minority (label 0): 1073
Number of 'good' native counterfactuals in data: 1362
Data distribution after iteration 248:
	Majority (1): 3185, Minority (label 0): 1075
Number of 'good' native counterfactuals in data: 1362
Data distribution after iteration 249:
	Majority (1): 3185, Minority (label 0): 1077
Number of 'good' native counterfactuals in data: 1362
Data distribution after iteration 250:
	Majority (1): 3185, Minority (label 0): 1079
Number of 'good' native counterfactuals in data: 1362
Data distribution after iteration 251:
	Majority (1): 3185, Minority (label 0): 1081
Number of 'good' native counterfactuals in data: 1362
Data distribution after ite

Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 308:
	Majority (1): 3185, Minority (label 0): 1198
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 309:
	Majority (1): 3185, Minority (label 0): 1200
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 310:
	Majority (1): 3185, Minority (label 0): 1202
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 311:
	Majority (1): 3185, Minority (label 0): 1204
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 312:
	Majority (1): 3185, Minority (label 0): 1206
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 313:
	Majority (1): 3185, Minority (label 0): 1208
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 314:
	Majority (1): 3185, Minority (label 0): 1210
Number of 'good' native cou

Data distribution after iteration 370:
	Majority (1): 3185, Minority (label 0): 1322
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 371:
	Majority (1): 3185, Minority (label 0): 1324
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 372:
	Majority (1): 3185, Minority (label 0): 1326
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 373:
	Majority (1): 3185, Minority (label 0): 1328
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 374:
	Majority (1): 3185, Minority (label 0): 1330
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 375:
	Majority (1): 3185, Minority (label 0): 1332
Number of 'good' native counterfactuals in data: 1377
Data distribution after iteration 376:
	Majority (1): 3185, Minority (label 0): 1334
Number of 'good' native counterfactuals in data: 1377
Data distribution after ite

Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 430:
	Majority (1): 3185, Minority (label 0): 1442
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 431:
	Majority (1): 3185, Minority (label 0): 1444
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 432:
	Majority (1): 3185, Minority (label 0): 1446
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 433:
	Majority (1): 3185, Minority (label 0): 1448
Number of 'good' native counterfactuals in data: 1382
Data distribution after iteration 434:
	Majority (1): 3185, Minority (label 0): 1450
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 435:
	Majority (1): 3185, Minority (label 0): 1452
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 436:
	Majority (1): 3185, Minority (label 0): 1454
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 490:
	Majority (1): 3185, Minority (label 0): 1562
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 491:
	Majority (1): 3185, Minority (label 0): 1564
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 492:
	Majority (1): 3185, Minority (label 0): 1566
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 493:
	Majority (1): 3185, Minority (label 0): 1568
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 494:
	Majority (1): 3185, Minority (label 0): 1570
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 495:
	Majority (1): 3185, Minority (label 0): 1572
Number of 'good' native counterfactuals in data: 1385
Data distribution after iteration 496:
	Majority (1): 3185, Minority (label 0): 1574
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1427
Data distribution after iteration 550:
	Majority (1): 3185, Minority (label 0): 1682
Number of 'good' native counterfactuals in data: 1427
Data distribution after iteration 551:
	Majority (1): 3185, Minority (label 0): 1684
Number of 'good' native counterfactuals in data: 1427
Data distribution after iteration 552:
	Majority (1): 3185, Minority (label 0): 1686
Number of 'good' native counterfactuals in data: 1427
Data distribution after iteration 553:
	Majority (1): 3185, Minority (label 0): 1688
Number of 'good' native counterfactuals in data: 1427
Data distribution after iteration 554:
	Majority (1): 3185, Minority (label 0): 1690
Number of 'good' native counterfactuals in data: 1427
Data distribution after iteration 555:
	Majority (1): 3185, Minority (label 0): 1692
Number of 'good' native counterfactuals in data: 1427
Data distribution after iteration 556:
	Majority (1): 3185, Minority (label 0): 1694
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 609:
	Majority (1): 3185, Minority (label 0): 1800
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 610:
	Majority (1): 3185, Minority (label 0): 1802
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 611:
	Majority (1): 3185, Minority (label 0): 1804
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 612:
	Majority (1): 3185, Minority (label 0): 1806
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 613:
	Majority (1): 3185, Minority (label 0): 1808
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 614:
	Majority (1): 3185, Minority (label 0): 1810
Number of 'good' native counterfactuals in data: 1431
Data distribution after iteration 615:
	Majority (1): 3185, Minority (label 0): 1812
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 668:
	Majority (1): 3185, Minority (label 0): 1918
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 669:
	Majority (1): 3185, Minority (label 0): 1920
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 670:
	Majority (1): 3185, Minority (label 0): 1922
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 671:
	Majority (1): 3185, Minority (label 0): 1924
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 672:
	Majority (1): 3185, Minority (label 0): 1926
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 673:
	Majority (1): 3185, Minority (label 0): 1928
Number of 'good' native counterfactuals in data: 1432
Data distribution after iteration 674:
	Majority (1): 3185, Minority (label 0): 1930
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1437
Data distribution after iteration 728:
	Majority (1): 3185, Minority (label 0): 2038
Number of 'good' native counterfactuals in data: 1437
Data distribution after iteration 729:
	Majority (1): 3185, Minority (label 0): 2040
Number of 'good' native counterfactuals in data: 1437
Data distribution after iteration 730:
	Majority (1): 3185, Minority (label 0): 2042
Number of 'good' native counterfactuals in data: 1437
Data distribution after iteration 731:
	Majority (1): 3185, Minority (label 0): 2044
Number of 'good' native counterfactuals in data: 1437
Data distribution after iteration 732:
	Majority (1): 3185, Minority (label 0): 2046
Number of 'good' native counterfactuals in data: 1437
Data distribution after iteration 733:
	Majority (1): 3185, Minority (label 0): 2048
Number of 'good' native counterfactuals in data: 1437
Data distribution after iteration 734:
	Majority (1): 3185, Minority (label 0): 2050
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 788:
	Majority (1): 3185, Minority (label 0): 2158
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 789:
	Majority (1): 3185, Minority (label 0): 2160
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 790:
	Majority (1): 3185, Minority (label 0): 2162
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 791:
	Majority (1): 3185, Minority (label 0): 2164
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 792:
	Majority (1): 3185, Minority (label 0): 2166
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 793:
	Majority (1): 3185, Minority (label 0): 2168
Number of 'good' native counterfactuals in data: 1438
Data distribution after iteration 794:
	Majority (1): 3185, Minority (label 0): 2170
Number of 'good' native cou

Data distribution after iteration 847:
	Majority (1): 3185, Minority (label 0): 2276
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 848:
	Majority (1): 3185, Minority (label 0): 2278
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 849:
	Majority (1): 3185, Minority (label 0): 2280
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 850:
	Majority (1): 3185, Minority (label 0): 2282
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 851:
	Majority (1): 3185, Minority (label 0): 2284
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 852:
	Majority (1): 3185, Minority (label 0): 2286
Number of 'good' native counterfactuals in data: 1439
Data distribution after iteration 853:
	Majority (1): 3185, Minority (label 0): 2288
Number of 'good' native counterfactuals in data: 1439
Data distribution after ite

Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 908:
	Majority (1): 3185, Minority (label 0): 2398
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 909:
	Majority (1): 3185, Minority (label 0): 2400
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 910:
	Majority (1): 3185, Minority (label 0): 2402
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 911:
	Majority (1): 3185, Minority (label 0): 2404
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 912:
	Majority (1): 3185, Minority (label 0): 2406
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 913:
	Majority (1): 3185, Minority (label 0): 2408
Number of 'good' native counterfactuals in data: 1444
Data distribution after iteration 914:
	Majority (1): 3185, Minority (label 0): 2410
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 970:
	Majority (1): 3185, Minority (label 0): 2522
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 971:
	Majority (1): 3185, Minority (label 0): 2524
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 972:
	Majority (1): 3185, Minority (label 0): 2526
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 973:
	Majority (1): 3185, Minority (label 0): 2528
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 974:
	Majority (1): 3185, Minority (label 0): 2530
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 975:
	Majority (1): 3185, Minority (label 0): 2532
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 976:
	Majority (1): 3185, Minority (label 0): 2534
Number of 'good' native cou

Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1030:
	Majority (1): 3185, Minority (label 0): 2642
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1031:
	Majority (1): 3185, Minority (label 0): 2644
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1032:
	Majority (1): 3185, Minority (label 0): 2646
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1033:
	Majority (1): 3185, Minority (label 0): 2648
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1034:
	Majority (1): 3185, Minority (label 0): 2650
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1035:
	Majority (1): 3185, Minority (label 0): 2652
Number of 'good' native counterfactuals in data: 1445
Data distribution after iteration 1036:
	Majority (1): 3185, Minority (label 0): 2654
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1090:
	Majority (1): 3185, Minority (label 0): 2762
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1091:
	Majority (1): 3185, Minority (label 0): 2764
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1092:
	Majority (1): 3185, Minority (label 0): 2766
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1093:
	Majority (1): 3185, Minority (label 0): 2768
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1094:
	Majority (1): 3185, Minority (label 0): 2770
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1095:
	Majority (1): 3185, Minority (label 0): 2772
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1096:
	Majority (1): 3185, Minority (label 0): 2774
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1149:
	Majority (1): 3185, Minority (label 0): 2880
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1150:
	Majority (1): 3185, Minority (label 0): 2882
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1151:
	Majority (1): 3185, Minority (label 0): 2884
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1152:
	Majority (1): 3185, Minority (label 0): 2886
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1153:
	Majority (1): 3185, Minority (label 0): 2888
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1154:
	Majority (1): 3185, Minority (label 0): 2890
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1155:
	Majority (1): 3185, Minority (label 0): 2892
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1211:
	Majority (1): 3185, Minority (label 0): 3004
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1212:
	Majority (1): 3185, Minority (label 0): 3006
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1213:
	Majority (1): 3185, Minority (label 0): 3008
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1214:
	Majority (1): 3185, Minority (label 0): 3010
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1215:
	Majority (1): 3185, Minority (label 0): 3012
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1216:
	Majority (1): 3185, Minority (label 0): 3014
Number of 'good' native counterfactuals in data: 1451
Data distribution after iteration 1217:
	Majority (1): 3185, Minority (label 0): 3016
Number of 'good' nat

Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1270:
	Majority (1): 3185, Minority (label 0): 3122
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1271:
	Majority (1): 3185, Minority (label 0): 3124
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1272:
	Majority (1): 3185, Minority (label 0): 3126
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1273:
	Majority (1): 3185, Minority (label 0): 3128
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1274:
	Majority (1): 3185, Minority (label 0): 3130
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1275:
	Majority (1): 3185, Minority (label 0): 3132
Number of 'good' native counterfactuals in data: 1458
Data distribution after iteration 1276:
	Majority (1): 3185, Minority (label 0): 3134
Number of 'good' nat

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2:
Data distribution before CFA:
	Majority (1): 3173, Minority (label 0): 169
Number of 'good' native counterfactuals in data: 1029
Data distribution after iteration 0:
	Majority (1): 3173, Minority (label 0): 346
Number of 'good' native counterfactuals in data: 1100
Data distribution after iteration 1:
	Majority (1): 3173, Minority (label 0): 386
Number of 'good' native counterfactuals in data: 1122
Data distribution after iteration 2:
	Majority (1): 3173, Minority (label 0): 390
Number of 'good' native counterfactuals in data: 1122
Data distribution after iteration 3:
	Majority (1): 3173, Minority (label 0): 394
Number of 'good' native counterfactuals in data: 1092
Data distribution after iteration 4:
	Majority (1): 3173, Minority (label 0): 398
Number of 'good' native counterfactuals in data: 1092
Data distribution after iteration 5:
	Majority (1): 3173, Minority (label 0): 402
Number of 'good' native counterfactuals in data: 1092
Data distribution after iteration 6:
	Majority 

Number of 'good' native counterfactuals in data: 1014
Data distribution after iteration 63:
	Majority (1): 3173, Minority (label 0): 634
Number of 'good' native counterfactuals in data: 1014
Data distribution after iteration 64:
	Majority (1): 3173, Minority (label 0): 638
Number of 'good' native counterfactuals in data: 1014
Data distribution after iteration 65:
	Majority (1): 3173, Minority (label 0): 642
Number of 'good' native counterfactuals in data: 1014
Data distribution after iteration 66:
	Majority (1): 3173, Minority (label 0): 646
Number of 'good' native counterfactuals in data: 1014
Data distribution after iteration 67:
	Majority (1): 3173, Minority (label 0): 650
Number of 'good' native counterfactuals in data: 1014
Data distribution after iteration 68:
	Majority (1): 3173, Minority (label 0): 654
Number of 'good' native counterfactuals in data: 1014
Data distribution after iteration 69:
	Majority (1): 3173, Minority (label 0): 658
Number of 'good' native counterfactuals i

Number of 'good' native counterfactuals in data: 1001
Data distribution after iteration 126:
	Majority (1): 3173, Minority (label 0): 843
Number of 'good' native counterfactuals in data: 1001
Data distribution after iteration 127:
	Majority (1): 3173, Minority (label 0): 846
Number of 'good' native counterfactuals in data: 1001
Data distribution after iteration 128:
	Majority (1): 3173, Minority (label 0): 849
Number of 'good' native counterfactuals in data: 1001
Data distribution after iteration 129:
	Majority (1): 3173, Minority (label 0): 852
Number of 'good' native counterfactuals in data: 1001
Data distribution after iteration 130:
	Majority (1): 3173, Minority (label 0): 855
Number of 'good' native counterfactuals in data: 1001
Data distribution after iteration 131:
	Majority (1): 3173, Minority (label 0): 858
Number of 'good' native counterfactuals in data: 1001
Data distribution after iteration 132:
	Majority (1): 3173, Minority (label 0): 861
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 192:
	Majority (1): 3173, Minority (label 0): 1041
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 193:
	Majority (1): 3173, Minority (label 0): 1044
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 194:
	Majority (1): 3173, Minority (label 0): 1047
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 195:
	Majority (1): 3173, Minority (label 0): 1050
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 196:
	Majority (1): 3173, Minority (label 0): 1053
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 197:
	Majority (1): 3173, Minority (label 0): 1056
Number of 'good' native counterfactuals in data: 990
Data distribution after iteration 198:
	Majority (1): 3173, Minority (label 0): 1059
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 983
Data distribution after iteration 252:
	Majority (1): 3173, Minority (label 0): 1221
Number of 'good' native counterfactuals in data: 983
Data distribution after iteration 253:
	Majority (1): 3173, Minority (label 0): 1224
Number of 'good' native counterfactuals in data: 983
Data distribution after iteration 254:
	Majority (1): 3173, Minority (label 0): 1227
Number of 'good' native counterfactuals in data: 982
Data distribution after iteration 255:
	Majority (1): 3173, Minority (label 0): 1230
Number of 'good' native counterfactuals in data: 982
Data distribution after iteration 256:
	Majority (1): 3173, Minority (label 0): 1233
Number of 'good' native counterfactuals in data: 982
Data distribution after iteration 257:
	Majority (1): 3173, Minority (label 0): 1236
Number of 'good' native counterfactuals in data: 982
Data distribution after iteration 258:
	Majority (1): 3173, Minority (label 0): 1239
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 976
Data distribution after iteration 312:
	Majority (1): 3173, Minority (label 0): 1401
Number of 'good' native counterfactuals in data: 976
Data distribution after iteration 313:
	Majority (1): 3173, Minority (label 0): 1404
Number of 'good' native counterfactuals in data: 976
Data distribution after iteration 314:
	Majority (1): 3173, Minority (label 0): 1407
Number of 'good' native counterfactuals in data: 976
Data distribution after iteration 315:
	Majority (1): 3173, Minority (label 0): 1410
Number of 'good' native counterfactuals in data: 973
Data distribution after iteration 316:
	Majority (1): 3173, Minority (label 0): 1413
Number of 'good' native counterfactuals in data: 973
Data distribution after iteration 317:
	Majority (1): 3173, Minority (label 0): 1416
Number of 'good' native counterfactuals in data: 943
Data distribution after iteration 318:
	Majority (1): 3173, Minority (label 0): 1419
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 372:
	Majority (1): 3173, Minority (label 0): 1581
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 373:
	Majority (1): 3173, Minority (label 0): 1584
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 374:
	Majority (1): 3173, Minority (label 0): 1587
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 375:
	Majority (1): 3173, Minority (label 0): 1590
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 376:
	Majority (1): 3173, Minority (label 0): 1593
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 377:
	Majority (1): 3173, Minority (label 0): 1596
Number of 'good' native counterfactuals in data: 933
Data distribution after iteration 378:
	Majority (1): 3173, Minority (label 0): 1599
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 932
Data distribution after iteration 432:
	Majority (1): 3173, Minority (label 0): 1761
Number of 'good' native counterfactuals in data: 932
Data distribution after iteration 433:
	Majority (1): 3173, Minority (label 0): 1764
Number of 'good' native counterfactuals in data: 932
Data distribution after iteration 434:
	Majority (1): 3173, Minority (label 0): 1767
Number of 'good' native counterfactuals in data: 932
Data distribution after iteration 435:
	Majority (1): 3173, Minority (label 0): 1770
Number of 'good' native counterfactuals in data: 932
Data distribution after iteration 436:
	Majority (1): 3173, Minority (label 0): 1773
Number of 'good' native counterfactuals in data: 932
Data distribution after iteration 437:
	Majority (1): 3173, Minority (label 0): 1776
Number of 'good' native counterfactuals in data: 708
Data distribution after iteration 438:
	Majority (1): 3173, Minority (label 0): 1779
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 494:
	Majority (1): 3173, Minority (label 0): 1947
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 495:
	Majority (1): 3173, Minority (label 0): 1950
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 496:
	Majority (1): 3173, Minority (label 0): 1953
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 497:
	Majority (1): 3173, Minority (label 0): 1956
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 498:
	Majority (1): 3173, Minority (label 0): 1959
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 499:
	Majority (1): 3173, Minority (label 0): 1962
Number of 'good' native counterfactuals in data: 706
Data distribution after iteration 500:
	Majority (1): 3173, Minority (label 0): 1965
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 558:
	Majority (1): 3173, Minority (label 0): 2139
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 559:
	Majority (1): 3173, Minority (label 0): 2142
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 560:
	Majority (1): 3173, Minority (label 0): 2145
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 561:
	Majority (1): 3173, Minority (label 0): 2148
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 562:
	Majority (1): 3173, Minority (label 0): 2151
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 563:
	Majority (1): 3173, Minority (label 0): 2154
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 564:
	Majority (1): 3173, Minority (label 0): 2157
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 620:
	Majority (1): 3173, Minority (label 0): 2325
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 621:
	Majority (1): 3173, Minority (label 0): 2328
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 622:
	Majority (1): 3173, Minority (label 0): 2331
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 623:
	Majority (1): 3173, Minority (label 0): 2334
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 624:
	Majority (1): 3173, Minority (label 0): 2337
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 625:
	Majority (1): 3173, Minority (label 0): 2340
Number of 'good' native counterfactuals in data: 704
Data distribution after iteration 626:
	Majority (1): 3173, Minority (label 0): 2343
Number of 'good' native counterfac

Data distribution after iteration 682:
	Majority (1): 3173, Minority (label 0): 2511
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 683:
	Majority (1): 3173, Minority (label 0): 2514
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 684:
	Majority (1): 3173, Minority (label 0): 2517
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 685:
	Majority (1): 3173, Minority (label 0): 2520
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 686:
	Majority (1): 3173, Minority (label 0): 2523
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 687:
	Majority (1): 3173, Minority (label 0): 2526
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 688:
	Majority (1): 3173, Minority (label 0): 2529
Number of 'good' native counterfactuals in data: 716
Data distribution after iteration 

Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 745:
	Majority (1): 3173, Minority (label 0): 2700
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 746:
	Majority (1): 3173, Minority (label 0): 2703
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 747:
	Majority (1): 3173, Minority (label 0): 2706
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 748:
	Majority (1): 3173, Minority (label 0): 2709
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 749:
	Majority (1): 3173, Minority (label 0): 2712
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 750:
	Majority (1): 3173, Minority (label 0): 2715
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 751:
	Majority (1): 3173, Minority (label 0): 2718
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 808:
	Majority (1): 3173, Minority (label 0): 2889
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 809:
	Majority (1): 3173, Minority (label 0): 2892
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 810:
	Majority (1): 3173, Minority (label 0): 2895
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 811:
	Majority (1): 3173, Minority (label 0): 2898
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 812:
	Majority (1): 3173, Minority (label 0): 2901
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 813:
	Majority (1): 3173, Minority (label 0): 2904
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 814:
	Majority (1): 3173, Minority (label 0): 2907
Number of 'good' native counterfac

Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 871:
	Majority (1): 3173, Minority (label 0): 3078
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 872:
	Majority (1): 3173, Minority (label 0): 3081
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 873:
	Majority (1): 3173, Minority (label 0): 3084
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 874:
	Majority (1): 3173, Minority (label 0): 3087
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 875:
	Majority (1): 3173, Minority (label 0): 3090
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 876:
	Majority (1): 3173, Minority (label 0): 3093
Number of 'good' native counterfactuals in data: 747
Data distribution after iteration 877:
	Majority (1): 3173, Minority (label 0): 3096
Number of 'good' native counterfac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3:
Data distribution before CFA:
	Majority (1): 3172, Minority (label 0): 170
Number of 'good' native counterfactuals in data: 1031
Data distribution after iteration 0:
	Majority (1): 3172, Minority (label 0): 277
Number of 'good' native counterfactuals in data: 1027
Data distribution after iteration 1:
	Majority (1): 3172, Minority (label 0): 293
Number of 'good' native counterfactuals in data: 1032
Data distribution after iteration 2:
	Majority (1): 3172, Minority (label 0): 297
Number of 'good' native counterfactuals in data: 1033
Data distribution after iteration 3:
	Majority (1): 3172, Minority (label 0): 299
Number of 'good' native counterfactuals in data: 1031
Data distribution after iteration 4:
	Majority (1): 3172, Minority (label 0): 301
Number of 'good' native counterfactuals in data: 1031
Data distribution after iteration 5:
	Majority (1): 3172, Minority (label 0): 303
Number of 'good' native counterfactuals in data: 1030
Data distribution after iteration 6:
	Majority 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 4:
Data distribution before CFA:
	Majority (1): 3183, Minority (label 0): 159
Number of 'good' native counterfactuals in data: 1015
Data distribution after iteration 0:
	Majority (1): 3183, Minority (label 0): 279
Number of 'good' native counterfactuals in data: 1076
Data distribution after iteration 1:
	Majority (1): 3183, Minority (label 0): 292
Number of 'good' native counterfactuals in data: 1080
Data distribution after iteration 2:
	Majority (1): 3183, Minority (label 0): 293
Number of 'good' native counterfactuals in data: 1079
No unpaired instances found, no augmentation possible! Not applying CFA...
Data distribution after CFA:
	Majority (1): 3183, Minority (label 0): 293


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [29]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1, 'max_iter': 100, 'solver': 'lbfgs'}
Best AUC: 0.626377917754169


#### Approach 2: Verification of new synthetic counterfactuals with baseline model

In [15]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    print("Fitting baseline model needed for verification... ")
    baseline_search = GridSearchCV(LogisticRegression(random_state=19231823), lr_param_grid, scoring="roc_auc")
    baseline_search.fit(X_train, y_train)
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=True, 
                                                         baseline_model=baseline_search,
                                                         visualize_with_pca=False)

    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 3183, Minority (label 0): 158
Number of 'good' native counterfactuals in data: 1378
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 3183, Minority (label 0): 158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 3185, Minority (label 0): 156
Number of 'good' native counterfactuals in data: 1056
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 3185, Minority (label 0): 156


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 3173, Minority (label 0): 169
Number of 'good' native counterfactuals in data: 1029
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 3173, Minority (label 0): 169




Fold 3:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 3172, Minority (label 0): 170
Number of 'good' native counterfactuals in data: 1031
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 3172, Minority (label 0): 170


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 3183, Minority (label 0): 159
Number of 'good' native counterfactuals in data: 1015
Classifier predicted all new synthetic counterfactuals to be in the majority class! => No new minority instances => Terminating...
Data distribution after CFA:
	Majority (1): 3183, Minority (label 0): 159




In [31]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}
Best AUC: 0.7342425375116226


### ADASYN

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [33]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
Best AUC: 0.7379872972484532


### Random Oversampling

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [35]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1, 'max_iter': 100, 'solver': 'liblinear'}
Best AUC: 0.7374792368165206
