# Reproduction of Experiments on Dataset D9

## Read data

In [1]:
import sys 
import os
sys.path.insert(0, os.path.join("..", "src"))
import pandas as pd
%matplotlib inline

names = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings"] 

path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
df = pd.read_csv(path, header=None, names=names)
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [2]:
# preprocess categorical column
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df["Sex"] = label_encoder.fit_transform(df["Sex"])
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,2,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,2,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,2,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,1,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,2,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,2,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [3]:
label_column = "Rings"

majority_labels = [9]
minority_labels = [16]

df_filtered = df[df[label_column].isin(majority_labels + minority_labels)].copy()

df_filtered.loc[df_filtered[label_column].isin(majority_labels), label_column] = 1
df_filtered.loc[df_filtered[label_column].isin(minority_labels), label_column] = 0

X = df_filtered.drop(label_column, axis=1)
y = df_filtered[label_column]

In [4]:
y.value_counts()

1    689
0     67
Name: Rings, dtype: int64

# Classification

In [5]:
from sklearn.model_selection import ParameterGrid, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np

> **Note:** In the following, we use a custom piece of code for the grid search and cross-validation. We do this to enforce that the augmentation/oversampling step is only applied to training data and the test data is actually kept separate. For the baseline model, this means that we could also simply make use of the functions provided by sklearn instead of our custom code. However, in order to make the results comparable, we use the same code in each case.

In [6]:
kf = KFold(n_splits=5, shuffle=True, random_state=7018321)

## Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
rf_param_grid = {
    'n_estimators': [50, 100, 200, 400, 600],
    'max_depth': [None, 4, 6, 10, 20, 30, 50, 80, 100],
}

# compute all combinations of parameters
combination_dicts = list(ParameterGrid(rf_param_grid))

### Baseline (no augmentation)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train, y_train)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [10]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 10, 'n_estimators': 200}
Best AUC: 0.8954351007158007


To verify that our cross-validation code works as expected, we can compare to a regular sklearn grid search here:

In [11]:
baseline_search = GridSearchCV(RandomForestClassifier(random_state=19231823), rf_param_grid, scoring="roc_auc")
baseline_search.fit(X, y)
baseline_search.best_score_

0.8700388736409046

We can see that the best model found by a sklearn grid search performs similarly well to our best model, which indicates that the code works as expected.

### With CFA

In [9]:
from cfa import Iterative_CFA

For CFA, we try two different approaches, since it is not entirely clear from the paper what the authors actually did.
1. The first approach strictly follows the pseudo-code and description given in section 3.2 of the paper. 
2. In an earlier section of the paper, the authors say that _"the class of [a] new [synthetic counterfactual] instance needs to be verified by the underlying ML model."_ This means that we use some ML model trained on the data (without CFA) to assign a class to a new synthetic counterfactual, and only keep those that were classified to be a minority instance. This seems to lead to much more reasonable synthetic counterfactuals (see visualizations of the algorithm in exp001) but, at the same time, often means that the algorithm is unable to produce a fully balanced dataset (since it may terminate early).

> **Note:** We also use a different tolerance level of 50% here, since the 10%-threshold proposed by the authors does not yield any "good" native counterfactuals, which makes the algorithm unusable.

#### Approach 1: No verification 

In [11]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=False, 
                                                         visualize_with_pca=False)
    
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Data distribution before CFA:
	Majority (1): 553, Minority (label 0): 51
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 0:
	Majority (1): 553, Minority (label 0): 218
Number of 'good' native counterfactuals in data: 443
Data distribution after iteration 1:
	Majority (1): 553, Minority (label 0): 237
Number of 'good' native counterfactuals in data: 452
Data distribution after iteration 2:
	Majority (1): 553, Minority (label 0): 241
Number of 'good' native counterfactuals in data: 450
Data distribution after iteration 3:
	Majority (1): 553, Minority (label 0): 245
Number of 'good' native counterfactuals in data: 449
Data distribution after iteration 4:
	Majority (1): 553, Minority (label 0): 249
Number of 'good' native counterfactuals in data: 445
Data distribution after iteration 5:
	Majority (1): 553, Minority (label 0): 256
Number of 'good' native counterfactuals in data: 447
Data distribution after iteration 6:
	Majority (1): 553, Minor

Number of 'good' native counterfactuals in data: 418
Data distribution after iteration 52:
	Majority (1): 549, Minority (label 0): 410
Number of 'good' native counterfactuals in data: 418
Data distribution after iteration 53:
	Majority (1): 549, Minority (label 0): 412
Number of 'good' native counterfactuals in data: 418
Data distribution after iteration 54:
	Majority (1): 549, Minority (label 0): 414
Number of 'good' native counterfactuals in data: 418
Data distribution after iteration 55:
	Majority (1): 549, Minority (label 0): 416
Number of 'good' native counterfactuals in data: 416
Data distribution after iteration 56:
	Majority (1): 549, Minority (label 0): 418
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 57:
	Majority (1): 549, Minority (label 0): 420
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 58:
	Majority (1): 549, Minority (label 0): 422
Number of 'good' native counterfactuals in data: 415
Da

Data distribution after iteration 115:
	Majority (1): 549, Minority (label 0): 536
Number of 'good' native counterfactuals in data: 409
Data distribution after iteration 116:
	Majority (1): 549, Minority (label 0): 538
Number of 'good' native counterfactuals in data: 409
Data distribution after iteration 117:
	Majority (1): 549, Minority (label 0): 540
Number of 'good' native counterfactuals in data: 409
Data distribution after iteration 118:
	Majority (1): 549, Minority (label 0): 542
Number of 'good' native counterfactuals in data: 409
Data distribution after iteration 119:
	Majority (1): 549, Minority (label 0): 544
Data distribution after CFA:
	Majority (1): 549, Minority (label 0): 544
Fold 2:
Data distribution before CFA:
	Majority (1): 552, Minority (label 0): 53
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 0:
	Majority (1): 552, Minority (label 0): 206
Number of 'good' native counterfactuals in data: 413
Data distribution after iteratio

Data distribution after iteration 8:
	Majority (1): 553, Minority (label 0): 298
Number of 'good' native counterfactuals in data: 463
Data distribution after iteration 9:
	Majority (1): 553, Minority (label 0): 299
Number of 'good' native counterfactuals in data: 463
Data distribution after iteration 10:
	Majority (1): 553, Minority (label 0): 300
Number of 'good' native counterfactuals in data: 463
Data distribution after iteration 11:
	Majority (1): 553, Minority (label 0): 301
Number of 'good' native counterfactuals in data: 463
Data distribution after iteration 12:
	Majority (1): 553, Minority (label 0): 302
Number of 'good' native counterfactuals in data: 463
Data distribution after iteration 13:
	Majority (1): 553, Minority (label 0): 303
Number of 'good' native counterfactuals in data: 463
Data distribution after iteration 14:
	Majority (1): 553, Minority (label 0): 304
Number of 'good' native counterfactuals in data: 463
Data distribution after iteration 15:
	Majority (1): 553,

Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 75:
	Majority (1): 553, Minority (label 0): 365
Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 76:
	Majority (1): 553, Minority (label 0): 366
Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 77:
	Majority (1): 553, Minority (label 0): 367
Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 78:
	Majority (1): 553, Minority (label 0): 368
Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 79:
	Majority (1): 553, Minority (label 0): 369
Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 80:
	Majority (1): 553, Minority (label 0): 370
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 81:
	Majority (1): 553, Minority (label 0): 374
Number of 'good' native counterfactuals in data: 468
Da

Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 141:
	Majority (1): 553, Minority (label 0): 435
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 142:
	Majority (1): 553, Minority (label 0): 436
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 143:
	Majority (1): 553, Minority (label 0): 437
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 144:
	Majority (1): 553, Minority (label 0): 438
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 145:
	Majority (1): 553, Minority (label 0): 439
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 146:
	Majority (1): 553, Minority (label 0): 440
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 147:
	Majority (1): 553, Minority (label 0): 441
Number of 'good' native counterfactuals in data:

Data distribution after iteration 203:
	Majority (1): 553, Minority (label 0): 537
Number of 'good' native counterfactuals in data: 471
Data distribution after iteration 204:
	Majority (1): 553, Minority (label 0): 539
Number of 'good' native counterfactuals in data: 471
Data distribution after iteration 205:
	Majority (1): 553, Minority (label 0): 541
Number of 'good' native counterfactuals in data: 471
Data distribution after iteration 206:
	Majority (1): 553, Minority (label 0): 543
Number of 'good' native counterfactuals in data: 471
Data distribution after iteration 207:
	Majority (1): 553, Minority (label 0): 545
Number of 'good' native counterfactuals in data: 471
Data distribution after iteration 208:
	Majority (1): 553, Minority (label 0): 547
Number of 'good' native counterfactuals in data: 471
Data distribution after iteration 209:
	Majority (1): 553, Minority (label 0): 549
Data distribution after CFA:
	Majority (1): 553, Minority (label 0): 549


In [14]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 20, 'n_estimators': 600}
Best AUC: 0.8629593395503523


#### Approach 2: Verification of new synthetic counterfactuals with baseline model

In [12]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    print("Fitting baseline model needed for verification... ")
    baseline_search = GridSearchCV(RandomForestClassifier(random_state=19231823), rf_param_grid, scoring="roc_auc")
    baseline_search.fit(X_train, y_train)
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=True, 
                                                         baseline_model=baseline_search,
                                                         visualize_with_pca=False)
    
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Fitting baseline model needed for verification... 
Data distribution before CFA:
	Majority (1): 553, Minority (label 0): 51
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 0:
	Majority (1): 553, Minority (label 0): 68
Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 1:
	Majority (1): 553, Minority (label 0): 77
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 2:
	Majority (1): 553, Minority (label 0): 84
Number of 'good' native counterfactuals in data: 335
Data distribution after iteration 3:
	Majority (1): 553, Minority (label 0): 91
Number of 'good' native counterfactuals in data: 339
Data distribution after iteration 4:
	Majority (1): 553, Minority (label 0): 97
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 5:
	Majority (1): 553, Minority (label 0): 103
Number of 'good' native counterfactuals in data: 340
Data distributio

Number of 'good' native counterfactuals in data: 323
Data distribution after iteration 62:
	Majority (1): 553, Minority (label 0): 334
Number of 'good' native counterfactuals in data: 323
Data distribution after iteration 63:
	Majority (1): 553, Minority (label 0): 337
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 64:
	Majority (1): 553, Minority (label 0): 340
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 65:
	Majority (1): 553, Minority (label 0): 343
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 66:
	Majority (1): 553, Minority (label 0): 346
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 67:
	Majority (1): 553, Minority (label 0): 349
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 68:
	Majority (1): 553, Minority (label 0): 352
Number of 'good' native counterfactuals in data: 322
Da

Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 124:
	Majority (1): 553, Minority (label 0): 521
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 125:
	Majority (1): 553, Minority (label 0): 524
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 126:
	Majority (1): 553, Minority (label 0): 527
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 127:
	Majority (1): 553, Minority (label 0): 530
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 128:
	Majority (1): 553, Minority (label 0): 533
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 129:
	Majority (1): 553, Minority (label 0): 536
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 130:
	Majority (1): 553, Minority (label 0): 539
Number of 'good' native counterfactuals in data:

Number of 'good' native counterfactuals in data: 347
Data distribution after iteration 51:
	Majority (1): 549, Minority (label 0): 212
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 52:
	Majority (1): 549, Minority (label 0): 213
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 53:
	Majority (1): 549, Minority (label 0): 214
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 54:
	Majority (1): 549, Minority (label 0): 215
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 55:
	Majority (1): 549, Minority (label 0): 216
Number of 'good' native counterfactuals in data: 347
Data distribution after iteration 56:
	Majority (1): 549, Minority (label 0): 217
Number of 'good' native counterfactuals in data: 347
Data distribution after iteration 57:
	Majority (1): 549, Minority (label 0): 218
Number of 'good' native counterfactuals in data: 347
Da

Number of 'good' native counterfactuals in data: 337
Data distribution after iteration 114:
	Majority (1): 549, Minority (label 0): 277
Number of 'good' native counterfactuals in data: 337
Data distribution after iteration 115:
	Majority (1): 549, Minority (label 0): 278
Number of 'good' native counterfactuals in data: 337
Data distribution after iteration 116:
	Majority (1): 549, Minority (label 0): 279
Number of 'good' native counterfactuals in data: 337
Data distribution after iteration 117:
	Majority (1): 549, Minority (label 0): 280
Number of 'good' native counterfactuals in data: 337
Data distribution after iteration 118:
	Majority (1): 549, Minority (label 0): 281
Number of 'good' native counterfactuals in data: 337
Data distribution after iteration 119:
	Majority (1): 549, Minority (label 0): 282
Number of 'good' native counterfactuals in data: 337
Data distribution after iteration 120:
	Majority (1): 549, Minority (label 0): 283
Number of 'good' native counterfactuals in data:

Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 176:
	Majority (1): 549, Minority (label 0): 339
Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 177:
	Majority (1): 549, Minority (label 0): 340
Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 178:
	Majority (1): 549, Minority (label 0): 341
Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 179:
	Majority (1): 549, Minority (label 0): 342
Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 180:
	Majority (1): 549, Minority (label 0): 343
Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 181:
	Majority (1): 549, Minority (label 0): 344
Number of 'good' native counterfactuals in data: 333
Data distribution after iteration 182:
	Majority (1): 549, Minority (label 0): 345
Number of 'good' native counterfactuals in data:

Data distribution after iteration 237:
	Majority (1): 549, Minority (label 0): 400
Number of 'good' native counterfactuals in data: 331
Data distribution after iteration 238:
	Majority (1): 549, Minority (label 0): 401
Number of 'good' native counterfactuals in data: 331
Data distribution after iteration 239:
	Majority (1): 549, Minority (label 0): 402
Number of 'good' native counterfactuals in data: 331
Data distribution after iteration 240:
	Majority (1): 549, Minority (label 0): 403
Number of 'good' native counterfactuals in data: 331
Data distribution after iteration 241:
	Majority (1): 549, Minority (label 0): 404
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 242:
	Majority (1): 549, Minority (label 0): 405
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 243:
	Majority (1): 549, Minority (label 0): 406
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 244:
	Majority

Data distribution after iteration 299:
	Majority (1): 549, Minority (label 0): 463
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 300:
	Majority (1): 549, Minority (label 0): 464
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 301:
	Majority (1): 549, Minority (label 0): 465
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 302:
	Majority (1): 549, Minority (label 0): 466
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 303:
	Majority (1): 549, Minority (label 0): 467
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 304:
	Majority (1): 549, Minority (label 0): 468
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 305:
	Majority (1): 549, Minority (label 0): 469
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 306:
	Majority

Number of 'good' native counterfactuals in data: 331
Data distribution after iteration 360:
	Majority (1): 549, Minority (label 0): 524
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 361:
	Majority (1): 549, Minority (label 0): 525
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 362:
	Majority (1): 549, Minority (label 0): 526
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 363:
	Majority (1): 549, Minority (label 0): 527
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 364:
	Majority (1): 549, Minority (label 0): 528
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 365:
	Majority (1): 549, Minority (label 0): 529
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 366:
	Majority (1): 549, Minority (label 0): 530
Number of 'good' native counterfactuals in data:

Data distribution after iteration 39:
	Majority (1): 552, Minority (label 0): 110
Number of 'good' native counterfactuals in data: 350
Data distribution after iteration 40:
	Majority (1): 552, Minority (label 0): 111
Number of 'good' native counterfactuals in data: 344
Data distribution after iteration 41:
	Majority (1): 552, Minority (label 0): 112
Number of 'good' native counterfactuals in data: 344
Data distribution after iteration 42:
	Majority (1): 552, Minority (label 0): 113
Number of 'good' native counterfactuals in data: 344
Data distribution after iteration 43:
	Majority (1): 552, Minority (label 0): 114
Number of 'good' native counterfactuals in data: 344
Data distribution after iteration 44:
	Majority (1): 552, Minority (label 0): 115
Number of 'good' native counterfactuals in data: 344
Data distribution after iteration 45:
	Majority (1): 552, Minority (label 0): 116
Number of 'good' native counterfactuals in data: 344
Data distribution after iteration 46:
	Majority (1): 55

Data distribution after iteration 101:
	Majority (1): 552, Minority (label 0): 172
Number of 'good' native counterfactuals in data: 347
Data distribution after iteration 102:
	Majority (1): 552, Minority (label 0): 173
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 103:
	Majority (1): 552, Minority (label 0): 174
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 104:
	Majority (1): 552, Minority (label 0): 175
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 105:
	Majority (1): 552, Minority (label 0): 176
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 106:
	Majority (1): 552, Minority (label 0): 177
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 107:
	Majority (1): 552, Minority (label 0): 178
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 108:
	Majority

Data distribution after iteration 164:
	Majority (1): 552, Minority (label 0): 235
Number of 'good' native counterfactuals in data: 350
Data distribution after iteration 165:
	Majority (1): 552, Minority (label 0): 236
Number of 'good' native counterfactuals in data: 350
Data distribution after iteration 166:
	Majority (1): 552, Minority (label 0): 237
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 167:
	Majority (1): 552, Minority (label 0): 238
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 168:
	Majority (1): 552, Minority (label 0): 239
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 169:
	Majority (1): 552, Minority (label 0): 240
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 170:
	Majority (1): 552, Minority (label 0): 241
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 171:
	Majority

Data distribution after iteration 227:
	Majority (1): 552, Minority (label 0): 298
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 228:
	Majority (1): 552, Minority (label 0): 299
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 229:
	Majority (1): 552, Minority (label 0): 300
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 230:
	Majority (1): 552, Minority (label 0): 301
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 231:
	Majority (1): 552, Minority (label 0): 302
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 232:
	Majority (1): 552, Minority (label 0): 303
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 233:
	Majority (1): 552, Minority (label 0): 304
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 234:
	Majority

Data distribution after iteration 291:
	Majority (1): 552, Minority (label 0): 362
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 292:
	Majority (1): 552, Minority (label 0): 363
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 293:
	Majority (1): 552, Minority (label 0): 364
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 294:
	Majority (1): 552, Minority (label 0): 365
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 295:
	Majority (1): 552, Minority (label 0): 366
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 296:
	Majority (1): 552, Minority (label 0): 367
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 297:
	Majority (1): 552, Minority (label 0): 368
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 298:
	Majority

Data distribution after iteration 355:
	Majority (1): 552, Minority (label 0): 426
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 356:
	Majority (1): 552, Minority (label 0): 427
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 357:
	Majority (1): 552, Minority (label 0): 428
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 358:
	Majority (1): 552, Minority (label 0): 429
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 359:
	Majority (1): 552, Minority (label 0): 430
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 360:
	Majority (1): 552, Minority (label 0): 431
Number of 'good' native counterfactuals in data: 330
Data distribution after iteration 361:
	Majority (1): 552, Minority (label 0): 432
Number of 'good' native counterfactuals in data: 320
Data distribution after iteration 362:
	Majority

Number of 'good' native counterfactuals in data: 321
Data distribution after iteration 419:
	Majority (1): 552, Minority (label 0): 493
Number of 'good' native counterfactuals in data: 321
Data distribution after iteration 420:
	Majority (1): 552, Minority (label 0): 494
Number of 'good' native counterfactuals in data: 321
Data distribution after iteration 421:
	Majority (1): 552, Minority (label 0): 495
Number of 'good' native counterfactuals in data: 321
Data distribution after iteration 422:
	Majority (1): 552, Minority (label 0): 496
Number of 'good' native counterfactuals in data: 321
Data distribution after iteration 423:
	Majority (1): 552, Minority (label 0): 497
Number of 'good' native counterfactuals in data: 321
Data distribution after iteration 424:
	Majority (1): 552, Minority (label 0): 498
Number of 'good' native counterfactuals in data: 321
Data distribution after iteration 425:
	Majority (1): 552, Minority (label 0): 499
Number of 'good' native counterfactuals in data:

Data distribution after iteration 5:
	Majority (1): 549, Minority (label 0): 93
Number of 'good' native counterfactuals in data: 354
Data distribution after iteration 6:
	Majority (1): 549, Minority (label 0): 95
Number of 'good' native counterfactuals in data: 354
Data distribution after iteration 7:
	Majority (1): 549, Minority (label 0): 97
Number of 'good' native counterfactuals in data: 354
Data distribution after iteration 8:
	Majority (1): 549, Minority (label 0): 99
Number of 'good' native counterfactuals in data: 354
Data distribution after iteration 9:
	Majority (1): 549, Minority (label 0): 101
Number of 'good' native counterfactuals in data: 354
Data distribution after iteration 10:
	Majority (1): 549, Minority (label 0): 103
Number of 'good' native counterfactuals in data: 353
Data distribution after iteration 11:
	Majority (1): 549, Minority (label 0): 105
Number of 'good' native counterfactuals in data: 353
Data distribution after iteration 12:
	Majority (1): 549, Minori

Data distribution after iteration 66:
	Majority (1): 549, Minority (label 0): 225
Number of 'good' native counterfactuals in data: 323
Data distribution after iteration 67:
	Majority (1): 549, Minority (label 0): 226
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 68:
	Majority (1): 549, Minority (label 0): 227
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 69:
	Majority (1): 549, Minority (label 0): 228
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 70:
	Majority (1): 549, Minority (label 0): 229
Number of 'good' native counterfactuals in data: 325
Data distribution after iteration 71:
	Majority (1): 549, Minority (label 0): 230
Number of 'good' native counterfactuals in data: 325
Data distribution after iteration 72:
	Majority (1): 549, Minority (label 0): 231
Number of 'good' native counterfactuals in data: 325
Data distribution after iteration 73:
	Majority (1): 54

Data distribution after iteration 127:
	Majority (1): 549, Minority (label 0): 286
Number of 'good' native counterfactuals in data: 319
Data distribution after iteration 128:
	Majority (1): 549, Minority (label 0): 287
Number of 'good' native counterfactuals in data: 319
Data distribution after iteration 129:
	Majority (1): 549, Minority (label 0): 288
Number of 'good' native counterfactuals in data: 319
Data distribution after iteration 130:
	Majority (1): 549, Minority (label 0): 289
Number of 'good' native counterfactuals in data: 318
Data distribution after iteration 131:
	Majority (1): 549, Minority (label 0): 290
Number of 'good' native counterfactuals in data: 318
Data distribution after iteration 132:
	Majority (1): 549, Minority (label 0): 291
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 133:
	Majority (1): 549, Minority (label 0): 292
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 134:
	Majority

Data distribution after iteration 189:
	Majority (1): 549, Minority (label 0): 348
Number of 'good' native counterfactuals in data: 314
Data distribution after iteration 190:
	Majority (1): 549, Minority (label 0): 349
Number of 'good' native counterfactuals in data: 314
Data distribution after iteration 191:
	Majority (1): 549, Minority (label 0): 350
Number of 'good' native counterfactuals in data: 314
Data distribution after iteration 192:
	Majority (1): 549, Minority (label 0): 351
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 193:
	Majority (1): 549, Minority (label 0): 352
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 194:
	Majority (1): 549, Minority (label 0): 353
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 195:
	Majority (1): 549, Minority (label 0): 354
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 196:
	Majority

Number of 'good' native counterfactuals in data: 311
Data distribution after iteration 252:
	Majority (1): 549, Minority (label 0): 411
Number of 'good' native counterfactuals in data: 311
Data distribution after iteration 253:
	Majority (1): 549, Minority (label 0): 412
Number of 'good' native counterfactuals in data: 311
Data distribution after iteration 254:
	Majority (1): 549, Minority (label 0): 413
Number of 'good' native counterfactuals in data: 311
Data distribution after iteration 255:
	Majority (1): 549, Minority (label 0): 414
Number of 'good' native counterfactuals in data: 311
Data distribution after iteration 256:
	Majority (1): 549, Minority (label 0): 415
Number of 'good' native counterfactuals in data: 311
Data distribution after iteration 257:
	Majority (1): 549, Minority (label 0): 416
Number of 'good' native counterfactuals in data: 311
Data distribution after iteration 258:
	Majority (1): 549, Minority (label 0): 417
Number of 'good' native counterfactuals in data:

Data distribution after iteration 313:
	Majority (1): 549, Minority (label 0): 472
Number of 'good' native counterfactuals in data: 309
Data distribution after iteration 314:
	Majority (1): 549, Minority (label 0): 473
Number of 'good' native counterfactuals in data: 309
Data distribution after iteration 315:
	Majority (1): 549, Minority (label 0): 474
Number of 'good' native counterfactuals in data: 309
Data distribution after iteration 316:
	Majority (1): 549, Minority (label 0): 475
Number of 'good' native counterfactuals in data: 309
Data distribution after iteration 317:
	Majority (1): 549, Minority (label 0): 476
Number of 'good' native counterfactuals in data: 309
Data distribution after iteration 318:
	Majority (1): 549, Minority (label 0): 477
Number of 'good' native counterfactuals in data: 309
Data distribution after iteration 319:
	Majority (1): 549, Minority (label 0): 478
Number of 'good' native counterfactuals in data: 309
Data distribution after iteration 320:
	Majority

Data distribution after iteration 374:
	Majority (1): 549, Minority (label 0): 533
Number of 'good' native counterfactuals in data: 310
Data distribution after iteration 375:
	Majority (1): 549, Minority (label 0): 534
Number of 'good' native counterfactuals in data: 310
Data distribution after iteration 376:
	Majority (1): 549, Minority (label 0): 535
Number of 'good' native counterfactuals in data: 310
Data distribution after iteration 377:
	Majority (1): 549, Minority (label 0): 536
Number of 'good' native counterfactuals in data: 310
Data distribution after iteration 378:
	Majority (1): 549, Minority (label 0): 537
Number of 'good' native counterfactuals in data: 310
Data distribution after iteration 379:
	Majority (1): 549, Minority (label 0): 538
Number of 'good' native counterfactuals in data: 310
Data distribution after iteration 380:
	Majority (1): 549, Minority (label 0): 539
Number of 'good' native counterfactuals in data: 310
Data distribution after iteration 381:
	Majority

In [16]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 10, 'n_estimators': 600}
Best AUC: 0.8895237769958679


### ADASYN instead of CFA

In [17]:
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=9317231)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [19]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': 20, 'n_estimators': 400}
Best AUC: 0.889867836650254


### Random Oversampling

In [20]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=52012318)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = RandomForestClassifier(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [22]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'max_depth': None, 'n_estimators': 400}
Best AUC: 0.870750606936537


## Logistic Regression 

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
lr_param_grid = {"max_iter": [100, 200, 1000], 
                 "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                 "solver": ["newton-cg", "lbfgs", "liblinear", "sag"]}

# compute all combinations of parameters
combination_dicts = list(ParameterGrid(lr_param_grid))

### Baseline (no augmentation)

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train, y_train)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [26]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1000, 'max_iter': 100, 'solver': 'newton-cg'}
Best AUC: 0.9379082715534788


Again, we can compare to an out-of-the-box grid search to make sure:

In [27]:
baseline_search = GridSearchCV(LogisticRegression(random_state=19231823), lr_param_grid, scoring="roc_auc")
baseline_search.fit(X, y)
baseline_search.best_score_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9276145836602835

### With CFA

#### Approach 1: No verification 

In [15]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=False,
                                                         visualize_with_pca=False)

    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Data distribution before CFA:
	Majority (1): 553, Minority (label 0): 51
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 0:
	Majority (1): 553, Minority (label 0): 218
Number of 'good' native counterfactuals in data: 443
Data distribution after iteration 1:
	Majority (1): 553, Minority (label 0): 237
Number of 'good' native counterfactuals in data: 452
Data distribution after iteration 2:
	Majority (1): 553, Minority (label 0): 241
Number of 'good' native counterfactuals in data: 450
Data distribution after iteration 3:
	Majority (1): 553, Minority (label 0): 245
Number of 'good' native counterfactuals in data: 449
Data distribution after iteration 4:
	Majority (1): 553, Minority (label 0): 249
Number of 'good' native counterfactuals in data: 445
Data distribution after iteration 5:
	Majority (1): 553, Minority (label 0): 256
Number of 'good' native counterfactuals in data: 447
Data distribution after iteration 6:
	Majority (1): 553, Minor



Fold 1:
Data distribution before CFA:
	Majority (1): 549, Minority (label 0): 56
Number of 'good' native counterfactuals in data: 326
Data distribution after iteration 0:
	Majority (1): 549, Minority (label 0): 215
Number of 'good' native counterfactuals in data: 432
Data distribution after iteration 1:
	Majority (1): 549, Minority (label 0): 231
Number of 'good' native counterfactuals in data: 439
Data distribution after iteration 2:
	Majority (1): 549, Minority (label 0): 240
Number of 'good' native counterfactuals in data: 441
Data distribution after iteration 3:
	Majority (1): 549, Minority (label 0): 246
Number of 'good' native counterfactuals in data: 440
Data distribution after iteration 4:
	Majority (1): 549, Minority (label 0): 252
Number of 'good' native counterfactuals in data: 437
Data distribution after iteration 5:
	Majority (1): 549, Minority (label 0): 258
Number of 'good' native counterfactuals in data: 434
Data distribution after iteration 6:
	Majority (1): 549, Minor

Data distribution after iteration 61:
	Majority (1): 549, Minority (label 0): 428
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 62:
	Majority (1): 549, Minority (label 0): 430
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 63:
	Majority (1): 549, Minority (label 0): 432
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 64:
	Majority (1): 549, Minority (label 0): 434
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 65:
	Majority (1): 549, Minority (label 0): 436
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 66:
	Majority (1): 549, Minority (label 0): 438
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 67:
	Majority (1): 549, Minority (label 0): 440
Number of 'good' native counterfactuals in data: 415
Data distribution after iteration 68:
	Majority (1): 54

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2:
Data distribution before CFA:
	Majority (1): 552, Minority (label 0): 53
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 0:
	Majority (1): 552, Minority (label 0): 206
Number of 'good' native counterfactuals in data: 413
Data distribution after iteration 1:
	Majority (1): 552, Minority (label 0): 224
Number of 'good' native counterfactuals in data: 417
Data distribution after iteration 2:
	Majority (1): 552, Minority (label 0): 233
Number of 'good' native counterfactuals in data: 424
Data distribution after iteration 3:
	Majority (1): 552, Minority (label 0): 234
Number of 'good' native counterfactuals in data: 427
No unpaired instances found, no augmentation possible! Not applying CFA...
Data distribution after CFA:
	Majority (1): 552, Minority (label 0): 234


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3:
Data distribution before CFA:
	Majority (1): 549, Minority (label 0): 56
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 0:
	Majority (1): 549, Minority (label 0): 224
Number of 'good' native counterfactuals in data: 431
Data distribution after iteration 1:
	Majority (1): 549, Minority (label 0): 249
Number of 'good' native counterfactuals in data: 443
Data distribution after iteration 2:
	Majority (1): 549, Minority (label 0): 258
Number of 'good' native counterfactuals in data: 444
Data distribution after iteration 3:
	Majority (1): 549, Minority (label 0): 266
Number of 'good' native counterfactuals in data: 444
Data distribution after iteration 4:
	Majority (1): 549, Minority (label 0): 274
Number of 'good' native counterfactuals in data: 444
Data distribution after iteration 5:
	Majority (1): 549, Minority (label 0): 281
Number of 'good' native counterfactuals in data: 444
Data distribution after iteration 6:
	Majority (1): 549, Minor

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 4:
Data distribution before CFA:
	Majority (1): 553, Minority (label 0): 52
Number of 'good' native counterfactuals in data: 293
Data distribution after iteration 0:
	Majority (1): 553, Minority (label 0): 249
Number of 'good' native counterfactuals in data: 438
Data distribution after iteration 1:
	Majority (1): 553, Minority (label 0): 277
Number of 'good' native counterfactuals in data: 450
Data distribution after iteration 2:
	Majority (1): 553, Minority (label 0): 291
Number of 'good' native counterfactuals in data: 461
Data distribution after iteration 3:
	Majority (1): 553, Minority (label 0): 293
Number of 'good' native counterfactuals in data: 462
Data distribution after iteration 4:
	Majority (1): 553, Minority (label 0): 294
Number of 'good' native counterfactuals in data: 462
Data distribution after iteration 5:
	Majority (1): 553, Minority (label 0): 295
Number of 'good' native counterfactuals in data: 462
Data distribution after iteration 6:
	Majority (1): 553, Minor

Data distribution after iteration 60:
	Majority (1): 553, Minority (label 0): 350
Number of 'good' native counterfactuals in data: 466
Data distribution after iteration 61:
	Majority (1): 553, Minority (label 0): 351
Number of 'good' native counterfactuals in data: 466
Data distribution after iteration 62:
	Majority (1): 553, Minority (label 0): 352
Number of 'good' native counterfactuals in data: 466
Data distribution after iteration 63:
	Majority (1): 553, Minority (label 0): 353
Number of 'good' native counterfactuals in data: 466
Data distribution after iteration 64:
	Majority (1): 553, Minority (label 0): 354
Number of 'good' native counterfactuals in data: 466
Data distribution after iteration 65:
	Majority (1): 553, Minority (label 0): 355
Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 66:
	Majority (1): 553, Minority (label 0): 356
Number of 'good' native counterfactuals in data: 468
Data distribution after iteration 67:
	Majority (1): 55

Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 121:
	Majority (1): 553, Minority (label 0): 415
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 122:
	Majority (1): 553, Minority (label 0): 416
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 123:
	Majority (1): 553, Minority (label 0): 417
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 124:
	Majority (1): 553, Minority (label 0): 418
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 125:
	Majority (1): 553, Minority (label 0): 419
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 126:
	Majority (1): 553, Minority (label 0): 420
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 127:
	Majority (1): 553, Minority (label 0): 421
Number of 'good' native counterfactuals in data:

Data distribution after iteration 182:
	Majority (1): 553, Minority (label 0): 494
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 183:
	Majority (1): 553, Minority (label 0): 496
Number of 'good' native counterfactuals in data: 465
Data distribution after iteration 184:
	Majority (1): 553, Minority (label 0): 498
Number of 'good' native counterfactuals in data: 466
Data distribution after iteration 185:
	Majority (1): 553, Minority (label 0): 501
Number of 'good' native counterfactuals in data: 467
Data distribution after iteration 186:
	Majority (1): 553, Minority (label 0): 503
Number of 'good' native counterfactuals in data: 462
Data distribution after iteration 187:
	Majority (1): 553, Minority (label 0): 505
Number of 'good' native counterfactuals in data: 462
Data distribution after iteration 188:
	Majority (1): 553, Minority (label 0): 507
Number of 'good' native counterfactuals in data: 462
Data distribution after iteration 189:
	Majority

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 100, 'max_iter': 200, 'solver': 'sag'}
Best AUC: 0.9138330235046906


#### Approach 2: Verification of new synthetic counterfactuals with baseline model

In [16]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    
    print("Fitting baseline model needed for verification... ")
    baseline_search = GridSearchCV(LogisticRegression(random_state=19231823), lr_param_grid, scoring="roc_auc")
    baseline_search.fit(X_train, y_train)
    
    X_train_augmented, y_train_augmented = Iterative_CFA(X_train, 
                                                         y_train, 
                                                         stddev_percent=50, 
                                                         verify_with_baseline_model=True, 
                                                         baseline_model=baseline_search, 
                                                         visualize_with_pca=False)

    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_augmented, y_train_augmented)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

Fold 0:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 553, Minority (label 0): 51
Number of 'good' native counterfactuals in data: 322
Data distribution after iteration 0:
	Majority (1): 553, Minority (label 0): 77
Number of 'good' native counterfactuals in data: 338
Data distribution after iteration 1:
	Majority (1): 553, Minority (label 0): 87
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 2:
	Majority (1): 553, Minority (label 0): 95
Number of 'good' native counterfactuals in data: 345
Data distribution after iteration 3:
	Majority (1): 553, Minority (label 0): 103
Number of 'good' native counterfactuals in data: 346
Data distribution after iteration 4:
	Majority (1): 553, Minority (label 0): 111
Number of 'good' native counterfactuals in data: 350
Data distribution after iteration 5:
	Majority (1): 553, Minority (label 0): 119
Number of 'good' native counterfactuals in data: 350
Data distribution after iteration 6:
	Majority (1): 553, Minority (label 

Number of 'good' native counterfactuals in data: 329
Data distribution after iteration 65:
	Majority (1): 553, Minority (label 0): 303
Number of 'good' native counterfactuals in data: 329
Data distribution after iteration 66:
	Majority (1): 553, Minority (label 0): 305
Number of 'good' native counterfactuals in data: 329
Data distribution after iteration 67:
	Majority (1): 553, Minority (label 0): 307
Number of 'good' native counterfactuals in data: 329
Data distribution after iteration 68:
	Majority (1): 553, Minority (label 0): 309
Number of 'good' native counterfactuals in data: 329
Data distribution after iteration 69:
	Majority (1): 553, Minority (label 0): 311
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 70:
	Majority (1): 553, Minority (label 0): 313
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 71:
	Majority (1): 553, Minority (label 0): 315
Number of 'good' native counterfactuals in data: 327
Da

Data distribution after iteration 126:
	Majority (1): 553, Minority (label 0): 426
Number of 'good' native counterfactuals in data: 341
Data distribution after iteration 127:
	Majority (1): 553, Minority (label 0): 428
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 128:
	Majority (1): 553, Minority (label 0): 430
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 129:
	Majority (1): 553, Minority (label 0): 432
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 130:
	Majority (1): 553, Minority (label 0): 434
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 131:
	Majority (1): 553, Minority (label 0): 436
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 132:
	Majority (1): 553, Minority (label 0): 438
Number of 'good' native counterfactuals in data: 340
Data distribution after iteration 133:
	Majority

Data distribution after iteration 187:
	Majority (1): 553, Minority (label 0): 548
Data distribution after CFA:
	Majority (1): 553, Minority (label 0): 548


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 1:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 549, Minority (label 0): 56
Number of 'good' native counterfactuals in data: 326
Data distribution after iteration 0:
	Majority (1): 549, Minority (label 0): 69
Number of 'good' native counterfactuals in data: 339
Data distribution after iteration 1:
	Majority (1): 549, Minority (label 0): 75
Number of 'good' native counterfactuals in data: 341
Data distribution after iteration 2:
	Majority (1): 549, Minority (label 0): 80
Number of 'good' native counterfactuals in data: 343
Data distribution after iteration 3:
	Majority (1): 549, Minority (label 0): 85
Number of 'good' native counterfactuals in data: 343
Data distribution after iteration 4:
	Majority (1): 549, Minority (label 0): 90
Number of 'good' native counterfactuals in data: 339
Data distribution after iteration 5:
	Majority (1): 549, Minority (label 0): 95
Number of 'good' native counterfactuals in data: 338
Data distribution after iteration 6:
	Majority (1): 549, Minority (label 0):

Data distribution after iteration 62:
	Majority (1): 549, Minority (label 0): 286
Number of 'good' native counterfactuals in data: 316
Data distribution after iteration 63:
	Majority (1): 549, Minority (label 0): 289
Number of 'good' native counterfactuals in data: 316
Data distribution after iteration 64:
	Majority (1): 549, Minority (label 0): 292
Number of 'good' native counterfactuals in data: 316
Data distribution after iteration 65:
	Majority (1): 549, Minority (label 0): 295
Number of 'good' native counterfactuals in data: 315
Data distribution after iteration 66:
	Majority (1): 549, Minority (label 0): 298
Number of 'good' native counterfactuals in data: 316
Data distribution after iteration 67:
	Majority (1): 549, Minority (label 0): 301
Number of 'good' native counterfactuals in data: 316
Data distribution after iteration 68:
	Majority (1): 549, Minority (label 0): 304
Number of 'good' native counterfactuals in data: 316
Data distribution after iteration 69:
	Majority (1): 54

Data distribution after iteration 125:
	Majority (1): 549, Minority (label 0): 501
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 126:
	Majority (1): 549, Minority (label 0): 505
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 127:
	Majority (1): 549, Minority (label 0): 509
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 128:
	Majority (1): 549, Minority (label 0): 513
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 129:
	Majority (1): 549, Minority (label 0): 517
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 130:
	Majority (1): 549, Minority (label 0): 521
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 131:
	Majority (1): 549, Minority (label 0): 525
Number of 'good' native counterfactuals in data: 317
Data distribution after iteration 132:
	Majority

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 2:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 552, Minority (label 0): 53
Number of 'good' native counterfactuals in data: 332
Data distribution after iteration 0:
	Majority (1): 552, Minority (label 0): 66
Number of 'good' native counterfactuals in data: 339
Data distribution after iteration 1:
	Majority (1): 552, Minority (label 0): 73
Number of 'good' native counterfactuals in data: 342
Data distribution after iteration 2:
	Majority (1): 552, Minority (label 0): 78
Number of 'good' native counterfactuals in data: 343
Data distribution after iteration 3:
	Majority (1): 552, Minority (label 0): 83
Number of 'good' native counterfactuals in data: 345
Data distribution after iteration 4:
	Majority (1): 552, Minority (label 0): 88
Number of 'good' native counterfactuals in data: 343
Data distribution after iteration 5:
	Majority (1): 552, Minority (label 0): 93
Number of 'good' native counterfactuals in data: 343
Data distribution after iteration 6:
	Majority (1): 552, Minority (label 0):

Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 66:
	Majority (1): 552, Minority (label 0): 323
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 67:
	Majority (1): 552, Minority (label 0): 326
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 68:
	Majority (1): 552, Minority (label 0): 329
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 69:
	Majority (1): 552, Minority (label 0): 332
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 70:
	Majority (1): 552, Minority (label 0): 335
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 71:
	Majority (1): 552, Minority (label 0): 338
Number of 'good' native counterfactuals in data: 327
Data distribution after iteration 72:
	Majority (1): 552, Minority (label 0): 341
Number of 'good' native counterfactuals in data: 327
Da

Number of 'good' native counterfactuals in data: 325
Data distribution after iteration 128:
	Majority (1): 552, Minority (label 0): 509
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 129:
	Majority (1): 552, Minority (label 0): 512
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 130:
	Majority (1): 552, Minority (label 0): 515
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 131:
	Majority (1): 552, Minority (label 0): 518
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 132:
	Majority (1): 552, Minority (label 0): 521
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 133:
	Majority (1): 552, Minority (label 0): 524
Number of 'good' native counterfactuals in data: 324
Data distribution after iteration 134:
	Majority (1): 552, Minority (label 0): 527
Number of 'good' native counterfactuals in data:

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 3:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 549, Minority (label 0): 56
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 0:
	Majority (1): 549, Minority (label 0): 86
Number of 'good' native counterfactuals in data: 341
Data distribution after iteration 1:
	Majority (1): 549, Minority (label 0): 97
Number of 'good' native counterfactuals in data: 348
Data distribution after iteration 2:
	Majority (1): 549, Minority (label 0): 105
Number of 'good' native counterfactuals in data: 349
Data distribution after iteration 3:
	Majority (1): 549, Minority (label 0): 112
Number of 'good' native counterfactuals in data: 347
Data distribution after iteration 4:
	Majority (1): 549, Minority (label 0): 119
Number of 'good' native counterfactuals in data: 354
Data distribution after iteration 5:
	Majority (1): 549, Minority (label 0): 126
Number of 'good' native counterfactuals in data: 354
Data distribution after iteration 6:
	Majority (1): 549, Minority (label



Fold 4:
Fitting baseline model needed for verification... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Data distribution before CFA:
	Majority (1): 553, Minority (label 0): 52
Number of 'good' native counterfactuals in data: 293
Data distribution after iteration 0:
	Majority (1): 553, Minority (label 0): 85
Number of 'good' native counterfactuals in data: 319
Data distribution after iteration 1:
	Majority (1): 553, Minority (label 0): 108
Number of 'good' native counterfactuals in data: 320
Data distribution after iteration 2:
	Majority (1): 553, Minority (label 0): 126
Number of 'good' native counterfactuals in data: 320
Data distribution after iteration 3:
	Majority (1): 553, Minority (label 0): 144
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 4:
	Majority (1): 553, Minority (label 0): 162
Number of 'good' native counterfactuals in data: 313
Data distribution after iteration 5:
	Majority (1): 553, Minority (label 0): 180
Number of 'good' native counterfactuals in data: 312
Data distribution after iteration 6:
	Majority (1): 553, Minority (labe



In [31]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1000, 'max_iter': 1000, 'solver': 'sag'}
Best AUC: 0.9389976985319674


### ADASYN

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ada.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [33]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1000, 'max_iter': 200, 'solver': 'sag'}
Best AUC: 0.9385317659744722


### Random Oversampling

In [None]:
all_scores = []

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print(f"Fold {i}:")
    scores_for_fold = []
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
        
    for param_comb in combination_dicts:
        clf = LogisticRegression(random_state=19231823, **param_comb)
        clf.fit(X_train_resampled, y_train_resampled)
        current_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        #print(f"Score for combination {param_comb}: {current_score}")
        scores_for_fold.append(current_score)
        
    all_scores.append(scores_for_fold)

In [35]:
score_avg_over_folds = np.array(all_scores).mean(axis=0)
best_score_idx = score_avg_over_folds.argmax()
print(f"Best params: {combination_dicts[best_score_idx]}")
print(f"Best AUC: {score_avg_over_folds[best_score_idx]}")

Best params: {'C': 1000, 'max_iter': 100, 'solver': 'lbfgs'}
Best AUC: 0.9447212088219494
