In [1]:
import pandas as pd 
from pandas import DataFrame, Series
import os, sys, re
import numpy as np

from decimal import Decimal
from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')  # action='once'

%matplotlib inline

## Utility Functions

In [2]:
from analyzer import interpret

def col_values(df, col='age', n=10):
    df_subset = df.sample(n=n, random_state=1) # for each column, sample n values (usually n=1)
    return df_subset[col].values

def summary(df, n=1): 
    msg = ""
    msg += "> sample sizes: {}\n".format(df.shape[0])
    msg += "> n(features):  {}\n".format(df.shape[1])
    # msg += "> list of features:\n{}\n".format(df.columns.values)
    print(msg)

    interpret(df, n=n, verbose=True)
def show_dict(adict, topn=-1, by='', header=[], n_samples=-1, ascending=False, print_=False): 
    # print(adict)
    
    # convert to two-column dataframe format 
    if not header: 
        header = ['key', 'value']
    else: 
        assert len(header) == 2
    D = {h:[] for h in header}
    for k, v in adict.items(): 
        D[header[0]].append(k)
        D[header[1]].append(v)
    
    df = DataFrame(D, columns=header)
    msg = ''
    if topn > 0: 
        assert by in df.columns
        df = df.sort_values([by, ], ascending=ascending)
        msg = tabulate(df[:topn], headers='keys', tablefmt='psql')
    else: 
        if n_samples < 0: 
            msg = tabulate(df, headers='keys', tablefmt='psql')
        else: 
            n = min(df.shape[0], n_samples)
            msg = tabulate(df.sample(n=n), headers='keys', tablefmt='psql')
    if print_: print(msg)
    return msg

### Determine and Retrieve Patient Cohort(s)

In [3]:
"""
1. Find rows whose column match a substring 
   https://davidhamann.de/2017/06/26/pandas-select-elements-by-string/
   
   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html

"""
import json 
from cohort_search import gen_code_set, gen_query_str

# use module cohort_search to generate the desired query strings 
# run CohortRetrieval on Databricks (https://dbc-25924283-13f6.cloud.databricks.com/#notebook/1971793/command/1973735)

### Load Base/Positive Dataset

In [4]:
from analyzer import load_src_data, load_data, save_data, load_performance, stratify
# from transformer import canonicalize  # ... obsolete
from transformer import resolve_duplicate
import loinc as lc
# from loinc import canonicalize

######################################
# params
cohort = domain = 'hepatitis-c'
token_default = 'unknown'
col_target = 'test_result_loinc_code'
######################################

# load source data
ts0 = load_src_data(cohort=cohort, warn_bad_lines=False, canonicalized=True, processed=False)

print("> dim(ts0): {}".format(ts0.shape))  
loinc_set = codes0 = ts0[col_target].unique()
codes0_subset = np.random.choice(codes0, 30)
print("> N(loinc_set): {} | example codes (source):\n{}\n".format(len(loinc_set), list(codes0_subset)))

# stratify the data by class labels (e.g. LOINC codes)
ds = stratify(ts0, col=col_target) # code|mean|std|n_pos
Nmax0 = ds[0][1]  
# ... here we have not added extra control data yet (i.e. patients without the target disease such hepatitis C)

# compare to the previously generated performance dataframe
df_perf = load_performance(input_dir='result', cohort=cohort)
df_perf = df_perf.sort_values(by=['n_pos', ], ascending=False)
Nmax = np.max(df_perf['n_pos'].values)
Nm = np.median(df_perf['n_pos'].values)

ss_dict = dict(zip(df_perf['code'].values, df_perf['n_pos'].values))
print("> Set the baseline class sample size: Nmax: {} (Nmax0: {}, median: {})".format(Nmax, Nmax0, Nm))

topn = 10
print("> top {} sample sizes:\n{}\n".format(topn, show_dict(ss_dict, topn=topn, 
        by='n_pos', header=['code', 'n_pos'])))

######################################
"""
Conclusion 
----------

Cohort: hepatitis-c

1. max sample size with known code: 67686 => 57202
2. loinc_set: target loinc codes
   size(loinc_set): 733
3. ts0: dim=(71224, 127)


"""

[load] Loading default input data: andromeda-pond-hepatitis-c.csv
(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)
> dim(ts0): (71224, 127)
> N(loinc_set): 733 | example codes (source):
['64717', '21626', '422410', '143388', '265249', '163485', '268748', '178566', '139501', '265116', '264747', '303644', '433045', '264648', '24588', '93187', '109009', '295980', '301804', '298935', '264663', '177915', '192997', '17426', '193433', '169490', '320184', '64634', '339143', '502211']

> dim(performance matrix): (733, 5)
> Set the baseline class sample size: Nmax: 5593 (Nmax0: 5593, median: 3.0)
> top 10 sample sizes:
+----+---------+---------+
|    | code    |   n_pos |
|----+---------+---------|
|  0 | unknown |    5593 |
|  1 | 67686   |    3120 |
|  2 | 110114  |    2521 |
|  3 | 19752   |    2423 |
|  4 | 17426   |    2405 |
|  5 | 17517   |    2368 |
|  6 | 7187    |    2037 |
|  7 | 7773    |    2037 |
|  8 | 21600   |    2

'\nConclusion \n----------\n\nCohort: hepatitis-c\n\n1. max sample size with known code: 67686 => 57202\n2. loinc_set: target loinc codes\n   size(loinc_set): 733\n3. ts0: dim=(71224, 127)\n\n\n'

### Load Control Data

In [5]:
# patients not in given domain/disease
from cohort_search import filter_by_diagnosis
from analyzer import save_data
from transformer import resolve_duplicate

######################################
# params
tFilter = False
tSave = True
tAddCtrl = True
output_dir = 'data'
######################################

N0 = ts0.shape[0]

# load source data
ts_ctrl = load_src_data(input_file='andromeda_pond-10p.csv', canonicalized=True)
# ts_ctrl = ts_ctrl.drop_duplicates(keep='last')  # drop duplicates
# ts_ctrl = resolve_duplicate(ts_ctrl)
# ... generate a new variable: 'count'
# ts_ctrl = lc.canonicalize(ts_ctrl, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
# ... now included by default within load_data()

print("> dim(ts_ctrl): {}".format(ts_ctrl.shape))  

if tFilter: 
    ts_ctrl = filter_by_diagnosis(ts_ctrl, condition='hepatitis c')
    nctrl = ts_ctrl.shape[0]
    print("> sample size | orig: {}, filtered(control): {}".format(n0, nctrl))

# summary(df_ctrl, n=1)
# df_ctrl.info()   

# save 
if tSave: 
    output_file = f"andromeda-pond-{cohort}-ctrl.csv" 
    save_data(ts_ctrl,  output_file=output_file, sep=',')
    
# mix-in 
if tAddCtrl: 
    Nctrl = ts_ctrl.shape[0]
    assert Nctrl > N0, f"Control data is too small | n({cohort})={N0} > n(ctrl)={Nctrl}"
    ts_ctrl = ts_ctrl.sample(n=N0, replace=False)
    
    ts0 = pd.concat([ts0, ts_ctrl], ignore_index=True)
    
assert np.sum(ts0[col_target].isnull()) == 0
print("> Adding control data | N: {} -> {}".format(N0, ts0.shape[0]))

[load] Loading default input data: andromeda_pond-10p.csv


b'Skipping line 256: expected 127 fields, saw 128\nSkipping line 511: expected 127 fields, saw 129\nSkipping line 5917: expected 127 fields, saw 133\nSkipping line 7086: expected 127 fields, saw 128\nSkipping line 7750: expected 127 fields, saw 128\n'
b'Skipping line 10545: expected 127 fields, saw 131\nSkipping line 14112: expected 127 fields, saw 128\nSkipping line 14113: expected 127 fields, saw 128\nSkipping line 15206: expected 127 fields, saw 135\n'
b'Skipping line 20277: expected 127 fields, saw 130\n'
b'Skipping line 25255: expected 127 fields, saw 136\nSkipping line 30914: expected 127 fields, saw 164\n'
b'Skipping line 34649: expected 127 fields, saw 130\nSkipping line 37064: expected 127 fields, saw 131\nSkipping line 39673: expected 127 fields, saw 128\nSkipping line 39674: expected 127 fields, saw 128\nSkipping line 39676: expected 127 fields, saw 128\nSkipping line 40260: expected 127 fields, saw 132\nSkipping line 40291: expected 127 fields, saw 129\nSkipping line 40614:

b'Skipping line 312711: expected 127 fields, saw 132\nSkipping line 313105: expected 127 fields, saw 128\nSkipping line 314345: expected 127 fields, saw 128\nSkipping line 315877: expected 127 fields, saw 129\nSkipping line 317175: expected 127 fields, saw 129\n'
b'Skipping line 323634: expected 127 fields, saw 130\nSkipping line 323650: expected 127 fields, saw 131\n'
b'Skipping line 331833: expected 127 fields, saw 148\nSkipping line 332177: expected 127 fields, saw 142\nSkipping line 334586: expected 127 fields, saw 130\n'
b'Skipping line 336654: expected 127 fields, saw 128\nSkipping line 341873: expected 127 fields, saw 128\n'
b'Skipping line 345036: expected 127 fields, saw 129\nSkipping line 346459: expected 127 fields, saw 131\nSkipping line 346788: expected 127 fields, saw 134\nSkipping line 348949: expected 127 fields, saw 130\nSkipping line 348994: expected 127 fields, saw 128\n'
b'Skipping line 353627: expected 127 fields, saw 130\nSkipping line 355166: expected 127 fields,

b'Skipping line 590629: expected 127 fields, saw 130\nSkipping line 595449: expected 127 fields, saw 130\nSkipping line 595463: expected 127 fields, saw 130\nSkipping line 596527: expected 127 fields, saw 130\n'
b'Skipping line 599747: expected 127 fields, saw 131\nSkipping line 599973: expected 127 fields, saw 141\nSkipping line 600752: expected 127 fields, saw 128\n'
b'Skipping line 606853: expected 127 fields, saw 129\nSkipping line 608275: expected 127 fields, saw 128\n'
b'Skipping line 615975: expected 127 fields, saw 129\nSkipping line 619136: expected 127 fields, saw 128\nSkipping line 620929: expected 127 fields, saw 130\n'
b'Skipping line 628790: expected 127 fields, saw 128\nSkipping line 628793: expected 127 fields, saw 128\nSkipping line 628794: expected 127 fields, saw 128\nSkipping line 629569: expected 127 fields, saw 128\n'
b'Skipping line 631936: expected 127 fields, saw 128\nSkipping line 632185: expected 127 fields, saw 128\nSkipping line 632331: expected 127 fields,

b'Skipping line 885532: expected 127 fields, saw 128\nSkipping line 888688: expected 127 fields, saw 130\nSkipping line 890608: expected 127 fields, saw 130\nSkipping line 891400: expected 127 fields, saw 128\n'
b'Skipping line 893962: expected 127 fields, saw 134\nSkipping line 894471: expected 127 fields, saw 128\nSkipping line 897155: expected 127 fields, saw 131\nSkipping line 897175: expected 127 fields, saw 129\n'
b'Skipping line 905330: expected 127 fields, saw 128\nSkipping line 907408: expected 127 fields, saw 128\nSkipping line 907851: expected 127 fields, saw 131\n'
b'Skipping line 910890: expected 127 fields, saw 132\nSkipping line 910933: expected 127 fields, saw 129\nSkipping line 911173: expected 127 fields, saw 135\nSkipping line 911180: expected 127 fields, saw 144\nSkipping line 913057: expected 127 fields, saw 129\nSkipping line 914740: expected 127 fields, saw 128\nSkipping line 917499: expected 127 fields, saw 128\nSkipping line 917500: expected 127 fields, saw 130

b'Skipping line 1149200: expected 127 fields, saw 128\nSkipping line 1151607: expected 127 fields, saw 128\nSkipping line 1153131: expected 127 fields, saw 129\nSkipping line 1153303: expected 127 fields, saw 135\nSkipping line 1153807: expected 127 fields, saw 134\nSkipping line 1154441: expected 127 fields, saw 130\nSkipping line 1155012: expected 127 fields, saw 128\n'
b'Skipping line 1158360: expected 127 fields, saw 130\nSkipping line 1159215: expected 127 fields, saw 128\nSkipping line 1161997: expected 127 fields, saw 129\n'
b'Skipping line 1165521: expected 127 fields, saw 131\nSkipping line 1166351: expected 127 fields, saw 128\nSkipping line 1167166: expected 127 fields, saw 134\nSkipping line 1167592: expected 127 fields, saw 148\nSkipping line 1167916: expected 127 fields, saw 138\nSkipping line 1169383: expected 127 fields, saw 130\nSkipping line 1171852: expected 127 fields, saw 128\n'
b'Skipping line 1172217: expected 127 fields, saw 130\nSkipping line 1178310: expected 

b'Skipping line 1426387: expected 127 fields, saw 128\nSkipping line 1426397: expected 127 fields, saw 132\nSkipping line 1429280: expected 127 fields, saw 131\nSkipping line 1434262: expected 127 fields, saw 130\n'
b'Skipping line 1436489: expected 127 fields, saw 128\nSkipping line 1437192: expected 127 fields, saw 133\nSkipping line 1437854: expected 127 fields, saw 128\nSkipping line 1438348: expected 127 fields, saw 130\nSkipping line 1439732: expected 127 fields, saw 130\nSkipping line 1442219: expected 127 fields, saw 129\n'
b'Skipping line 1444192: expected 127 fields, saw 128\nSkipping line 1445893: expected 127 fields, saw 129\nSkipping line 1450159: expected 127 fields, saw 130\n'
b'Skipping line 1452365: expected 127 fields, saw 128\nSkipping line 1452367: expected 127 fields, saw 128\nSkipping line 1452368: expected 127 fields, saw 128\nSkipping line 1452369: expected 127 fields, saw 128\nSkipping line 1452370: expected 127 fields, saw 128\nSkipping line 1452371: expected 

b'Skipping line 1706227: expected 127 fields, saw 130\nSkipping line 1706285: expected 127 fields, saw 130\nSkipping line 1712181: expected 127 fields, saw 128\nSkipping line 1712397: expected 127 fields, saw 130\nSkipping line 1712578: expected 127 fields, saw 134\n'
b'Skipping line 1714574: expected 127 fields, saw 128\nSkipping line 1715874: expected 127 fields, saw 129\nSkipping line 1716176: expected 127 fields, saw 131\nSkipping line 1716986: expected 127 fields, saw 128\nSkipping line 1720583: expected 127 fields, saw 130\n'
b'Skipping line 1721845: expected 127 fields, saw 138\nSkipping line 1727001: expected 127 fields, saw 128\nSkipping line 1729172: expected 127 fields, saw 129\n'
b'Skipping line 1729847: expected 127 fields, saw 130\nSkipping line 1730161: expected 127 fields, saw 128\nSkipping line 1730162: expected 127 fields, saw 128\nSkipping line 1730163: expected 127 fields, saw 128\nSkipping line 1730164: expected 127 fields, saw 128\nSkipping line 1731539: expected 

b'Skipping line 2008931: expected 127 fields, saw 130\nSkipping line 2010181: expected 127 fields, saw 128\nSkipping line 2011005: expected 127 fields, saw 129\nSkipping line 2011192: expected 127 fields, saw 134\nSkipping line 2012724: expected 127 fields, saw 132\nSkipping line 2013700: expected 127 fields, saw 128\nSkipping line 2014254: expected 127 fields, saw 130\nSkipping line 2014420: expected 127 fields, saw 130\n'
b'Skipping line 2016602: expected 127 fields, saw 128\nSkipping line 2019017: expected 127 fields, saw 128\nSkipping line 2019018: expected 127 fields, saw 128\nSkipping line 2019019: expected 127 fields, saw 128\nSkipping line 2019020: expected 127 fields, saw 128\nSkipping line 2020212: expected 127 fields, saw 128\nSkipping line 2020213: expected 127 fields, saw 128\nSkipping line 2020214: expected 127 fields, saw 128\nSkipping line 2020977: expected 127 fields, saw 146\n'
b'Skipping line 2026787: expected 127 fields, saw 128\nSkipping line 2032171: expected 127 

b'Skipping line 2280427: expected 127 fields, saw 128\nSkipping line 2280428: expected 127 fields, saw 128\nSkipping line 2280429: expected 127 fields, saw 128\nSkipping line 2281041: expected 127 fields, saw 130\nSkipping line 2282551: expected 127 fields, saw 128\nSkipping line 2282552: expected 127 fields, saw 128\nSkipping line 2282556: expected 127 fields, saw 128\nSkipping line 2282668: expected 127 fields, saw 128\nSkipping line 2282670: expected 127 fields, saw 128\nSkipping line 2282671: expected 127 fields, saw 128\nSkipping line 2284235: expected 127 fields, saw 132\nSkipping line 2284294: expected 127 fields, saw 128\nSkipping line 2284972: expected 127 fields, saw 130\nSkipping line 2285291: expected 127 fields, saw 132\n'
b'Skipping line 2287700: expected 127 fields, saw 129\nSkipping line 2290998: expected 127 fields, saw 128\nSkipping line 2291745: expected 127 fields, saw 128\nSkipping line 2292079: expected 127 fields, saw 144\n'
b'Skipping line 2296407: expected 127 

b'Skipping line 2551401: expected 127 fields, saw 130\nSkipping line 2552742: expected 127 fields, saw 131\nSkipping line 2556787: expected 127 fields, saw 128\nSkipping line 2556788: expected 127 fields, saw 128\nSkipping line 2556900: expected 127 fields, saw 134\n'
b'Skipping line 2559847: expected 127 fields, saw 128\nSkipping line 2559848: expected 127 fields, saw 128\nSkipping line 2559849: expected 127 fields, saw 128\nSkipping line 2559850: expected 127 fields, saw 128\nSkipping line 2559851: expected 127 fields, saw 128\nSkipping line 2561384: expected 127 fields, saw 128\nSkipping line 2561413: expected 127 fields, saw 130\nSkipping line 2565166: expected 127 fields, saw 130\nSkipping line 2565198: expected 127 fields, saw 129\n'
b'Skipping line 2566270: expected 127 fields, saw 132\nSkipping line 2567912: expected 127 fields, saw 130\nSkipping line 2568102: expected 127 fields, saw 144\nSkipping line 2569523: expected 127 fields, saw 130\n'
b'Skipping line 2574653: expected 

b'Skipping line 2820314: expected 127 fields, saw 131\nSkipping line 2823853: expected 127 fields, saw 128\nSkipping line 2824079: expected 127 fields, saw 130\n'
b'Skipping line 2831076: expected 127 fields, saw 129\nSkipping line 2832000: expected 127 fields, saw 128\n'
b'Skipping line 2836429: expected 127 fields, saw 130\nSkipping line 2836540: expected 127 fields, saw 135\nSkipping line 2838034: expected 127 fields, saw 128\nSkipping line 2842677: expected 127 fields, saw 128\nSkipping line 2842695: expected 127 fields, saw 131\nSkipping line 2843575: expected 127 fields, saw 130\n'
b'Skipping line 2845399: expected 127 fields, saw 128\nSkipping line 2845400: expected 127 fields, saw 128\nSkipping line 2845401: expected 127 fields, saw 128\nSkipping line 2845796: expected 127 fields, saw 128\nSkipping line 2847493: expected 127 fields, saw 130\nSkipping line 2848922: expected 127 fields, saw 128\nSkipping line 2849326: expected 127 fields, saw 128\n'
b'Skipping line 2852507: expec

(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)
> dim(ts_ctrl): (2891340, 127)
(save_data) Saved dataframe (dim=(2891340, 127)) to:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c-ctrl.csv

> Adding control data | N: 71224 -> 142448


### Define Feature Set 

Note: Subsetting columns could potentially reduce the size of the data

In [6]:
"""
Memo
----
1. medivo_test_result_type is a function of the following attributes: 
      "meta_sender_name",
      "receiving_organization_id",
      "test_order_code",
      "test_order_name",
      "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"
      
"""
from transformer import to_age
from analyzer import sample_col_values
from loinc import FeatureSet

cat_cols = FeatureSet.cat_cols

# ['patient_gender',
#  'patient_state',
#  'patient_bill_type',
#  'fasting',
#  'performing_organization_id',
#  'receiving_organization_id',
#  'test_result_status',
#  'test_order_code',
#  'test_order_name',
#  'test_result_code',
#  'test_result_name',
#  'test_result_value',
#  'test_result_range',
#  'test_result_abnormal_flag',
#  'test_result_reference_range',
#  'test_result_units_of_measure',
#  'test_result_comments',
#  'test_cpt_code',
#  'panel_order_code',
#  'panel_order_name',
#  'meta_sender_name',
#  'medivo_test_result_type']

cont_cols = FeatureSet.cont_cols  

target_cols = FeatureSet.target_cols # ['test_result_loinc_code', ]

derived_cols = FeatureSet.derived_cols
# ['count',  # due to resolve_duplicate()
# ]

# cardinality < 100
low_card_cols = ['patient_gender', 'fasting', 'meta_sender_name' ]
high_card_cols = list(set(cat_cols)-set(low_card_cols))

representative_cols = ["meta_sender_name",
      # "receiving_organization_id",
      # "test_order_code",
      "test_order_name",
      # "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"]

target_columns = cat_cols + cont_cols + target_cols

# feature transformation
#####################################################
# to_age(ts0)
# values = sample_col_values(ts0, col='age', n=10)
# print("> age: {}".format(values))

### Balance Classes 

note: balance classes from external dataset

In [7]:
from analyzer import balance_data_incr, load_data_incr, stratify
from transformer import resolve_duplicate
import loinc as lc
from loinc import LoincTSet

# run here or just load the curated data generated by analyzer.t_stratify()
tLoad = True
tSave = not tLoad

col_target = LoincTSet.col_target
token_default = token_missing = 'unknown'

ts = ts0
ds = stratify(ts, col='test_result_loinc_code', ascending=False)
print("(balance_classes) data size distribution:\n{}\n".format(ds[:25]))
ds = [(code, size) for code, size in ds if not code in lc.LoincTSet.non_codes]
max_size = ds[0][1] # ds[2][1]
codes_low_sz = set([code for code, size in ds if size < max_size])
print("(balance_classes) We have n={} with low sample size (< {})".format(len(codes_low_sz), max_size))

if tLoad: 
    input_file=f"andromeda-pond-{cohort}-balanced.csv"
    ts = load_src_data(input_file=input_file, warn_bad_lines=False)
    ts[col_target] = ts[col_target].astype(str)
    # ts = ts.drop_duplicates(keep='last')  # drop duplicates 
    # ts = lc.canonicalize(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
    if not lc.is_canonicalized(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set): 
        ts = ts.drop_duplicates(keep='last')  # drop duplicates 
        ts = lc.canonicalize(ts, col_target=col_target, token_missing=token_default) # noisy_values/[]
    
else: 
    # max_size = 1000
    input_file = f"andromeda-pond-{cohort}-loinc.csv"
    codes = set(loinc_set)
    codes_hit = set([])
    for i, tsi in enumerate(load_data_incr(input_file=input_file, chunksize=1000000, warn_bad_lines=False)): 
        N0 = ts.shape[0]
        # tsi = tsi.drop_duplicates(keep='last')  # drop duplicates 
        tsi = resolve_duplicate(tsi)
        tsi = lc.canonicalize(tsi, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
        print("[{}] Processing chunk #{} | n(ts): {}, n(tsi): {} ...".format(i, i+1, N0, tsi.shape[0]))

        ts_incr, hit, missed = balance_data_incr(df=ts, df_extern=tsi, n_samples=max_size, col=col_target)
        if not ts_incr.empty: ts = pd.concat([ts, ts_incr])
            
        # --- analysis --- 
        N = ts.shape[0]
        codes_hit.union(hit) # still has this many codes without a match 

        print(f"[{i}] size: {N0} -> {N}")
        ds = stratify(ts, col='test_result_loinc_code', ascending=False)
        ds = [(code, size) for code, size in ds if size < max_size]
        print("[{}] size(codes) < {} (n={}): \n{}\n".format(i, max_size, len(ds), ds[:20]))

    codes_missed = codes_low_sz - codes_hit  
    print("(t_stratify) At last, we could not find a match for n={} codes among nl={} low-sample-size labels:\n{}\n".format(
        len(codes_missed), len(codes_low_sz), codes_missed))  
    
    # down sample control data (very often the negative examples are too huge)

    if tSave: 
        
        # should not have duplicates at this point (due to resolve_duplicate() call)
        # ts = ts.drop_duplicates(keep='last')  # drop duplicates 
        
        output_file = f"andromeda-pond-{cohort}-balanced.csv"
        save_data(ts, output_file=output_file)

"""
Output
------
   ts: balanced training data; well... as balaned as possible because external data may not be able to supply sufficient 
       data for a subset of the classes/loincs
"""

(balance_classes) data size distribution:
[('unknown', 12005), ('67686', 5703), ('17426', 4418), ('19752', 4408), ('17517', 4392), ('21600', 4167), ('7187', 3826), ('7773', 3783), ('178616', 3756), ('486431', 3735), ('30973', 3686), ('19208', 3589), ('66902', 3347), ('7310', 3045), ('28233', 3005), ('7112', 2791), ('7138', 2723), ('110114', 2559), ('7708', 2463), ('45443', 2413), ('7518', 2376), ('25718', 2241), ('134577', 2175), ('20933', 2165), ('20859', 2115)]

(balance_classes) We have n=1506 with low sample size (< 5703)
[load] Loading default input data: andromeda-pond-hepatitis-c-balanced.csv
(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)
(is_canonicalized) Found n=1 codes not in target set:
{'nan'}

(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)


'\nOutput\n------\n   ts: balanced training data; well... as balaned as possible because external data may not be able to supply sufficient \n       data for a subset of the classes/loincs\n'

### Feature Transformation

note: patient_date_of_birth => age

In [8]:
from transformer import to_age
from analyzer import col_values
# from loinc import FeatureSet

to_age(ts)
values = col_values(ts, col='age', n=10)
print("> age: {}".format(values))

# resolve_duplicate() call adds a new column: count (of duplicates)
# assert 'count' in ts.columns

# datatime columns

> age: [60 34 21 84 69 78 45 56 43 24]


### Subset Features and Handling missing values

In [9]:
tCategorify = False
tDropHighMissing = False # drop columns with high rate of missing values
p_null = 0.9
token_default = token_missing = 'unknown'

df = ts # ... :)

# V = list(feature_lookup.keys())
V = cont_cols + cat_cols # + derived_cols
L = target_cols
dfX = df[V]
dfy = df[L]

print("> Given features set:\n{}\n".format(V))

assert np.sum(dfy[target_cols[0]].isnull()) == 0

# drop columns/vars with too many missing values 
N = dfX.shape[0]
n_thresh = int(N * p_null)
nf0 = nf = dfX.shape[1]
fset0 = set(dfX.columns.values)

if tDropHighMissing: 
    dfX = dfX[dfX.columns[dfX.isnull().mean() < p_null]]
    fset = set(dfX.columns.values)
    nf = dfX.shape[1]
    print("> Dropped n={} features:\n{}\n".format(nf-nf0, fset0-fset))
    
fset = set(dfX.columns.values)
print("> Final feature set (nf={}):\n{}\n".format(nf, fset))

# fill in missing values (also see default_values)
dfX.fillna(value=token_default, inplace=True)
#################################################
# Convert our three categorical columns to category dtypes.

cat_cols = [cat for cat in cat_cols if cat in dfX.columns]
cont_cols = [c for c in cont_cols if c in dfX.columns]

"""
Output
------
   dfX 
   dfy
"""

> Given features set:
['age', 'patient_gender', 'patient_state', 'patient_bill_type', 'fasting', 'performing_organization_id', 'receiving_organization_id', 'test_result_status', 'test_order_code', 'test_order_name', 'test_result_code', 'test_result_name', 'test_result_value', 'test_result_range', 'test_result_abnormal_flag', 'test_result_reference_range', 'test_result_units_of_measure', 'test_result_comments', 'test_cpt_code', 'panel_order_code', 'panel_order_name', 'meta_sender_name', 'medivo_test_result_type']

> Final feature set (nf=23):
{'test_result_name', 'fasting', 'test_result_comments', 'test_result_units_of_measure', 'test_result_range', 'age', 'test_cpt_code', 'patient_state', 'test_result_status', 'test_result_reference_range', 'patient_bill_type', 'performing_organization_id', 'patient_gender', 'test_result_abnormal_flag', 'test_order_code', 'panel_order_code', 'receiving_organization_id', 'meta_sender_name', 'test_order_name', 'test_result_value', 'medivo_test_result_typ

'\nOutput\n------\n   dfX \n   dfy\n'

### Encode Variables

In [10]:
from transformer import encode_vars 
from loinc import FeatureSet
# high_card_cols = FeatureSet.high_card_cols
nf0 = dfX.shape[1]
dfX, encoder = encode_vars(dfX, fset=cat_cols, high_card_cols=high_card_cols)
print("> After variable encoding we have dim(dfX): {} | nf: {} -> {}".format(dfX.shape, nf0, dfX.shape[1]))
print("> New feature set:\n{}\n".format(dfX.columns))
print(df[representative_cols].head(10).to_string(index=False))

(encoder_vars2) low card vars (n=['meta_sender_name', 'patient_gender', 'fasting']):
3
 ... high card vars (n=['patient_state', 'test_result_name', 'test_result_status', 'test_result_reference_range', 'patient_bill_type', 'panel_order_name', 'performing_organization_id', 'test_result_comments', 'test_result_abnormal_flag', 'test_order_code', 'panel_order_code', 'receiving_organization_id', 'test_result_units_of_measure', 'test_order_name', 'test_result_range', 'test_result_value', 'medivo_test_result_type', 'test_cpt_code', 'test_result_code']):
19

... transforming var: patient_gender ...
... transforming var: patient_state ...
... transforming var: patient_bill_type ...
... transforming var: fasting ...
... transforming var: performing_organization_id ...
... transforming var: receiving_organization_id ...
... transforming var: test_result_status ...
... transforming var: test_order_code ...
... transforming var: test_order_name ...
... transforming var: test_result_code ...
... tran

### Encode Labels 

In [11]:
from analyzer import encode_labels, summarize_dict, get_sample_sizes
import collections, operator

# verify
assert dfX.shape[0] == dfy.shape[0], "> dim(dfX): {} | dfy.cols: {}".format(dfX.shape, dfy.columns.values)

codebook={'pos': 1, 'neg': 0, '+': 1, '-': 0}

# choose the one with a large sample size as 'positive'
col_label = 'test_result_loinc_code' # strings

topn = 5
sizes = get_sample_sizes(dfy[col_label])
# ... sizes: (loinc) label -> sample size
# print("> n(sizes): {}".format(len(sizes)))  # 734 for cohort='hepatitis-c'

# Q: How many classes/codes have less than N instances? 
N_low = 1000
n_low = sum(1 for l, c in sizes.items() if c < N_low)
print("> Low sample size classes | n={} (< {})".format(n_low, N_low))

N_elow = 10
n_elow = sum(1 for l, c in sizes.items() if c < N_elow)
print("> Extreme low sample size classes | n={} (< {})".format(n_elow, N_elow))

# Q: How many classes/codes were able to match the most enriched class in terms of sample size (e.g. 5707)? 
eps = 10
n_matched = sum(1 for l, c in sizes.items() if c >= max_size-eps)
print("> n_matched: {} | max_size: {}".format(n_matched, max_size))

# sort by values
sizes_sorted = sorted(sizes.items(), key=operator.itemgetter(1))
summarize_dict(sizes, topn=15, sort_=True)

print("> sizes: {}".format(sizes.most_common(20)))
most_sample_sizes = sizes.most_common(topn)  # take(topn, sizes.items())
print("> Sample sizes | Top N={} codes:\n{}\n".format(topn, most_sample_sizes))
least_sample_sizes = sizes.most_common()[:-topn-1:-1]
print("> Sample sizss | Last N={} codes:\n{}\n".format(topn, least_sample_sizes))

# test
target = most_sample_sizes[0][0]
y = encode_labels(dfy, pos_label=target, codebook=codebook, verbose=1)

> Low sample size classes | n=698 (< 1000)
> Extreme low sample size classes | n=395 (< 10)
> n_matched: 2 | max_size: 5703
[96107] -> 1
[772079] -> 1
[356188] -> 1
[160119] -> 1
[448134] -> 1
[159756] -> 1
[206649] -> 1
[80100] -> 1
[223123] -> 1
[71001] -> 1
[156679] -> 1
[530170] -> 1
[264663] -> 1
[52092] -> 1
[148049] -> 1
> sizes: [('nan', 980139), ('67686', 5872), ('17426', 4647), ('17517', 4602), ('19752', 4562), ('21600', 4379), ('unknown', 4267), ('178616', 4001), ('7773', 3938), ('7187', 3917), ('486431', 3861), ('30973', 3851), ('19208', 3726), ('66902', 3405), ('110114', 3253), ('28233', 3142), ('7310', 3104), ('7112', 2848), ('7138', 2827), ('7708', 2503)]
> Sample sizes | Top N=5 codes:
[('nan', 980139), ('67686', 5872), ('17426', 4647), ('17517', 4602), ('19752', 4562)]

> Sample sizss | Last N=5 codes:
[('76919', 1), ('206060', 1), ('62372', 1), ('162339', 1), ('64204', 1)]

(encode_labels) sample size: Counter({1: 980139, 0: 143360})


## Initial Model Training

### 1. Feature Selection

In [12]:
"""
Ref
---
1. pip install feature-selector

   https://github.com/WillKoehrsen/feature-selector
   
   possible dependency 
      brew install libomp
      
   <debug> 
       + RuntimeError: Python is not installed as a framework.
          > https://stackoverflow.com/questions/34977388/matplotlib-runtimeerror-python-is-not-installed-as-a-framework
   
"""
import feature_selector 

### 2. Save a copy of encoded training set (optional)

In [13]:
tSaveEncodedTSet = True 

if tSaveEncodedTSet: 
    output_file = f"andromeda-pond-{cohort}-encoded.csv"
    save_data(ts, output_file=output_file)
    

(save_data) Saved dataframe (dim=(1123499, 129)) to:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c-encoded.csv



### 3. Model Training

In [14]:
import utils_tree, utils_sys, analyzer
import collections
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from analyzer import balance_by_downsampling

# data transformation
col = 'test_result_loinc_code'
X, y = dfX.values, dfy[col].values
print("> dim(X): {}, sample(y): {}".format(X.shape, np.random.choice(np.unique(y),20) ))

# feature scaling
scaler = MinMaxScaler() # MinMaxScaler(), StandardScaler()
X = scaler.fit_transform(X)

n_fold = 5
n_min = n_fold

# to save performance data
header = ['code', 'mean', 'std', 'n_pos']
sdict = {h:[] for h in header}
for code in loinc_set: 
    y_eff = analyzer.encode_labels(y, pos_label=code)
    
    counter = collections.Counter(y_eff)
    n_pos, n_neg = counter[codebook['pos']], counter[codebook['neg']]
    print("> sample size | n(+): {}, n(-): {} | dim(y_encoded): {}".format(n_pos, n_neg, y_eff.shape))
    
    if n_pos >= n_min: 
        # downsampling majority classes (very often we have too many negative instances)
        X_eff, y_eff = balance_by_downsampling(X, y_eff, method='multiple', majority_max=3)
        n_pos, n_neg = counter[codebook['pos']], counter[codebook['neg']]
        print(f"> sample size (after downsampling majority) | n(+): {n_pos}, n(-): {n_neg}")
        
        scores = analyzer.eval_performance(X_eff, y_eff, model=None, cv=n_fold, random_state=53, verbose=1)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print("> average: {}, std: {}".format(mean_score, std_score))
    else: 
        print("> (positive) sample size too small, n={}".format(n_pos))
        mean_score = -1 
        std_score = -1
    sdict['code'].append(code)
    sdict['mean'].append(mean_score)
    sdict['std'].append(std_score)
    sdict['n_pos'].append(n_pos)

# --------------------------------------------------
# save performance dataframe
df_perf = DataFrame(sdict, columns=header)
df_perf = df_perf.sort_values(by=['mean', ]) # ascending=False
analyzer.save_performance(df, output_dir='result') # output_file/'' (performance-<cohort>.csv)

cohort = 'hepatitis-c'
output_dir = os.path.join(os.getcwd(), 'result')
output_file = f"performance-{cohort}-0.csv" 
output_path = os.path.join(output_dir, output_file)
df_perf.to_csv(output_path, sep='|', index=False, header=True)

for code, score in zip(df_perf['code'], df_perf['mean']):
    print(f"[{code}] -> {score}")

> dim(X): (1123499, 243), sample(y): ['56853' '7708' '103317' '295980' '25007' '339440' '115725' '66902'
 '169151' '10058' '39347' '160119' '177915' '21600' '505610' '28886'
 '204545' '7096' '143149' '139907']
(encode_labels) sample size: Counter({0: 1118852, 1: 4647})
> sample size | n(+): 4647, n(-): 1118852 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 4647, n(-): 1118852
> 0 of KFold 5
> Fmax: 0.9983879634605051 p_th: 0.92472648938096 | F1: 0.9973161567364466, AUC: 0.9999791584731806
> 1 of KFold 5
> Fmax: 0.998389694041868 p_th: 0.6156882545750916 | F1: 0.998389694041868, AUC: 0.9999911294179356
> 2 of KFold 5
> Fmax: 0.9978517722878624 p_th: 0.9223685692664245 | F1: 0.9967845659163987, AUC: 0.999987658320606
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9115691417877637 | F1: 0.9989247311827958, AUC: 1.0
> 4 of KFold 5
> Fmax: 

> sample size | n(+): 973, n(-): 1122526 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 973, n(-): 1122526
> 0 of KFold 5
> Fmax: 0.9974293059125964 p_th: 0.7930443192916324 | F1: 0.9948717948717949, AUC: 0.9998682824025289
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.841096526684512 | F1: 0.9974424552429668, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9948717948717948 p_th: 0.8630448451505258 | F1: 0.9948717948717948, AUC: 0.9999735206891682
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7245684662283568 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.956380427368394 | F1: 0.9873417721518987, AUC: 1.0
> average: 0.9984602201568784, std: 0.0020519427916753322
(encode_labels) sample size: Counter({0: 1119120, 1: 4379})
> sample size | n(+): 4379, n(-): 1119120 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labe

> Fmax: 0.9991235758106924 p_th: 0.7138506700874838 | F1: 0.9965034965034965, AUC: 0.999998972841941
> average: 0.9994735288291142, std: 0.00042986338535625304
(encode_labels) sample size: Counter({0: 1119582, 1: 3917})
> sample size | n(+): 3917, n(-): 1119582 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 3917, n(-): 1119582
> 0 of KFold 5
> Fmax: 0.9993618379068284 p_th: 0.6684909275431046 | F1: 0.9993618379068284, AUC: 0.9999994567676699
> 1 of KFold 5
> Fmax: 0.9993618379068284 p_th: 0.9525800480928899 | F1: 0.996817313812858, AUC: 0.999997286148502
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9594603514607686 | F1: 0.9974554707379135, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9993618379068284 p_th: 0.916647674724642 | F1: 0.9974522292993631, AUC: 0.9999994565365072
> 4 of KFold 5
> Fmax: 0.9974457215836526 p_th: 0.8307176172145435 | F1

> Fmax: 0.995405819295559 p_th: 0.9094634473634876 | F1: 0.9938837920489296, AUC: 0.9993608133669609
> 4 of KFold 5
> Fmax: 0.995398773006135 p_th: 0.8309598426667172 | F1: 0.9946360153256705, AUC: 0.9993913869841847
> average: 0.9955576613564092, std: 0.0012222869210699707
(encode_labels) sample size: Counter({0: 1122425, 1: 1074})
> sample size | n(+): 1074, n(-): 1122425 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 1074, n(-): 1122425
> 0 of KFold 5
> Fmax: 0.9907407407407408 p_th: 0.8469155127278413 | F1: 0.9884526558891455, AUC: 0.9990769785469623
> 1 of KFold 5
> Fmax: 0.9907407407407407 p_th: 0.5032524542905642 | F1: 0.9907407407407407, AUC: 0.9986452220531767
> 2 of KFold 5
> Fmax: 0.9976798143851509 p_th: 0.8647429553903304 | F1: 0.9953703703703703, AUC: 0.9999205546728297
> 3 of KFold 5
> Fmax: 0.995370370370370

> Fmax: 1.0 p_th: 0.922054958210193 | F1: 0.9989888776541962, AUC: 1.0
> average: 0.999392915162501, std: 0.0004956849434250422
(encode_labels) sample size: Counter({0: 1123485, 1: 14})
> sample size | n(+): 14, n(-): 1123485 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 14, n(-): 1123485
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9696668131916392 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9724800618723248 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9369812336284101 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9371025361818622 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.1962643615339869 | F1: 0.8, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1121047, 1: 2452})
> sample size | n(+): 2452, n(-): 1121047 | dim(y_encoded): (1123499,)
(balance_by_downsampling) di

> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123131, 1: 368})
> sample size | n(+): 368, n(-): 1123131 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 368, n(-): 1123131
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9277032185269851 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9726027397260274 p_th: 0.9710756226797478 | F1: 0.9419354838709678, AUC: 0.9987159104806164
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.828316423160265 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9460271110288854 | F1: 0.9931972789115647, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7620954482498857 | F1: 0.9932885906040269, AUC: 1.0
> average: 0.9945205479452055, std: 0.010958904109589041
(encode_labels) sample size: Counter({0: 1123319, 1: 180})
> sample size | n(+): 180, n(-): 1123319 | dim(y_encoded): (1123499,)
(balance_by_downsampling

> Fmax: 0.967741935483871 p_th: 0.576792058496042 | F1: 0.967741935483871, AUC: 0.9988304093567252
> 2 of KFold 5
> Fmax: 0.989010989010989 p_th: 0.5442581687145859 | F1: 0.989010989010989, AUC: 0.9998329156223893
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7951939051604533 | F1: 0.9777777777777777, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9355695305124856 | F1: 0.9777777777777777, AUC: 1.0
> average: 0.991350584898972, std: 0.012548140745894505
(encode_labels) sample size: Counter({0: 1123247, 1: 252})
> sample size | n(+): 252, n(-): 1123247 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 252, n(-): 1123247
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.500599516932866 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9902912621359222 p_th: 0.6352632144413509 | F1: 0.9902912621359222, AUC: 0.9998701467341903
> 2 of KFold 5
> Fmax: 0.990291262135

> Fmax: 0.9955156950672646 p_th: 0.1828212030464557 | F1: 0.9954751131221719, AUC: 0.9999731074357939
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9698686954596438 | F1: 0.9955555555555555, AUC: 1.0
> average: 0.9973092727489984, std: 0.0021971168703661006
(encode_labels) sample size: Counter({0: 1123057, 1: 442})
> sample size | n(+): 442, n(-): 1123057 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 442, n(-): 1123057
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.801708079028407 | F1: 0.9723756906077349, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.7802231032503405 | F1: 0.9944134078212291, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9887640449438202 p_th: 0.2966390138243345 | F1: 0.9886363636363636, AUC: 0.9991520033919864
> 3 of KFold 5
> Fmax: 0.9832402234636871 p_th: 0.31222412226179447 | F1: 0.9714285714285714, AUC: 0.9996140651801029
> 4 of KFold 5

> Fmax: 1.0 p_th: 0.89762317754459 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9448953704566705 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.888888888888889 p_th: 0.15086993486331693 | F1: 0.8571428571428571, AUC: 0.9750000000000001
> average: 0.9777777777777779, std: 0.04444444444444442
(encode_labels) sample size: Counter({0: 1123226, 1: 273})
> sample size | n(+): 273, n(-): 1123226 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 273, n(-): 1123226
> 0 of KFold 5
> Fmax: 0.972972972972973 p_th: 0.9372283925349528 | F1: 0.9557522123893805, AUC: 0.9983370288248337
> 1 of KFold 5
> Fmax: 0.9649122807017544 p_th: 0.1480033012557228 | F1: 0.9549549549549549, AUC: 0.9980044345898004
> 2 of KFold 5
> Fmax: 0.9908256880733944 p_th: 0.4415498760927347 | F1: 0.9906542056074767, AUC: 0.9998870822041553
> 3 of KFold 5
> Fm

> Fmax: 1.0 p_th: 0.9201803510976984 | F1: 0.9600000000000001, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.3352033302381314 | F1: 0.9787234042553191, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9314988274221527 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7837029243043688 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9600000000000001 p_th: 0.8796973570114667 | F1: 0.9600000000000001, AUC: 0.9976851851851851
> average: 0.992, std: 0.01599999999999997
(encode_labels) sample size: Counter({0: 1123428, 1: 71})
> sample size | n(+): 71, n(-): 1123428 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 71, n(-): 1123428
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9868876225385528 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9638362236229499 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9509320970532065 | F1: 1.0, AU

> 0 of KFold 5
> Fmax: 1.0 p_th: 0.7641008968683658 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.859116419224149 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.8923962011327887 | F1: 0.9, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.2363390211404617 | F1: 0.9411764705882353, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7957978821478014 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123444, 1: 55})
> sample size | n(+): 55, n(-): 1123444 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 55, n(-): 1123444
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9588858829338344 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9152969515188225 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.8502044758556543 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.624296288476664 | F1: 

> Fmax: 1.0 p_th: 0.9674725491547584 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.4751563653529249 | F1: 0.8, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.972196261619922 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.38488791373619513 | F1: 0.8571428571428571, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9688364564665993 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1122965, 1: 534})
> sample size | n(+): 534, n(-): 1122965 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 534, n(-): 1122965
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9747656594604365 | F1: 0.9907407407407407, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9772159430908104 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.8066353780094925 | F1: 0.9907407407407407, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.972

> sample size (after downsampling majority) | n(+): 55, n(-): 1123444
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9593574933252628 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.6802348575924403 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.8869538209447378 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8943993378877916 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.8317928987454651 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123327, 1: 172})
> sample size | n(+): 172, n(-): 1123327 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 172, n(-): 1123327
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9883835058715597 | F1: 0.9855072463768115, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9848722894769096 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9894723617878748 | F

> Fmax: 0.9850746268656716 p_th: 0.8678292071260791 | F1: 0.9850746268656716, AUC: 0.9921992199219922
> 3 of KFold 5
> Fmax: 0.9850746268656716 p_th: 0.9497194358641522 | F1: 0.9565217391304348, AUC: 0.9924992499249925
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7314901354035438 | F1: 0.9855072463768115, AUC: 1.0
> average: 0.991044776119403, std: 0.007311909679949807
(encode_labels) sample size: Counter({0: 1123353, 1: 146})
> sample size | n(+): 146, n(-): 1123353 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 146, n(-): 1123353
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9043902353307235 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.6936316652180383 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9217749144753113 | F1: 0.983050847457627, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.5833620336584757 | F1: 1.0, AUC: 1.0
> 4 o

> Fmax: 0.9772727272727273 p_th: 0.44377199539199075 | F1: 0.9655172413793104, AUC: 0.9974955277280859
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9418076957494982 | F1: 0.9885057471264368, AUC: 1.0
> average: 0.9842668059909438, std: 0.016651056711713925
(encode_labels) sample size: Counter({0: 1123470, 1: 29})
> sample size | n(+): 29, n(-): 1123470 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 29, n(-): 1123470
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.727960769420226 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.48761568892154833 | F1: 0.888888888888889, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.6346828909292911 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9525973160516844 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7682169342337752 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample siz

> 1 of KFold 5
> Fmax: 1.0 p_th: 0.8811369717455244 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9477899095624867 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.941262030461344 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9412303926866725 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123450, 1: 49})
> sample size | n(+): 49, n(-): 1123450 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 49, n(-): 1123450
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8088356159445788 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.8247170220095912 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.6431411805390943 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.6422957190229328 | F1: 0.9523809523809523, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.5153567665437675 | F1:

(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 16, n(-): 1123483
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9857393223156976 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.987497906899585 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9610128150320529 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9845216025398026 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9288163148112962 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123410, 1: 89})
> sample size | n(+): 89, n(-): 1123410 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 89, n(-): 1123410
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9436982301499443 | F1: 1.0, AUC: 1.0
> 1 of KFold 5


> sample size | n(+): 97, n(-): 1123402 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 97, n(-): 1123402
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9192120408701439 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.1865583579327853 | F1: 0.9743589743589743, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9255571281767255 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9679255452442659 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9835453510263054 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123473, 1: 26})
> sample size | n(+): 26, n(-): 1123473 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 26, n(-): 1123473
> 

(encode_labels) sample size: Counter({0: 1123466, 1: 33})
> sample size | n(+): 33, n(-): 1123466 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 33, n(-): 1123466
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.3274422934484662 | F1: 0.923076923076923, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.25575099490229014 | F1: 0.923076923076923, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9327392558117354 | F1: 0.923076923076923, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9448206475061953 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9373065857482205 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123468, 1: 31})
> sample size | n(+): 31, n(-): 1123468 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (

> Fmax: 1.0 p_th: 0.9819972085405477 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123452, 1: 47})
> sample size | n(+): 47, n(-): 1123452 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 47, n(-): 1123452
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8381543003235148 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.22013633205723898 | F1: 0.9473684210526316, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.2871689925774292 | F1: 0.9473684210526316, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9839628228949316 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9834980632347801 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123493, 1: 6})
> sample size | n(+): 6, n(-): 1123493 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243),

> Fmax: 1.0 p_th: 0.7053606297698373 | F1: 0.9696969696969697, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7126823839571412 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9411764705882353 p_th: 0.7306167414025817 | F1: 0.9411764705882353, AUC: 0.9813829787234043
> average: 0.9882352941176471, std: 0.023529411764705885
(encode_labels) sample size: Counter({0: 1123491, 1: 8})
> sample size | n(+): 8, n(-): 1123491 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 8, n(-): 1123491
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.7174925351276429 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.2945941526927452 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.8695314013893174 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7404799118014905 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.3569653921937001 | 

> sample size (after downsampling majority) | n(+): 37, n(-): 1123462
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9170401583687293 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.7463159346702868 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.6620163615064041 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8920685727516038 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7381714540423623 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123313, 1: 186})
> sample size | n(+): 186, n(-): 1123313 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 186, n(-): 1123313
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.7691036302962143 | F1: 0.9736842105263158, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.8523448531823486 | F1: 0.9736842105263158, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.73894

> Fmax: 1.0 p_th: 0.9926472675865339 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123473, 1: 26})
> sample size | n(+): 26, n(-): 1123473 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 26, n(-): 1123473
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8169862487309625 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.947796166073319 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9593047350483093 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7570250574645498 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9636864418745451 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123473, 1: 26})
> sample size | n(+): 26, n(-): 1123473 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243

> Fmax: 1.0 p_th: 0.9575245468676439 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.959101333626427 | F1: 0.923076923076923, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8365275883482873 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.3145309671258981 | F1: 0.923076923076923, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123488, 1: 11})
> sample size | n(+): 11, n(-): 1123488 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 11, n(-): 1123488
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.4925970055564295 | F1: 0.6666666666666666, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.5793282131851517 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9326554032342204 | F1: 0.8, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.23193310302916295 | F1: 0.8, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.92903074

> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7241592392866785 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123465, 1: 34})
> sample size | n(+): 34, n(-): 1123465 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 34, n(-): 1123465
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9834918339782857 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9650110024839116 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9644681668374455 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8822233165705382 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9838999923141383 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample si

> Fmax: 1.0 p_th: 0.9169940799350652 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9836879995011605 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7699450364230778 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9879037613486786 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123487, 1: 12})
> sample size | n(+): 12, n(-): 1123487 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 12, n(-): 1123487
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9078999367918298 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.32154816798010954 | F1: 0.5, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9657162084204646 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.885138597563111 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9094137340530344 | F1: 1.0, AUC: 1.0
> average: 1.0

> sample size (after downsampling majority) | n(+): 5, n(-): 1123494
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.39050688274040096 | F1: 0.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.8369377582434885 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9250181852400188 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9051543386497021 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.809781117837687 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123492, 1: 7})
> sample size | n(+): 7, n(-): 1123492 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After m

... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 11, n(-): 1123488
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9805360131942112 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9815722195370635 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9248802593931181 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8546929176156914 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9846473737198169 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123450, 1: 49})
> sample size | n(+): 49, n(-): 1123450 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 49, n(-): 1123450
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.7138153565990354 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.653923681420922 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 

> sample size (after downsampling majority) | n(+): 47, n(-): 1123452
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9301976655305011 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9523809523809523 p_th: 0.6826460098129701 | F1: 0.9523809523809523, AUC: 0.9964285714285714
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.37367812695905195 | F1: 0.9473684210526316, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9078942307963142 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9473684210526316 p_th: 0.7934781580230653 | F1: 0.9473684210526316, AUC: 0.9920634920634921
> average: 0.9799498746867169, std: 0.024607393902022205
(encode_labels) sample size: Counter({0: 1123483, 1: 16})
> sample size | n(+): 16, n(-): 1123483 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 16, n(-): 1123483
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9392011910921484 | F1: 1.0, AUC: 1.0
> 1 of 

> sample size (after downsampling majority) | n(+): 20, n(-): 1123479
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.15627756091349854 | F1: 0.8571428571428571, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9283606197828163 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.981280334272925 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.888888888888889 p_th: 0.045321566972579254 | F1: 0.8571428571428571, AUC: 0.9791666666666666
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9264068734132379 | F1: 1.0, AUC: 1.0
> average: 0.9777777777777779, std: 0.04444444444444442
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123495, 1: 4})
> sample size | n(+): 4, n(-): 1123495 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,

> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123490, 1: 9})
> sample size | n(+): 9, n(-): 1123490 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 9, n(-): 1123490
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9559300491379857 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9787641055199164 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9664196503904764 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9674979210741125 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.1583470052205944 | F1: 0.6666666666666666, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123491, 1: 8})
> sample size | n(+): 8, n(-

> sample size (after downsampling majority) | n(+): 17, n(-): 1123482
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.909325843649936 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9624421071395381 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9580036686655786 | F1: 0.888888888888889, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.5178986778731617 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.48599028567682623 | F1: 0.8, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123489, 1: 10})
> sample size | n(+): 10, n(-): 1123489 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 10, n(-): 1123489
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9800056285311545 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9498324937674094 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9096572353579668 | F1: 0

> sample size (after downsampling majority) | n(+): 342, n(-): 1123157
> 0 of KFold 5
> Fmax: 0.9927007299270074 p_th: 0.9591770755767282 | F1: 0.9927007299270074, AUC: 0.9999286122215877
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.6881770072553737 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9857142857142858 p_th: 0.8252018121327569 | F1: 0.9857142857142858, AUC: 0.9959703075291623
> 3 of KFold 5
> Fmax: 0.9927007299270074 p_th: 0.9303409596053545 | F1: 0.9927007299270074, AUC: 0.9994978479196557
> 4 of KFold 5
> Fmax: 0.9855072463768115 p_th: 0.5419152709989812 | F1: 0.9855072463768115, AUC: 0.9964849354375896
> average: 0.9913245983890224, std: 0.005373402309034959
(encode_labels) sample size: Counter({0: 1123494, 1: 5})
> sample size | n(+): 5, n(-): 1123494 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 5, n(-): 1123494
> 0 of 

> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123491, 1: 8})
> sample size | n(+): 8, n(-): 1123491 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 8, n(-): 1123491
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.979734854401544 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9706506501411197 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9689868230298072 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9718917890230732 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9628039082151721 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123492, 1: 7})
> sample size | n(+): 7, n(-): 1123492 | dim

> Fmax: 1.0 p_th: 0.9492702008403238 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9645980562395438 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9727234967352667 | F1: 1.0, AUC: 1.0
> average: 0.9333333333333332, std: 0.13333333333333336
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123490, 1: 9})
> sample size | n(+): 9, n(-): 1123490 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 9, n(-): 1123490
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8452938651975156 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
>

> Fmax: 1.0 p_th: 0.9696434466765311 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9730596761847926 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9691342814932852 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9717093676919754 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123491, 1: 8})
> sample size | n(+): 8, n(-): 1123491 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 8, n(-): 1123491
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9718745491181027 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9278368889801881 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.979595823640631 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9706735277878902 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9535303753751172 | F1: 1.0, AUC: 1.0
> average: 1.0, st

> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123494, 1: 5})
> sample size | n(+): 5, n(-): 1123494 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 5, n(-): 1123494
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.972167852438757 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9687830529525427 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9625982335775377 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9689890509603497 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9689661005524158 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-)

> sample size (after downsampling majority) | n(+): 7, n(-): 1123492
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8896649831955605 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.3071645574589783 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9172153702337944 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8263783872082021 | F1: 0.6666666666666666, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9386913885179402 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123495, 1: 4})
> sample size | n(+): 4, n(-): 1123495 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123494, 1: 5})
> sample size | n(+): 5, n(-): 1123494 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labe

> Fmax: 1.0 p_th: 0.4533838378306051 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.273443127867973 | F1: 0.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9706674561087019 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9767101216325818 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size:

> Fmax: 1.0 p_th: 0.9051750161854194 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9608233943983274 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123495, 1: 4})
> sample size | n(+): 4, n(-): 1123495 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123488, 1: 11})
> sample size | n(+): 11, n(-): 1123488 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, la

> sample size | n(+): 6, n(-): 1123493 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 6, n(-): 1123493
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8256335636505185 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9639120568669253 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.2540609070963435 | F1: 0.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7691902177130951 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9729902230808904 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) s

(encode_labels) sample size: Counter({0: 1123495, 1: 4})
> sample size | n(+): 4, n(-): 1123495 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123472, 1: 27})
> sample size | n(+): 27, n(-): 1123472 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 27, n(-): 1123472
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8720922046377925 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.3634553236309839 | F1: 0.9090909090909091, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.967652621031126 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9499831927576528 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.87709

> sample size | n(+): 4, n(-): 1123495 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123481, 1: 18})
> sample size | n(+): 18, n(-): 1123481 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 18, n(-): 1123481
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9202635664719878 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.955051204645977 | F1: 0.888888888888889, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9110232188787155 | F1: 1.0, AUC: 1.0
> 3 o

... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 13, n(-): 1123486
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9697680480412231 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9147434507597384 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9333092417621922 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9777229028462994 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9777064461738552 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123485, 1: 14})
> sample size | n(+): 14, n(-): 1123485 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (11

> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123494, 1: 5})
> sample size | n(+): 5, n(-): 1123494 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 5, n(-): 1123494
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.20029711494443284 | F1: 0.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9420820781825324 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.7258347455288303 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9438000872844835 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
>

... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 11, n(-): 1123488
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.5726806898024087 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9038985502674145 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.8 p_th: 0.41298017315396945 | F1: 0.5, AUC: 0.9285714285714286
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8006750584885524 | F1: 0.8571428571428571, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.5993142433604921 | F1: 1.0, AUC: 1.0
> average: 0.96, std: 0.07999999999999999
(encode_labels) sample size: Counter({0: 1123489, 1: 10})
> sample size | n(+): 10, n(-): 1123489 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 10, n(-): 1123489
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.857640981593104 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.966045110962

> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123496, 1: 3})
> sample size | n(+): 3, n(-): 1123496 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1123492, 1: 7})
> sample size | n(+): 7, n(-): 1123492 | dim(y_encoded): (1123499,)
(balance_b

(encode_labels) sample size: Counter({0: 1123495, 1: 4})
> sample size | n(+): 4, n(-): 1123495 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123487, 1: 12})
> sample size | n(+): 12, n(-): 1123487 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 12, n(-): 1123487
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9729316785092583 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.8951380063927971 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.8320158914887106 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8774068446763696 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.8262875770506514 | 

(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123495, 1: 4})
> sample size | n(+): 4, n(-): 1123495 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123492, 1: 7})
> sample size | n(+): 7, n(-): 1123492 | dim(y_encoded): (1123499,)
(balance_by_downsampling) dim(X): (1123499, 243), nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)
> sample size (after downsampling majority) | n(+): 7, n(-): 1123492
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.8600280846302669 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.6216474987248386 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9

> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123497, 1: 2})
> sample size | n(+): 2, n(-): 1123497 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1123498, 1: 1})
> sample size | n(+): 1, n(-): 1123498 | dim(y_encoded): (1123499,)
> (positive) sample size too small, n=1
(save) Saving performance dataframe to:
/Users/barnett/Documents/work/loinc_predictor/result/performance-hepatitis-c.csv
 ... #


KeyError: 'code'

### Visualize Results

In [None]:
"""

Memo
---- 
1. performance plot

   perplot: https://pypi.org/project/perfplot/
"""
import seaborn as sns
import matplotlib.pyplot as plt
from analyzer import load_performance

sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 20))
sns.set_color_codes("pastel")

#---------------------------------------------

# load performance data
cohort = 'hepatitis-c'
df_perf = load_performance(input_dir='result', cohort=cohort)
print("> dim(performance matrix): {}".format(df_perf.shape))

# sort ~ performance scores 
# df_perf = df_perf.sort_values(by=['mean', ], ascending=False)

header = ['code', 'mean', 'std', 'n_pos']
codes = df_perf['code']
n_codes = len(codes)
scores = df_perf['mean']

# some statistics
score_high = 0.90
score_low = 0.50

codes_low_sz = df_perf.loc[df_perf['mean'] < 0]['code']
codes_scored = df_perf.loc[df_perf['mean'] >= 0]['code']
codes_high_score = df_perf.loc[df_perf['mean'] >= score_high]['code']
assert n_codes == len(codes_low_sz) + len(codes_scored)

print("1. Total number of codes: {} | n(low_sample): {}, n(scored):{}, n(high scored):{}".format(n_codes, 
   len(codes_low_sz), len(codes_scored), len(codes_high_score)))
r_scored = len(codes_scored)/(n_codes+0.0)
rh = len(codes_high_score)/(n_codes+0.0)
print("2. Fraction of scored codes: {}".format(r_scored))
print("3. Fraction of highly scored codes: {}".format(rh))

# Effective performance dataframe, ruling out those codes without scores (due to low sample sizes)
df_eff = df_perf.loc[df_perf['mean'] >= 0.0]

n_offset = 25
df_topn = df_eff.sort_values(['mean', ], ascending=False).head(n_offset)
df_botn = df_eff.sort_values(['mean', ], ascending=True).head(n_offset)
# print(df_botn)

# codes = [str(c) for c in df_botn['code'].values]
# print('lower codes: {}'.format(codes))
# scores = df_botn['mean'].values
# print('scores: {}'.format(scores))

# top n + bottom n
dfe = pd.concat([df_topn, df_botn], ignore_index=True)
dfe.sort_values(by=['mean', ], ascending=False, inplace=True)
codes = [str(c) for c in dfe['code'].values]
scores = dfe['mean'].values
# print('lower(n)+higher codes(n): {}'.format(codes))
# print('scores: {}'.format(scores))
print(dfe)

# sns.barplot(x="total", y="abbrev", data=crashes,
#             label="Total", color="b")

# --------------------
# ax = sns.barplot(x='mean', y='code', data=df_botn)
# print("-------------------------\n\n")
# print("> dtype: {}".format(df_botn.dtypes))
# print(df_botn.head(10))

# dfe = dfe[['mean', 'code']]
# dfe.plot(kind='bar')

sns.barplot(x='mean', y='code', data=dfe, order=dfe['code'], # order has to be specified; even if already sorted!!!
            label="LOINC", color="b", orient='h')

# ax = sns.barplot(x='mean', y='code', data=df)

# ax.set_xlabel('Fmax Score')
# ax.set_ylabel('LOINC')
