In [1]:
import pandas as pd 
from pandas import DataFrame, Series
import os, sys, re
import numpy as np

from decimal import Decimal
from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')  # action='once'

%matplotlib inline

## Utility Functions

In [2]:
from analyzer import interpret

def col_values(df, col='age', n=10):
    df_subset = df.sample(n=n, random_state=1) # for each column, sample n values (usually n=1)
    return df_subset[col].values

def summary(df, n=1): 
    msg = ""
    msg += "> sample sizes: {}\n".format(df.shape[0])
    msg += "> n(features):  {}\n".format(df.shape[1])
    # msg += "> list of features:\n{}\n".format(df.columns.values)
    print(msg)

    interpret(df, n=n, verbose=True)
def show_dict(adict, topn=-1, by='', header=[], n_samples=-1, ascending=False, print_=False): 
    # print(adict)
    
    # convert to two-column dataframe format 
    if not header: 
        header = ['key', 'value']
    else: 
        assert len(header) == 2
    D = {h:[] for h in header}
    for k, v in adict.items(): 
        D[header[0]].append(k)
        D[header[1]].append(v)
    
    df = DataFrame(D, columns=header)
    msg = ''
    if topn > 0: 
        assert by in df.columns
        df = df.sort_values([by, ], ascending=ascending)
        msg = tabulate(df[:topn], headers='keys', tablefmt='psql')
    else: 
        if n_samples < 0: 
            msg = tabulate(df, headers='keys', tablefmt='psql')
        else: 
            n = min(df.shape[0], n_samples)
            msg = tabulate(df.sample(n=n), headers='keys', tablefmt='psql')
    if print_: print(msg)
    return msg

### Determine and Retrieve Patient Cohort(s)

In [3]:
"""
1. Find rows whose column match a substring 
   https://davidhamann.de/2017/06/26/pandas-select-elements-by-string/
   
   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html

"""
import json 
from cohort_search import gen_code_set, gen_query_str

# use module cohort_search to generate the desired query strings 
# run CohortRetrieval on Databricks (https://dbc-25924283-13f6.cloud.databricks.com/#notebook/1971793/command/1973735)

### Load Base/Positive Dataset

In [4]:
from analyzer import load_data, save_data, load_performance, stratify
# from transformer import canonicalize  # ... obsolete
import loinc as lc
# from loinc import canonicalize

######################################
# params
cohort = domain = 'hepatitis-c'
token_default = 'unknown'
col_target = 'test_result_loinc_code'
######################################

# load source data
ts0 = load_data(input_file='andromeda-pond-hepatitis-c.csv', warn_bad_lines=False)
ts0 = ts0.drop_duplicates(keep='last')  # drop duplicates 
ts0 = lc.canonicalize(ts0, col_target=col_target, token_missing=token_default)

print("> dim(ts0): {}".format(ts0.shape))  
loinc_set = codes0 = ts0[col_target].unique()
codes0_subset = np.random.choice(codes0, 30)
print("> N(loinc_set): {} | example codes (source):\n{}\n".format(len(loinc_set), list(codes0_subset)))

# stratify the data by class labels (e.g. LOINC codes)
ds = stratify(ts0, col=col_target) # code|mean|std|n_pos
Nmax0 = ds[0][1]  
# ... here we have not added extra control data yet (i.e. patients without the target disease such hepatitis C)

# compare to the previously generated performance dataframe
df_perf = load_performance(input_dir='result', cohort=cohort)
df_perf = df_perf.sort_values(by=['n_pos', ], ascending=False)
Nmax = np.max(df_perf['n_pos'].values)
Nm = np.median(df_perf['n_pos'].values)

ss_dict = dict(zip(df_perf['code'].values, df_perf['n_pos'].values))
print("> Set the baseline class sample size: Nmax: {} (Nmax0: {}, median: {})".format(Nmax, Nmax0, Nm))

topn = 10
print("> top {} sample sizes:\n{}\n".format(topn, show_dict(ss_dict, topn=topn, 
        by='n_pos', header=['code', 'n_pos'])))

######################################
"""
Conclusion 
----------

Cohort: hepatitis-c

1. max sample size with known code: 67686 => 57202
2. loinc_set: target loinc codes
   size(loinc_set): 733
3. ts0: dim=(71224, 127)


"""

(load_data) Loaded dataframe (dim=(71224, 127)) from:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c.csv

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
> dim(ts0): (71224, 127)
> N(loinc_set): 733 | example codes (source):
['108860', '28654', '122861', '18341', '461277', '108399', '139907', '58032', '505552', '58131', '486423', '103317', '823773', '192997', '533265', '20644', '6304', '602797', '98301', '111567', '104661', '397786', '32987', '505628', '223271', '19711', '62489', '771477', '17590', '218404']

> dim(performance matrix): (733, 4)
> Set the baseline class sample size: Nmax: 12033 (Nmax0: 5593, median: 7.0)
> top 10 sample sizes:
+----+---------+---------+
|    | code    |   n_pos |
|----+---------+---------|
|  0 | unknown |   12033 |
|  1 | 67686   |    5720 |
|  2 | 19752   |    4454 |
|  3 | 17426   |    4445 |
|  4 | 17517   |    4399 |
|  5 | 21600   |    4207 |
|  6 | 178616  |    3870 |
|  7 | 77

'\nConclusion \n----------\n\nCohort: hepatitis-c\n\n1. max sample size with known code: 67686 => 57202\n2. loinc_set: target loinc codes\n   size(loinc_set): 733\n3. ts0: dim=(71224, 127)\n\n\n'

### Load Control Data

In [5]:
# patients not in given domain/disease
from cohort_search import filter_by_diagnosis
from analyzer import save_data

######################################
# params
tFilter = False
tSave = True
tAddCtrl = True
output_dir = 'data'
######################################

N0 = ts0.shape[0]

# load source data
ts_ctrl = load_data(input_file='andromeda_pond-10p.csv', warn_bad_lines=False)
ts_ctrl = ts_ctrl.drop_duplicates(keep='last')  # drop duplicates
ts_ctrl = lc.canonicalize(ts_ctrl, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
print("> dim(ts_ctrl): {}".format(ts_ctrl.shape))  

if tFilter: 
    ts_ctrl = filter_by_diagnosis(ts_ctrl, condition='hepatitis c')
    nctrl = ts_ctrl.shape[0]
    print("> sample size | orig: {}, filtered(control): {}".format(n0, nctrl))

# summary(df_ctrl, n=1)
# df_ctrl.info()   

# save 
if tSave: 
    output_file = f"andromeda-pond-{cohort}-ctrl.csv" 
    save_data(ts_ctrl,  output_file=output_file, sep=',')
    
# mix-in 
if tAddCtrl: 
    Nctrl = ts_ctrl.shape[0]
    assert Nctrl > N0, f"Control data is too small | n({cohort})={N0} > n(ctrl)={Nctrl}"
    ts_ctrl = ts_ctrl.sample(n=N0, replace=False)
    
    ts0 = pd.concat([ts0, ts_ctrl], ignore_index=True)
    
assert np.sum(ts0[col_target].isnull()) == 0
print("> Adding control data | N: {} -> {}".format(N0, ts0.shape[0]))

(load_data) Loaded dataframe (dim=(2891340, 127)) from:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda_pond-10p.csv

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
> dim(ts_ctrl): (2891340, 127)
(save_data) Saved dataframe (dim=(2891340, 127)) to:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c-ctrl.csv

> Adding control data | N: 71224 -> 142448


### Define Feature Set 

Note: Subsetting columns could potentially reduce the size of the data

In [6]:
"""
Memo
----
1. medivo_test_result_type is a function of the following attributes: 
      "meta_sender_name",
      "receiving_organization_id",
      "test_order_code",
      "test_order_name",
      "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"
      
"""
from transformer import to_age
from analyzer import sample_col_values
from loinc import FeatureSet

cat_cols = ['patient_gender', 
            'patient_state',  # n_uniq=199
            'patient_bill_type',  # n_uniq=31
            'fasting',   # n_uniq=5
            
            'performing_organization_id', # n_uniq=151, m=40%+, NOT part of medivo_test_result_type
            
            'receiving_organization_id', # n_uniq=43, m=50%+, part of medivo_test_result_type
            # 'receiving_organization_name', 
            
            # 'receiving_organization_state', 
            # 'receiving_organization_zip_code', 
            
            # 'ordering_practice_lab_account_name',  # high card
            # 'ordering_practice_lab_account_number', # high card
            
            # 'ordering_practice_city', # high card 
            # 'ordering_practice_state', # high card 124? 
            
            # 'ordering_practice_zip_code', # high card,  n_uniq=79392
            # 'ordering_provider_alternate_id_type',   # n_uniq=32
            
            # 'ordering_provider_alternate_id', # n_uniq=132768
            
            # ---------------------------------
            
            'test_result_status', # n_uniq=144
            # 'test_turnaround_time', # n_uniq=417, high missing
            
            'test_order_code',  # n_uniq=27668
            'test_order_name',  # n_uniq=20039
            
            'test_result_code', # n_uniq=23731 (2771052/2891340)
            'test_result_name',  # n_uniq=15581    # <<<< 
            
            'test_result_value',  # n_uniq=35441    # <<<< 
            'test_result_range',   # n_uniq=151, mostly missing   # <<<< 
            
            'test_result_abnormal_flag',  # n_uniq=524, high missing
            
            'test_result_reference_range',  # n_uniq=5735, moderate missing
            
            'test_result_units_of_measure',  # n_uniq=669, m=40%+
            
            # 'test_result_comment_source', # mostly missing
            
            'test_result_comments',  # mostly missing > 80%   # <<<< 
            
            # 'test_priority', 
            # 'test_specimen_collection_volume',
            
            # 'test_specimen_type',  # mostly missing
            
            # 'test_specimen_source', # n_uniq=15971
            # 'test_relevant_clinical_information', # n_uniq=26/
            
            'test_cpt_code',    # n_uniq=655
            
            # 'parent_test_order_code', # n_uniq=5088
            # 'parent_test_order_name', # high missing
            
            # --- datetime ---
            # 'test_specimen_draw_datetime',  # e.g. '2019-08-07T14:47:00.000Z'
            # 'test_specimen_receipt_datetime', #  e.g. '2016-10-06T10:54:00.000Z
            
            # 'test_specimen_analysis_datetime', # high missin
            # 'test_observation_datetime', 
            
            # 'test_observation_reported_datetime', 
            
            'panel_order_code',  # n_uniq=18018
            'panel_order_name',  # n_uniq=11663
            
            # 'parent_panel_order_code', # high missing
            # 'parent_panel_order_name', # high missing
            
            # 'datetime_of_processing',  # no year e.g. 'Jun 29 14:44:25'
            
            # 'meta_ingestion_datetime',
            
            'meta_sender_name',  #  n_uniq=7, m=0 # <<< 
            'medivo_test_result_type',  # n_uniq=3493, <<<<
        
            ]

cont_cols = ['age',   # patient_gender -> age  # <<< 
     ]  

target_cols = ['test_result_loinc_code', ]

# cardinality < 100
low_card_cols = ['patient_gender', 'fasting', 'meta_sender_name' ]
high_card_cols = list(set(cat_cols)-set(low_card_cols))

target_columns = cat_cols + cont_cols + target_cols

# feature transformation
#####################################################
# to_age(ts0)
# values = sample_col_values(ts0, col='age', n=10)
# print("> age: {}".format(values))

### Balance Classes 

note: balance classes from external dataset

In [9]:
from analyzer import balance_data_incr, load_data_incr, stratify
import loinc as lc

# run here or just load the curated data generated by analyzer.t_stratify()
tLoad = False
tSave = True

ts = ts0
ds = stratify(ts, col='test_result_loinc_code', ascending=False)
print("(balance_classes) data size distribution:\n{}\n".format(ds[:25]))
ds = [(code, size) for code, size in ds if not code in lc.LoincTSet.non_codes]
max_size = ds[0][1] # ds[2][1]
codes_low_sz = set([code for code, size in ds if size < max_size])
print("(balance_classes) We have n={} with low sample size (< {})".format(len(codes_low_sz), max_size))

if tLoad: 
    input_file=f"andromeda-pond-{cohort}-balanced.csv"
    ts = load_data(input_file=input_file, warn_bad_lines=False)
    # ts = ts.drop_duplicates(keep='last')  # drop duplicates 
    # ts = lc.canonicalize(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
    assert lc.is_canonicalized(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
    
else: 
    # max_size = 1000
    input_file = f"andromeda-pond-{cohort}-loinc.csv"
    codes = set(loinc_set)
    codes_hit = set([])
    for i, tsi in enumerate(load_data_incr(input_file=input_file, chunksize=1000000, warn_bad_lines=False)): 
        N0 = ts.shape[0]
        tsi = tsi.drop_duplicates(keep='last')  # drop duplicates 
        tsi = lc.canonicalize(tsi, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
        print("[{}] Processing chunk #{} | n(ts): {}, n(tsi): {} ...".format(i, i+1, N0, tsi.shape[0]))

        ts_incr, hit, missed = balance_data_incr(df=ts, df_extern=tsi, n_samples=max_size, col=col_target)
        if not ts_incr.empty: ts = pd.concat([ts, ts_incr])

        # analysis 
        N = ts.shape[0]
        codes_hit.union(hit) # still has this many codes without a match 

        print(f"[{i}] size: {N0} -> {N}")
        ds = stratify(ts, col='test_result_loinc_code', ascending=False)
        ds = [(code, size) for code, size in ds if size < max_size]
        print("[{}] size(codes) < {} (n={}): \n{}\n".format(i, max_size, len(ds), ds[:200]))

    codes_missed = codes_low_sz - codes_hit  
    print("(t_stratify) At last, we could not find a match for n={} codes among nl={} low-sample-size labels:\n{}\n".format(
        len(codes_missed), len(codes_low_sz), codes_missed))  

    if tSave: 
        ts = ts.drop_duplicates(keep='last')  # drop duplicates 
        output_file = f"andromeda-pond-{cohort}-balanced.csv"
        save_data(ts, output_file=output_file)

"""
Output
------
   ts: balanced training data; well... as balaned as possible because external data may not be able to supply sufficient 
       data for a subset of the classes/loincs
"""

(balance_classes) data size distribution:
[('unknown', 11899), ('67686', 5708), ('17426', 4520), ('17517', 4384), ('19752', 4346), ('21600', 4223), ('486431', 3770), ('7187', 3761), ('178616', 3759), ('30973', 3743), ('7773', 3706), ('19208', 3673), ('66902', 3287), ('28233', 3072), ('7310', 2989), ('7112', 2823), ('7138', 2715), ('110114', 2571), ('7708', 2463), ('7518', 2431), ('25718', 2324), ('45443', 2308), ('20933', 2189), ('134577', 2178), ('30940', 2153)]

(balance_classes) We have n=731 with low sample size (< 5708)
(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[0] Processing chunk #1 | n(ts): 142448, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=476 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 476 common codes from the df_extern:
['264747', '81

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[2] Processing chunk #3 | n(ts): 528991, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=480 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 480 common codes from the df_extern:
['825141', '81174', '93187', '52449', '218404', '62489', '329987', '182626', '385187', '28852', '27318', '139527', '203943', 'other', '20958', '30494', '51300', '95976', '297713', '45518']

(balance_data_incr) Added n=108 cases to code=122358
(balance_data_incr) Added n=5 cases to code=143081
(balance_data_incr) Added n=1090 cases to code=191239
(balance_data_incr) Added n=11 cases to code=205211
(balance_data_incr) Added n=122 cases to code=22848
(balance_data_incr) Added n=15 cases to code=25395
(balance_data_incr) Added n=584 cases to c

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[4] Processing chunk #5 | n(ts): 703154, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=481 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 481 common codes from the df_extern:
['825141', '81174', '93187', '52449', '218404', '62489', '329987', '182626', '385187', '28852', '27318', '139527', '203943', 'other', '20958', '30494', '51300', '95976', '297713', '45518']

(balance_data_incr) Added n=357 cases to code=128413
(balance_data_incr) Added n=2 cases to code=143149
(balance_data_incr) Added n=1144 cases to code=191239
(balance_data_incr) Added n=19 cases to code=204966
(balance_data_incr) Added n=961 cases to code=25320
(balance_data_incr) Added n=614 cases to code=28654
(balance_data_incr) Added n=79 cases to c

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[6] Processing chunk #7 | n(ts): 818458, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=475 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 475 common codes from the df_extern:
['81174', '93187', '52449', '218404', '62489', '329987', '182626', '385187', '28852', '27318', '139527', '203943', 'other', '20958', '30494', '51300', '95976', '297713', '45518', '110387']

(balance_data_incr) Added n=8 cases to code=122861
(balance_data_incr) Added n=1 cases to code=143149
(balance_data_incr) Added n=28 cases to code=17442
(balance_data_incr) Added n=330 cases to code=206243
(balance_data_incr) Added n=183 cases to code=237610
(balance_data_incr) Added n=20 cases to code=264507
(balance_data_incr) Added n=27 cases to code

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[8] Processing chunk #9 | n(ts): 903361, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=481 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 481 common codes from the df_extern:
['825141', '81174', '93187', '52449', '218404', '62489', '329987', '182626', '385187', '28852', '27318', '139527', '203943', 'other', '20958', '30494', '51300', '95976', '297713', '45518']

(balance_data_incr) Added n=7 cases to code=115808
(balance_data_incr) Added n=15 cases to code=142778
(balance_data_incr) Added n=108 cases to code=172841
(balance_data_incr) Added n=176 cases to code=18846
(balance_data_incr) Added n=634 cases to code=28654
(balance_data_incr) Added n=20 cases to code=303610
(balance_data_incr) Added n=28 cases to cod

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[10] Processing chunk #11 | n(ts): 972922, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=476 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 476 common codes from the df_extern:
['81174', '93187', '52449', '218404', '62489', '329987', '182626', '385187', '28852', '27318', '139527', '203943', 'other', '30494', '20958', '51300', '95976', '297713', '45518', '110387']

(balance_data_incr) Added n=352 cases to code=128413
(balance_data_incr) Added n=4 cases to code=143388
(balance_data_incr) Added n=162 cases to code=17541
(balance_data_incr) Added n=33 cases to code=191411
(balance_data_incr) Added n=35 cases to code=20644
(balance_data_incr) Added n=155 cases to code=237610
(balance_data_incr) Added n=14 cases to c

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[12] Processing chunk #13 | n(ts): 1033823, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=476 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 476 common codes from the df_extern:
['825141', '264747', '81174', '93187', '52449', '218404', '62489', '329987', '182626', '385187', '28852', '27318', '139527', '203943', 'other', '20958', '30494', '51300', '95976', '297713']

(balance_data_incr) Added n=45 cases to code=146282
(balance_data_incr) Added n=65 cases to code=17541
(balance_data_incr) Added n=38 cases to code=191411
(balance_data_incr) Added n=315 cases to code=206243
(balance_data_incr) Added n=17 cases to code=303925
(balance_data_incr) Added n=230 cases to code=326249
(balance_data_incr) Added n=381 cases 

(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[14] Processing chunk #15 | n(ts): 1084442, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5708
(balance_data_incr) Found n=734 unique codes from source | nc=482 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 482 common codes from the df_extern:
['825141', '264747', '81174', '93187', '52449', '218404', '62489', '329987', '182626', '385187', '28852', '27318', '139527', '203943', 'other', '20958', '30494', '51300', '95976', '297713']

(balance_data_incr) Added n=16 cases to code=122861
(balance_data_incr) Added n=10 cases to code=143081
(balance_data_incr) Added n=7 cases to code=205211
(balance_data_incr) Added n=72 cases to code=22848
(balance_data_incr) Added n=18 cases to code=25395
(balance_data_incr) Added n=123 cases to code=30262
(balance_data_incr) Added n=74 cases to co

(t_stratify) At last, we could not find a match for n=731 codes among nl=731 low-sample-size labels:
{'825141', '285429', '264747', '81174', '93187', '52449', '218404', '62489', '329987', '163485', '182626', '139899', '19638', '385187', '28852', '27318', '115790', '596155', '139527', '203943', '20958', '30494', '51300', '95976', '297713', '45518', '205088', '141358', '390179', '110387', '52472', '274167', '236562', '304501', '486431', '251488', '7765', '30163', '107011', '80614', '241133', '31419', '163626', '264846', '381806', '30741', '7427', '744441', '460980', '143149', '157743', '333138', '514869', '20396', '25320', '347146', '31674', '98426', '178152', '28902', '622910', '265157', '162503', '422428', '823807', '305524', '25015', '21618', '243311', '487926', '79095', '162057', '192526', '148049', '22764', '51953', '602797', '28894', '98301', '178566', '59469', '52092', '96610', '122861', '505578', '502211', '505511', '339101', '128413', '51268', '611517', '5056360', '322156', '286

'\nOutput\n------\n   ts: balanced training data; well... as balaned as possible because external data may not be able to supply sufficient \n       data for a subset of the classes/loincs\n'

### Feature Transformation

note: patient_date_of_birth => age

In [11]:
from transformer import to_age
from analyzer import col_values
# from loinc import FeatureSet

to_age(ts)
values = col_values(ts, col='age', n=10)
print("> age: {}".format(values))

# datatime columns

> age: [64 19 26 86 59 49 44 68 56 77]


### Subset Features and Handling missing values

In [14]:
tCategorify = False
tDropHighMissing = False # drop columns with high rate of missing values
p_null = 0.9
token_default = token_missing = 'unknown'

df = ts # ... :)

# V = list(feature_lookup.keys())
V = cont_cols + cat_cols
L = target_cols
dfX = df[V]
dfy = df[L]

print("> Given features set:\n{}\n".format(V))

assert np.sum(dfy[target_cols[0]].isnull()) == 0

# drop columns/vars with too many missing values 
N = dfX.shape[0]
n_thresh = int(N * p_null)
nf0 = nf = dfX.shape[1]
fset0 = set(dfX.columns.values)

if tDropHighMissing: 
    dfX = dfX[dfX.columns[dfX.isnull().mean() < p_null]]
    fset = set(dfX.columns.values)
    nf = dfX.shape[1]
    print("> Dropped n={} features:\n{}\n".format(nf-nf0, fset0-fset))
    
fset = set(dfX.columns.values)
print("> Final feature set (nf={}):\n{}\n".format(nf, fset))

# fill in missing values (also see default_values)
dfX.fillna(value=token_default, inplace=True)
#################################################
# Convert our three categorical columns to category dtypes.

cat_cols = [cat for cat in cat_cols if cat in dfX.columns]
cont_cols = [c for c in cont_cols if c in dfX.columns]

"""
Output
------
   dfX 
   dfy
"""

> Given features set:
['age', 'patient_gender', 'patient_state', 'patient_bill_type', 'fasting', 'performing_organization_id', 'receiving_organization_id', 'test_result_status', 'test_order_code', 'test_order_name', 'test_result_code', 'test_result_name', 'test_result_value', 'test_result_range', 'test_result_abnormal_flag', 'test_result_reference_range', 'test_result_units_of_measure', 'test_result_comments', 'test_cpt_code', 'panel_order_code', 'panel_order_name', 'meta_sender_name', 'medivo_test_result_type']

> Final feature set (nf=23):
{'test_order_code', 'test_result_units_of_measure', 'test_result_status', 'test_result_value', 'patient_gender', 'test_result_abnormal_flag', 'test_cpt_code', 'age', 'test_result_code', 'test_order_name', 'panel_order_code', 'panel_order_name', 'patient_bill_type', 'performing_organization_id', 'test_result_name', 'meta_sender_name', 'patient_state', 'test_result_reference_range', 'medivo_test_result_type', 'test_result_range', 'fasting', 'test_res

'\nOutput\n------\n   dfX \n   dfy\n'

### Encode Variables

In [15]:
from transformer import encode_vars 
from loinc import FeatureSet
# high_card_cols = FeatureSet.high_card_cols
nf0 = dfX.shape[1]
dfX = encode_vars(dfX, fset=cat_cols, high_card_cols=high_card_cols)
print("> After variable encoding we have dim(dfX): {} | nf: {} -> {}".format(dfX.shape, nf0, dfX.shape[1]))
print("> New feature set:\n{}\n".format(dfX.columns))
print(df.head(10).to_string(index=False))

(encoder_vars) low card vars (n=['meta_sender_name', 'fasting', 'patient_gender']):
3
 ... high card vars (n=['panel_order_name', 'test_order_code', 'patient_bill_type', 'test_result_units_of_measure', 'test_result_status', 'test_result_value', 'performing_organization_id', 'test_result_name', 'test_result_abnormal_flag', 'test_result_comments', 'test_cpt_code', 'patient_state', 'test_result_reference_range', 'medivo_test_result_type', 'test_result_code', 'test_result_range', 'test_order_name', 'receiving_organization_id', 'panel_order_code']):
19

... transforming var: patient_gender ...
... transforming var: patient_state ...
... transforming var: patient_bill_type ...
... transforming var: fasting ...
... transforming var: performing_organization_id ...
... transforming var: receiving_organization_id ...
... transforming var: test_result_status ...
... transforming var: test_order_code ...
... transforming var: test_order_name ...
... transforming var: test_result_code ...
... trans

### Encode Labels 

In [27]:
from analyzer import encode_labels, summarize_dict, get_sample_sizes
import collections, operator

# verify
assert dfX.shape[0] == dfy.shape[0], "> dim(dfX): {} | dfy.cols: {}".format(dfX.shape, dfy.columns.values)

codebook={'pos': 1, 'neg': 0, '+': 1, '-': 0}

# choose the one with a large sample size as 'positive'
col_label = 'test_result_loinc_code' # strings

topn = 5
sizes = get_sample_sizes(dfy[col_label])
# ... sizes: (loinc) label -> sample size
# print("> n(sizes): {}".format(len(sizes)))  # 734 for cohort='hepatitis-c'

# Q: How many classes/codes have less than N instances? 
N_low = 1000
n_low = sum(1 for l, c in sizes.items() if c < N_low)
print("> Low sample size classes | n={} (< {})".format(n_low, N_low))

N_elow = 10
n_elow = sum(1 for l, c in sizes.items() if c < N_elow)
print("> Extreme low sample size classes | n={} (< {})".format(n_elow, N_elow))

# Q: How many classes/codes were able to match the most enriched class in terms of sample size (e.g. 5707)? 
eps = 10
n_matched = sum(1 for l, c in sizes.items() if c >= max_size-eps)
print("> n_matched: {} | max_size: {}".format(n_matched, max_size))

# sort by values
sizes_sorted = sorted(sizes.items(), key=operator.itemgetter(1))
summarize_dict(sizes, topn=15, sort_=True)

print("> sizes: {}".format(sizes.most_common(20)))
most_sample_sizes = sizes.most_common(topn)  # take(topn, sizes.items())
print("> Sample sizes | Top N={} codes:\n{}\n".format(topn, most_sample_sizes))
least_sample_sizes = sizes.most_common()[:-topn-1:-1]
print("> Sample sizss | Last N={} codes:\n{}\n".format(topn, least_sample_sizes))

# test
target = most_sample_sizes[0][0]
y = encode_labels(dfy, pos_label=target, codebook=codebook, verbose=1)

> Low sample size classes | n=489 (< 1000)
> Extreme low sample size classes | n=175 (< 10)
> n_matched: 121 | max_size: 5708
[356188] -> 1
[223123] -> 1
[156679] -> 1
[530170] -> 1
[264663] -> 1
[21188] -> 1
[154120] -> 1
[162511] -> 1
[196618] -> 1
[302505] -> 1
[634642] -> 1
[223198] -> 1
[451765] -> 1
[295980] -> 1
[316273] -> 1
> sizes: [('unknown', 11892), ('30973', 5707), ('20933', 5707), ('30940', 5707), ('82511', 5707), ('45484', 5707), ('7765', 5707), ('17590', 5707), ('98301', 5707), ('21618', 5707), ('29512', 5707), ('203927', 5707), ('28902', 5707), ('28688', 5707), ('7112', 5706), ('134577', 5706), ('115725', 5706), ('29868', 5706), ('59055', 5706), ('254284', 5706)]
> Sample sizes | Top N=5 codes:
[('unknown', 11892), ('30973', 5707), ('20933', 5707), ('30940', 5707), ('82511', 5707)]

> Sample sizss | Last N=5 codes:
[('550513', 1), ('206060', 1), ('64204', 1), ('80945', 1), ('162057', 1)]

(encode_labels) sample size: Counter({0: 1096047, 1: 11892})


## Initial Model Training

### Feature Selection

In [28]:
"""
Ref
---
1. pip install feature-selector

   https://github.com/WillKoehrsen/feature-selector
   
   possible dependency 
      brew install libomp
      
   <debug> 
       + RuntimeError: Python is not installed as a framework.
          > https://stackoverflow.com/questions/34977388/matplotlib-runtimeerror-python-is-not-installed-as-a-framework
   
"""
import feature_selector 

### Save a copy of encoded training set (optional)

In [None]:
tSaveEncodedTSet = True 

if tSaveEncodedTSet: 
    output_file = f"andromeda-pond-{cohort}-encoded.csv"
    save_data(ts, output_file=output_file)
    

### Model Training

In [None]:
import utils_tree, utils_sys, analyzer
import collections
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# data transformation
col = 'test_result_loinc_code'
X, y = dfX.values, dfy[col].values
print("> dim(X): {}, sample(y): {}".format(X.shape, np.random.choice(np.unique(y),20) ))

# feature scaling
scaler = MinMaxScaler() # MinMaxScaler(), StandardScaler()
X = scaler.fit_transform(X)

n_fold = 5
n_min = n_fold

# to save performance data
header = ['code', 'mean', 'std', 'n_pos']
sdict = {h:[] for h in header}
for code in loinc_set: 
    y_eff = analyzer.encode_labels(y, pos_label=code)
    
    counter = collections.Counter(y_eff)
    n_pos, n_neg = counter[codebook['pos']], counter[codebook['neg']]
    print(f"> sample size | n(+): {n_pos}, n(-): {n_neg}")
    
    if n_pos >= n_min: 
        scores = analyzer.eval_performance(X, y_eff, model=None, cv=n_fold, random_state=53, verbose=1)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print("> average: {}, std: {}".format(mean_score, std_score))
    else: 
        print("> (positive) sample size too small, n={}".format(n_pos))
        mean_score = -1 
        std_score = -1
    sdict['code'].append(code)
    sdict['mean'].append(mean_score)
    sdict['std'].append(std_score)
    sdict['n_pos'].append(n_pos)

# --------------------------------------------------
# save performance dataframe
df_perf = DataFrame(sdict, columns=header)
df_perf = df_perf.sort_values(by=['mean', ]) # ascending=False
analyzer.save_performnace(df, output_dir='result', output_file='', **kargs)

cohort = 'hepatitis-c'
output_dir = os.path.join(os.getcwd(), 'result')
output_file = f"performance-{cohort}-2.csv" 
output_path = os.path.join(output_dir, output_file)
df_perf.to_csv(output_path, sep='|', index=False, header=True)

for code, score in zip(df_perf['code'], df_perf['mean']):
    print(f"[{code}] -> {score}")

> dim(X): (1107939, 247), sample(y): ['98301' '424820' '425959' '71001' '204545' '81166' '62489' '162370'
 '21600' '274167' '297713' '505594' '490243' '183962' '38794' '162503'
 '19869' '134585' '33969' '241240']
(encode_labels) sample size: Counter({0: 1102234, 1: 5705})
> sample size | n(+): 5705, n(-): 1102234
> 0 of KFold 5
> Fmax: 0.9982502187226596 p_th: 0.7867561123109736 | F1: 0.9982502187226596, AUC: 0.999997272691423
> 1 of KFold 5
> Fmax: 0.999124343257443 p_th: 0.9112923514722256 | F1: 0.9982502187226596, AUC: 0.9999999483163098
> 2 of KFold 5
> Fmax: 0.9982502187226596 p_th: 0.978871514015395 | F1: 0.996071584460934, AUC: 0.9999976145989123
> 3 of KFold 5
> Fmax: 0.9991235758106923 p_th: 0.9814877481741727 | F1: 0.9978137297770004, AUC: 0.9999999642189836
> 4 of KFold 5
> Fmax: 0.9991235758106923 p_th: 0.9898714730940393 | F1: 0.9973776223776224, AUC: 0.9999972925574838
> average: 0.9987743864648294, std: 0.0004279812610588265
(encode_labels) sample size: Counter({0: 11076

> Fmax: 0.9995612110574814 p_th: 0.952594156688057 | F1: 0.9991228070175439, AUC: 0.9999999880625323
> average: 0.999824638181711, std: 0.0002147736251843956
(encode_labels) sample size: Counter({0: 1102232, 1: 5707})
> sample size | n(+): 5707, n(-): 1102232
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.891652826803922 | F1: 0.999562363238512, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.8806263315358833 | F1: 0.9991251093613298, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9856382907910297 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.6146183071201646 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.5598096084262801 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102238, 1: 5701})
> sample size | n(+): 5701, n(-): 1102238
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.7722632041755012 | F1: 0.9995619798510731, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9682164732092319 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9881020641440466 | F1: 0.998685

(encode_labels) sample size: Counter({0: 1102236, 1: 5703})
> sample size | n(+): 5703, n(-): 1102236
> 0 of KFold 5
> Fmax: 0.999124343257443 p_th: 0.9398305167885403 | F1: 0.9978137297770004, AUC: 0.9999992128212118
> 1 of KFold 5
> Fmax: 0.9995619798510731 p_th: 0.9792690578834592 | F1: 0.9986870897155361, AUC: 0.9999998608516032
> 2 of KFold 5
> Fmax: 0.9982486865148862 p_th: 0.9234121607727426 | F1: 0.996071584460934, AUC: 0.9999986641753909
> 3 of KFold 5
> Fmax: 0.9995612110574814 p_th: 0.9539170075289493 | F1: 0.9982471516213848, AUC: 0.9999999880625322
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9401031408308259 | F1: 0.9969392216878007, AUC: 1.0
> average: 0.9992992441361768, std: 0.0005937974633586974
(encode_labels) sample size: Counter({0: 1102235, 1: 5704})
> sample size | n(+): 5704, n(-): 1102235
> 0 of KFold 5
> Fmax: 0.9986859395532194 p_th: 0.9846783764293892 | F1: 0.9960681520314547, AUC: 0.999999892656951
> 1 of KFold 5
> Fmax: 0.9986870897155361 p_th: 0.6598212290551733 | 

> Fmax: 1.0 p_th: 0.9812099877730336 | F1: 0.9986859395532194, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9991220368744512 p_th: 0.7252300063331129 | F1: 0.9991220368744512, AUC: 0.9999187774696054
> average: 0.9996491225370286, std: 0.00042973564881704197
(encode_labels) sample size: Counter({0: 1105821, 1: 2118})
> sample size | n(+): 2118, n(-): 1105821
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9923421171117716 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.975295775406622 | F1: 0.9976470588235293, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9932126541753804 | F1: 0.9988221436984689, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9988165680473373 p_th: 0.985806769529339 | F1: 0.9988165680473373, AUC: 0.9999999251756011
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9996295151041769 | F1: 0.9941245593419507, AUC: 1.0
> average: 0.9997633136094674, std: 0.0004733727810650734
(encode_labels) sample size: Counter({0: 1102234, 1: 5705})
> sample size | n(+): 5705, n(-): 1102234
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9951332

(encode_labels) sample size: Counter({0: 1107934, 1: 5})
> sample size | n(+): 5, n(-): 1107934
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999834418453607 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999141878888383 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999627018520375 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9974170030085228 | F1: 0.6666666666666666, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999321157020272 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1105612, 1: 2327})
> sample size | n(+): 2327, n(-): 1105612
> 0 of KFold 5
> Fmax: 0.9989281886387995 p_th: 0.8509779836310797 | F1: 0.9978586723768736, AUC: 0.9999999805906871
> 1 of KFold 5
> Fmax: 0.9989281886387995 p_th: 0.40622667452881317 | F1: 0.9978540772532188, AUC: 0.9999999902953436
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9973439419558409 | F1: 0.9978540772532188, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9954352913799112 | F1: 1.0, AU

> Fmax: 1.0 p_th: 0.9986810411508216 | F1: 0.9333333333333333, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102259, 1: 5680})
> sample size | n(+): 5680, n(-): 1102259
> 0 of KFold 5
> Fmax: 0.9995596653456628 p_th: 0.969366536248451 | F1: 0.9986801583809942, AUC: 0.999999992013847
> 1 of KFold 5
> Fmax: 0.9995600527936648 p_th: 0.4480797709830669 | F1: 0.9991197183098591, AUC: 0.9999965260234861
> 2 of KFold 5
> Fmax: 0.9986813186813187 p_th: 0.9607269760947214 | F1: 0.9982425307557118, AUC: 0.999999964062312
> 3 of KFold 5
> Fmax: 0.9995596653456628 p_th: 0.9965370324134899 | F1: 0.997804128238911, AUC: 0.999999992013847
> 4 of KFold 5
> Fmax: 0.9986813186813187 p_th: 0.9218534878573094 | F1: 0.997804128238911, AUC: 0.9999999041657304
> average: 0.9992084041695255, std: 0.00043036352223163534
(encode_labels) sample size: Counter({0: 1107294, 1: 645})
> sample size | n(+): 645, n(-): 1107294
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999091048987391 | F1: 1.0, 

> Fmax: 0.9959183673469388 p_th: 0.37897801792481833 | F1: 0.9918032786885246, AUC: 0.999999962988805
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.999065247022592 | F1: 0.991869918699187, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.991869918699187 p_th: 0.1690528722917677 | F1: 0.9876543209876544, AUC: 0.999999888966415
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.980296901502531 | F1: 0.991869918699187, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9456009264090242 | F1: 0.9958847736625513, AUC: 1.0
> average: 0.9975576572092251, std: 0.0032536980663274797
(encode_labels) sample size: Counter({0: 1104672, 1: 3267})
> sample size | n(+): 3267, n(-): 1104672
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9872865608938767 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9986795440667937 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9994195882900896 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9915618734218032 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9892300072478549 | F1: 1.0, AUC: 1.0
> aver

> Fmax: 0.9991235758106924 p_th: 0.9919201716001124 | F1: 0.9969392216878007, AUC: 0.999999550357422
> 2 of KFold 5
> Fmax: 0.998683633172444 p_th: 0.790836606635974 | F1: 0.9982456140350877, AUC: 0.9999982014296883
> 3 of KFold 5
> Fmax: 0.9995615957913195 p_th: 0.984351435984577 | F1: 0.9978118161925601, AUC: 0.9999998647093129
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9069946348571712 | F1: 0.9991235758106924, AUC: 1.0
> average: 0.999386080113155, std: 0.00044740347503175723
(encode_labels) sample size: Counter({0: 1102240, 1: 5699})
> sample size | n(+): 5699, n(-): 1102240
> 0 of KFold 5
> Fmax: 0.9991228070175439 p_th: 0.9357822364302799 | F1: 0.9978118161925601, AUC: 0.9999998368553478
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9664021669364227 | F1: 0.9986859395532194, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9995615957913195 p_th: 0.6850963631008682 | F1: 0.9991235758106924, AUC: 0.9999999920417243
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9693826260379101 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p

> Fmax: 0.9991228070175439 p_th: 0.9706542659813239 | F1: 0.9986847873739588, AUC: 0.9999997455560631
> average: 0.9995615956902512, std: 0.0002773928570393085
(encode_labels) sample size: Counter({0: 1102238, 1: 5701})
> sample size | n(+): 5701, n(-): 1102238
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9943060719908821 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9939044152249269 | F1: 0.9991235758106924, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9883416007120654 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9416834333377326 | F1: 0.9995615957913195, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9972294681973054 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102236, 1: 5703})
> sample size | n(+): 5703, n(-): 1102236
> 0 of KFold 5
> Fmax: 0.9995615957913195 p_th: 0.8535278047367142 | F1: 0.9991235758106923, AUC: 0.9999540971399616
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9278094145820289 | F1: 0.9995619798510731, AUC: 1.0
> 2 of KFold 5
> 

> Fmax: 1.0 p_th: 0.9958092950163051 | F1: 0.9995615957913195, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.6425059576147263 | F1: 0.9995615957913195, AUC: 1.0
> average: 0.9999123191582638, std: 0.00017536168347218606
(encode_labels) sample size: Counter({0: 1102247, 1: 5692})
> sample size | n(+): 5692, n(-): 1102247
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9877361178619758 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.999560825647782 p_th: 0.980933918782999 | F1: 0.9991220368744512, AUC: 0.9999999840696189
> 2 of KFold 5
> Fmax: 0.9995604395604395 p_th: 0.9927176549863735 | F1: 0.9986824769433464, AUC: 0.9999999282499662
> 3 of KFold 5
> Fmax: 0.999560825647782 p_th: 0.5093303573712951 | F1: 0.999560825647782, AUC: 0.999999996013887
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9929336928014755 | F1: 1.0, AUC: 1.0
> average: 0.9997364181712006, std: 0.00021521370818462215
(encode_labels) sample size: Counter({0: 1107610, 1: 329})
> sample size | n(+): 329, n(-): 1107610
> 0 of KFold 5
> Fmax: 1.0 p_th

> Fmax: 1.0 p_th: 0.9853316615126491 | F1: 0.9991235758106924, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102236, 1: 5703})
> sample size | n(+): 5703, n(-): 1102236
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9972457282869164 | F1: 0.9995619798510731, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9995619798510731 p_th: 0.9445509083943856 | F1: 0.9995619798510731, AUC: 0.9999999960243315
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9859963810622567 | F1: 0.9995619798510731, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9961003789456518 | F1: 0.9991235758106924, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9934416513664976 | F1: 0.9995615957913195, AUC: 1.0
> average: 0.9999123959702146, std: 0.0001752080595707639
(encode_labels) sample size: Counter({0: 1102233, 1: 5706})
> sample size | n(+): 5706, n(-): 1102233
> 0 of KFold 5
> Fmax: 0.9991251093613298 p_th: 0.8513965381374803 | F1: 0.9973799126637556, AUC: 0.9999999841112513
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9519785338693205 | F1: 1

> sample size | n(+): 168, n(-): 1107771
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9521983799405198 | F1: 0.9714285714285714, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9992688871846018 | F1: 0.9444444444444444, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9850746268656716 p_th: 0.7658501623965682 | F1: 0.9850746268656716, AUC: 0.9999982742223513
> 3 of KFold 5
> Fmax: 0.9846153846153847 p_th: 0.9933311083624062 | F1: 0.955223880597015, AUC: 0.9999868696078198
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9994801898881163 | F1: 0.9705882352941176, AUC: 1.0
> average: 0.9939380022962112, std: 0.007425820802144024
(encode_labels) sample size: Counter({0: 1107863, 1: 76})
> sample size | n(+): 76, n(-): 1107863
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9991534139901973 | F1: 0.888888888888889, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9984104934342061 | F1: 0.7692307692307693, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9998671576231746 | F1: 0.967741935483871, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9655172413793104 p_th: 0.8

> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107549, 1: 390})
> sample size | n(+): 390, n(-): 1107549
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9955340649361498 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9896785347941415 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9662376808457965 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9991971400106351 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9059586908998948 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102234, 1: 5705})
> sample size | n(+): 5705, n(-): 1102234
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9995758501394698 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9996239960174385 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9996973407230653 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9986270386587348 | F1: 0.9995619798510731, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9993445710757134 | F1: 1.0, AUC: 1.0
> average: 1.

> Fmax: 1.0 p_th: 0.9985419332755734 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9970818753792325 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999422590435959 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9795918367346939 p_th: 0.9998091485980319 | F1: 0.96, AUC: 0.9999994583908793
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9958279499418634 | F1: 0.9803921568627451, AUC: 1.0
> average: 0.9959183673469388, std: 0.008163265306122458
(encode_labels) sample size: Counter({0: 1102235, 1: 5704})
> sample size | n(+): 5704, n(-): 1102235
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.997422627460551 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.7128300064698658 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9985253853621497 | F1: 0.9995619798510731, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9931537791696365 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9752917627827098 | F1: 0.9995615957913195, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter

> Fmax: 1.0 p_th: 0.9975927360157395 | F1: 1.0, AUC: 1.0
> average: 0.9999123959702146, std: 0.0001752080595707639
(encode_labels) sample size: Counter({0: 1102235, 1: 5704})
> sample size | n(+): 5704, n(-): 1102235
> 0 of KFold 5
> Fmax: 0.9995619798510731 p_th: 0.9282498519057235 | F1: 0.9978137297770004, AUC: 0.9999999960243315
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9442184287428599 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9995619798510731 p_th: 0.944690423924247 | F1: 0.9973776223776224, AUC: 0.9999999960243314
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9972648058999715 | F1: 0.9986870897155361, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9991228070175439 p_th: 0.9785432493876571 | F1: 0.9991228070175439, AUC: 0.9999999005211023
> average: 0.9996493533439381, std: 0.00032815406234417435
(encode_labels) sample size: Counter({0: 1107128, 1: 811})
> sample size | n(+): 811, n(-): 1107128
> 0 of KFold 5
> Fmax: 0.9969418960244648 p_th: 0.4540197076111494 | F1: 0.9938650306748467, AUC: 0.999999916880095

> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9973352363669055 | F1: 0.991869918699187, AUC: 1.0
> average: 0.9983471074380166, std: 0.0033057851239669312
(encode_labels) sample size: Counter({0: 1107925, 1: 14})
> sample size | n(+): 14, n(-): 1107925
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999617261424972 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999941635796762 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999947487718519 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999865888770111 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999657139743201 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107938, 1: 1})
> sample size | n(+): 1, n(-): 1107938
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1104081, 1: 3858})
> sample size | n(+): 3858, n(-): 1104081
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9921505390823087 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9771281152313869 | F1: 0.998

> Fmax: 1.0 p_th: 0.9981762415412347 | F1: 0.8333333333333333, AUC: 0.9999999999999999
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9994605019457572 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9473684210526316 p_th: 0.6831251991967814 | F1: 0.9, AUC: 0.9999467456155395
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9992827178033312 | F1: 0.9090909090909091, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7745523422439173 | F1: 1.0, AUC: 1.0
> average: 0.9894736842105264, std: 0.021052631578947347
(encode_labels) sample size: Counter({0: 1107927, 1: 12})
> sample size | n(+): 12, n(-): 1107927
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999840336299467 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999895854610764 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999902959264995 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999929484479987 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999577577021351 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0:

> Fmax: 0.9565217391304348 p_th: 0.43807889348710216 | F1: 0.9090909090909091, AUC: 0.9999995897178366
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999164434300948 | F1: 0.9565217391304348, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9966552007862681 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.7941220020852201 | F1: 1.0, AUC: 1.0
> average: 0.9817805383022774, std: 0.022352579008307717
(encode_labels) sample size: Counter({0: 1107935, 1: 4})
> sample size | n(+): 4, n(-): 1107935
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1107350, 1: 589})
> sample size | n(+): 589, n(-): 1107350
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9954609337019443 | F1: 0.9874476987447699, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9896621999909495 | F1: 0.9915966386554621, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.49605114471715783 | F1: 0.9957446808510638, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9152466231825592 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.99570815450643

> Fmax: 1.0 p_th: 0.9995191699249721 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9997314776804562 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9965360807973282 | F1: 0.9803921568627451, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9964921953164056 | F1: 0.9615384615384615, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.8504400280075078 | F1: 0.9615384615384615, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102426, 1: 5513})
> sample size | n(+): 5513, n(-): 1102426
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9949527277085718 | F1: 0.9995468962392388, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9516874910839586 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9862918738150002 | F1: 0.9990942028985508, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9896039630154113 | F1: 0.9986406887177164, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9955548175067978 | F1: 0.99909338168631, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0:

(encode_labels) sample size: Counter({0: 1107937, 1: 2})
> sample size | n(+): 2, n(-): 1107937
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1102235, 1: 5704})
> sample size | n(+): 5704, n(-): 1102235
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9978816555531892 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9785904181280464 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.997076361347916 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.994902153103373 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9977030149181078 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107935, 1: 4})
> sample size | n(+): 4, n(-): 1107935
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1107912, 1: 27})
> sample size | n(+): 27, n(-): 1107912
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9996932170208517 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9338128883449939 | F1: 1.0, AUC: 1.0
>

> average: 0.9980295446962113, std: 0.000985242594324671
(encode_labels) sample size: Counter({0: 1107909, 1: 30})
> sample size | n(+): 30, n(-): 1107909
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9996126795136431 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.999245279761594 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9998622357915019 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999761669323465 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9935448507623265 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102233, 1: 5706})
> sample size | n(+): 5706, n(-): 1102233
> 0 of KFold 5
> Fmax: 0.9986847873739588 p_th: 0.9958158062878212 | F1: 0.9978098992553658, AUC: 0.9998143479165803
> 1 of KFold 5
> Fmax: 0.9995619798510731 p_th: 0.9929015211928608 | F1: 0.999124343257443, AUC: 0.9999999761459891
> 2 of KFold 5
> Fmax: 0.9995619798510731 p_th: 0.8507411459839432 | F1: 0.9978137297770004, AUC: 0.9999999960243315
> 3 of KF

> Fmax: 1.0 p_th: 0.9992803957122446 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9992659312210191 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.9999916886725602 | F1: 0.6666666666666666, AUC: 0.9999616400027078
> average: 0.9333333333333333, std: 0.13333333333333336
(encode_labels) sample size: Counter({0: 1107523, 1: 416})
> sample size | n(+): 416, n(-): 1107523
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9956523739928929 | F1: 0.9940828402366864, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9465482023006963 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9915477127902488 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9915784577797983 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9998481130538679 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107779, 1: 160})
> sample size | n(+): 160, n(-): 1107779
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9989939642338228 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9

> Fmax: 0.9833333333333333 p_th: 0.9855643029328881 | F1: 0.9516129032258064, AUC: 0.9999972163037328
> 4 of KFold 5
> Fmax: 0.9827586206896551 p_th: 0.9986873226915107 | F1: 0.9743589743589743, AUC: 0.9999992348979567
> average: 0.9898285602961231, std: 0.008306934235415338
(encode_labels) sample size: Counter({0: 1107705, 1: 234})
> sample size | n(+): 234, n(-): 1107705
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9995513927360585 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9782608695652174 p_th: 0.9992583527927571 | F1: 0.967741935483871, AUC: 0.9999888594657135
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.999877655042586 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9894736842105264 p_th: 0.12441491672483108 | F1: 0.989247311827957, AUC: 0.9999999039609113
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9998745682108761 | F1: 1.0, AUC: 1.0
> average: 0.9935469107551487, std: 0.008662347397214436
(encode_labels) sample size: Counter({0: 1104723, 1: 3216})
> sample size | n(+): 3216, n(-): 1104723
> 0 of KFold 5
> 

> Fmax: 1.0 p_th: 0.9999722458718324 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999935801267079 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999430055748548 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107628, 1: 311})
> sample size | n(+): 311, n(-): 1107628
> 0 of KFold 5
> Fmax: 0.9130434782608696 p_th: 0.9996972552499493 | F1: 0.9064748201438849, AUC: 0.9999762828369854
> 1 of KFold 5
> Fmax: 0.9242424242424242 p_th: 0.9997058002466629 | F1: 0.9242424242424242, AUC: 0.9999745169357532
> 2 of KFold 5
> Fmax: 0.9612403100775194 p_th: 0.99937081819614 | F1: 0.9612403100775194, AUC: 0.9999927191245009
> 3 of KFold 5
> Fmax: 0.9323308270676691 p_th: 0.9995905782749682 | F1: 0.898550724637681, AUC: 0.9999880593102796
> 4 of KFold 5
> Fmax: 0.9538461538461539 p_th: 0.999027963609175 | F1: 0.9393939393939393, AUC: 0.9999878408830286
> average: 0.9369406386989272, std: 0.018054582588151815
(encode_labels) sample size: Counter

> Fmax: 1.0 p_th: 0.9986277078328315 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9993510707332901 p_th: 0.9953357177871524 | F1: 0.9993510707332901, AUC: 0.9999997239339882
> average: 0.999870214146658, std: 0.00025957170668395864
(encode_labels) sample size: Counter({0: 1106562, 1: 1377})
> sample size | n(+): 1377, n(-): 1106562
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9919534164304278 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9966074804225895 | F1: 0.9981916817359855, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9078135085980437 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9934976786436358 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9945402248702344 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107931, 1: 8})
> sample size | n(+): 8, n(-): 1107931
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.999926735094795 | F1: 0.8, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.999638891543609 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p

> Fmax: 0.6666666666666666 p_th: 0.9997500635954966 | F1: 0.3333333333333333, AUC: 0.9999842047782801
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9935152624131462 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.35359227878124855 | F1: 0.6666666666666666, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.5714285714285715 p_th: 0.11122609967091258 | F1: 0.0, AUC: 0.9999887176478551
> average: 0.8476190476190476, std: 0.18904222134551626
(encode_labels) sample size: Counter({0: 1107588, 1: 351})
> sample size | n(+): 351, n(-): 1107588
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9997139531481606 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9451417995372516 | F1: 0.9790209790209791, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9996189540517797 | F1: 0.9929078014184397, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9928057553956835 p_th: 0.9994335287873433 | F1: 0.9787234042553192, AUC: 0.9999998065288765
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9000944420326897 | F1: 0.9859154929577464, AUC: 1.0
> average: 0.9985611510791367

> Fmax: 0.9995129079396006 p_th: 0.9991186493327099 | F1: 0.9995129079396006, AUC: 0.9999999911705674
> average: 0.9998053526136857, std: 0.0002383935762875566
(encode_labels) sample size: Counter({0: 1107702, 1: 237})
> sample size | n(+): 237, n(-): 1107702
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999279814110561 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.999419399192307 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.989247311827957 p_th: 0.9999750997745822 | F1: 0.989247311827957, AUC: 0.9999980792095562
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999492281749699 | F1: 0.9894736842105264, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999351468783483 | F1: 1.0, AUC: 1.0
> average: 0.9978494623655914, std: 0.004301075268817201
(encode_labels) sample size: Counter({0: 1107922, 1: 17})
> sample size | n(+): 17, n(-): 1107922
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9982893237613641 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9998140002233378 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1

> Fmax: 1.0 p_th: 0.9999315624561315 | F1: 0.6666666666666666, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999950192910437 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1102236, 1: 5703})
> sample size | n(+): 5703, n(-): 1102236
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9931510311019259 | F1: 0.9995619798510731, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9946784128030792 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9965796432587041 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9995612110574814 p_th: 0.9410477061505339 | F1: 0.9995612110574814, AUC: 0.9999997612506455
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9985426840852923 | F1: 0.9995615957913195, AUC: 1.0
> average: 0.9999122422114963, std: 0.0001755155770074346
(encode_labels) sample size: Counter({0: 1107656, 1: 283})
> sample size | n(+): 283, n(-): 1107656
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9983059594223578 | F1: 0.9827586206896551, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9911504424778761 p_th: 0

> sample size | n(+): 20, n(-): 1107919
> 0 of KFold 5
> Fmax: 0.8571428571428571 p_th: 0.9998386984242749 | F1: 0.8571428571428571, AUC: 0.9999864611163262
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999876910947854 | F1: 0.8, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9996237621576008 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.888888888888889 p_th: 0.28646163615588816 | F1: 0.75, AUC: 0.9999988717596938
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.999922981115055 | F1: 0.888888888888889, AUC: 1.0
> average: 0.9492063492063492, std: 0.06301407378183876
(encode_labels) sample size: Counter({0: 1107930, 1: 9})
> sample size | n(+): 9, n(-): 1107930
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999927928738287 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999921422979783 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.9999610062482983 | F1: 0.6666666666666666, AUC: 0.9999526143348407
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999522876504936 | F1: 0.8, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0

(encode_labels) sample size: Counter({0: 1107936, 1: 3})
> sample size | n(+): 3, n(-): 1107936
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1102235, 1: 5704})
> sample size | n(+): 5704, n(-): 1102235
> 0 of KFold 5
> Fmax: 0.999124343257443 p_th: 0.4474939266624522 | F1: 0.9991235758106923, AUC: 0.9999999880729945
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9841903822587817 | F1: 0.9995619798510731, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9991235758106923 p_th: 0.99336794915934 | F1: 0.9991235758106923, AUC: 0.9999986800780648
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9974284476269627 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.968708337128268 | F1: 0.9995615957913195, AUC: 0.9999999999999999
> average: 0.9996495838136271, std: 0.0004291704957306238
(encode_labels) sample size: Counter({0: 1107625, 1: 314})
> sample size | n(+): 314, n(-): 1107625
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9995076065200158 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.695432

> Fmax: 1.0 p_th: 0.9996902418167196 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9904761904761905 p_th: 0.9997136996335773 | F1: 0.9904761904761905, AUC: 0.9998338351919348
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9857474769655838 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999476553467448 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999442627451663 | F1: 0.9904761904761905, AUC: 1.0
> average: 0.9980952380952381, std: 0.003809523809523796
(encode_labels) sample size: Counter({0: 1107875, 1: 64})
> sample size | n(+): 64, n(-): 1107875
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9938009389428002 | F1: 0.962962962962963, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999116074951552 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999821610775966 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9998262325957288 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9985012323942913 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 

> Fmax: 1.0 p_th: 0.8864497342628305 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999970467498791 | F1: 0.6666666666666666, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8220174380207659 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9983957914099164 | F1: 0.5, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1106570, 1: 1369})
> sample size | n(+): 1369, n(-): 1106570
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9964551956673617 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9979256365785167 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9302507282659929 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9989544635167552 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9994558776008454 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107938, 1: 1})
> sample size | n(+): 1, n(-): 1107938
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1107310, 1: 629})
> sam

> Fmax: 0.9949748743718593 p_th: 0.9998187778604783 | F1: 0.9569377990430622, AUC: 0.9999998645518698
> 4 of KFold 5
> Fmax: 0.9949748743718593 p_th: 0.9940305790181888 | F1: 0.9852216748768473, AUC: 0.9999999097012466
> average: 0.9959898997474937, std: 0.0020051436426197795
(encode_labels) sample size: Counter({0: 1107938, 1: 1})
> sample size | n(+): 1, n(-): 1107938
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1107935, 1: 4})
> sample size | n(+): 4, n(-): 1107935
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1107646, 1: 293})
> sample size | n(+): 293, n(-): 1107646
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9993340515843022 | F1: 0.9915966386554621, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.997668326118162 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9993558836940714 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.99778946000196 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9997321525803146

(encode_labels) sample size: Counter({0: 1107934, 1: 5})
> sample size | n(+): 5, n(-): 1107934
> 0 of KFold 5
> Fmax: 0.0003233629749393695 p_th: 3.3614568816127027e-05 | F1: 0.0, AUC: 0.9720967385270797
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9979794593838067 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9998972540619332 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999760239159878 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9990265388448839 | F1: 1.0, AUC: 1.0
> average: 0.8000646725949878, std: 0.3998706548100242
(encode_labels) sample size: Counter({0: 1107937, 1: 2})
> sample size | n(+): 2, n(-): 1107937
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1107930, 1: 9})
> sample size | n(+): 9, n(-): 1107930
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9947734561166672 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999924251673633 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9816856142748299 | F1: 1.0, AUC: 1.0
> 3 

> Fmax: 0.8571428571428571 p_th: 0.9999932498035374 | F1: 0.8571428571428571, AUC: 0.9971049353743953
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999922101042865 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999882456255614 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999845602330787 | F1: 1.0, AUC: 1.0
> average: 0.9492063492063492, std: 0.06301407378183875
(encode_labels) sample size: Counter({0: 1107934, 1: 5})
> sample size | n(+): 5, n(-): 1107934
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.999991805273555 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999935178159748 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999896924693213 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999889295809163 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999968391292766 | F1: 0.6666666666666666, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107563, 1: 376})
> sample size | n(+): 376, n(-): 1107563
> 0 of KFold 5
>

> Fmax: 1.0 p_th: 0.9997316160142253 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9993046706244609 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107936, 1: 3})
> sample size | n(+): 3, n(-): 1107936
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 1107938, 1: 1})
> sample size | n(+): 1, n(-): 1107938
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1107914, 1: 25})
> sample size | n(+): 25, n(-): 1107914
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9721391809933478 | F1: 0.8333333333333333, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.7055812866428387 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9994890689371227 | F1: 0.8333333333333333, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9994339514140522 | F1: 0.8333333333333333, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9979167428651772 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 11079

> Fmax: 1.0 p_th: 0.9972090763837608 | F1: 0.9995285242809996, AUC: 0.9999999999999999
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9943608365221097 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.9995280792826805 p_th: 0.9707790463393201 | F1: 0.9995280792826805, AUC: 0.9999960557783104
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9949369171617647 | F1: 0.9995285242809996, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9169136582581056 | F1: 1.0, AUC: 1.0
> average: 0.9999056158565361, std: 0.00018876828692779843
(encode_labels) sample size: Counter({0: 1107767, 1: 172})
> sample size | n(+): 172, n(-): 1107767
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9947826792122443 | F1: 0.9859154929577464, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9859154929577464 p_th: 0.9513841044935342 | F1: 0.9859154929577464, AUC: 0.9999998710407911
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999432121720923 | F1: 0.9855072463768115, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999205617328258 | F1: 0.9577464788732395, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p

> Fmax: 1.0 p_th: 0.9999907913875579 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.03225806451612903 p_th: 0.0015505842573209265 | F1: 0.0, AUC: 0.9994922941534852
> average: 0.8064516129032258, std: 0.3870967741935484
(encode_labels) sample size: Counter({0: 1107098, 1: 841})
> sample size | n(+): 841, n(-): 1107098
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9976662137322926 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9992550156198714 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9960908172381541 | F1: 0.9970326409495549, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9988759589077663 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9973453909529322 | F1: 0.9970326409495549, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1106061, 1: 1878})
> sample size | n(+): 1878, n(-): 1106061
> 0 of KFold 5
> Fmax: 0.9986684420772304 p_th: 0.7811213890655646 | F1: 0.9986684420772304, AUC: 0.9999993627976347
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9299520120414

> Fmax: 1.0 p_th: 0.9980877894569228 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9998029811610751 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107938, 1: 1})
> sample size | n(+): 1, n(-): 1107938
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1107930, 1: 9})
> sample size | n(+): 9, n(-): 1107930
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999970873865623 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999794964542058 | F1: 0.8, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999903494191082 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999953900336027 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9787314481745635 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107899, 1: 40})
> sample size | n(+): 40, n(-): 1107899
> 0 of KFold 5
> Fmax: 0.9333333333333333 p_th: 0.9999650494833183 | F1: 0.9333333333333333, AUC: 0.9996400848452027
> 1 of KFold 5

> Fmax: 1.0 p_th: 0.9992330900040582 | F1: 0.9793103448275862, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9998408220297161 | F1: 0.9861111111111112, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 1107937, 1: 2})
> sample size | n(+): 2, n(-): 1107937
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1107938, 1: 1})
> sample size | n(+): 1, n(-): 1107938
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 1107937, 1: 2})
> sample size | n(+): 2, n(-): 1107937
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 1107935, 1: 4})
> sample size | n(+): 4, n(-): 1107935
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 1104416, 1: 3523})
> sample size | n(+): 3523, n(-): 1104416
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9756375122464029 | F1: 0.9985835694050991, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9985795454545454 p_th: 0.9962728818350107 | F1: 0.99787083

### Visualize Results

In [None]:
"""

Memo
---- 
1. performance plot

   perplot: https://pypi.org/project/perfplot/
"""
import seaborn as sns
import matplotlib.pyplot as plt
from analyzer import load_performance

sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 20))
sns.set_color_codes("pastel")

#---------------------------------------------

# load performance data
cohort = 'hepatitis-c'
df_perf = load_performance(input_dir='result', cohort=cohort)
print("> dim(performance matrix): {}".format(df_perf.shape))

# sort ~ performance scores 
# df_perf = df_perf.sort_values(by=['mean', ], ascending=False)

header = ['code', 'mean', 'std', 'n_pos']
codes = df_perf['code']
n_codes = len(codes)
scores = df_perf['mean']

# some statistics
score_high = 0.90
score_low = 0.50

codes_low_sz = df_perf.loc[df_perf['mean'] < 0]['code']
codes_scored = df_perf.loc[df_perf['mean'] >= 0]['code']
codes_high_score = df_perf.loc[df_perf['mean'] >= score_high]['code']
assert n_codes == len(codes_low_sz) + len(codes_scored)

print("1. Total number of codes: {} | n(low_sample): {}, n(scored):{}, n(high scored):{}".format(n_codes, 
   len(codes_low_sz), len(codes_scored), len(codes_high_score)))
r_scored = len(codes_scored)/(n_codes+0.0)
rh = len(codes_high_score)/(n_codes+0.0)
print("2. Fraction of scored codes: {}".format(r_scored))
print("3. Fraction of highly scored codes: {}".format(rh))

# Effective performance dataframe, ruling out those codes without scores (due to low sample sizes)
df_eff = df_perf.loc[df_perf['mean'] >= 0.0]

n_offset = 25
df_topn = df_eff.sort_values(['mean', ], ascending=False).head(n_offset)
df_botn = df_eff.sort_values(['mean', ], ascending=True).head(n_offset)
# print(df_botn)

# codes = [str(c) for c in df_botn['code'].values]
# print('lower codes: {}'.format(codes))
# scores = df_botn['mean'].values
# print('scores: {}'.format(scores))

# top n + bottom n
dfe = pd.concat([df_topn, df_botn], ignore_index=True)
dfe.sort_values(by=['mean', ], ascending=False, inplace=True)
codes = [str(c) for c in dfe['code'].values]
scores = dfe['mean'].values
# print('lower(n)+higher codes(n): {}'.format(codes))
# print('scores: {}'.format(scores))
print(dfe)

# sns.barplot(x="total", y="abbrev", data=crashes,
#             label="Total", color="b")

# --------------------
# ax = sns.barplot(x='mean', y='code', data=df_botn)
# print("-------------------------\n\n")
# print("> dtype: {}".format(df_botn.dtypes))
# print(df_botn.head(10))

# dfe = dfe[['mean', 'code']]
# dfe.plot(kind='bar')

sns.barplot(x='mean', y='code', data=dfe, order=dfe['code'], # order has to be specified; even if already sorted!!!
            label="LOINC", color="b", orient='h')

# ax = sns.barplot(x='mean', y='code', data=df)

# ax.set_xlabel('Fmax Score')
# ax.set_ylabel('LOINC')
