In [1]:
import pandas as pd 
from pandas import DataFrame, Series
import os
import numpy as np

from decimal import Decimal

import warnings
warnings.filterwarnings('ignore')  # action='once'

Utility Functions
-------------------

In [2]:
from analyzer import interpret

def col_values(df, col='age', n=10):
    """Return the values of column `col` for `n` randomly sampled rows of `df`.

    The sample is drawn with a fixed seed (random_state=1), so repeated calls on
    the same frame return the same values.

    Parameters
    ----------
    df : DataFrame to sample rows from.
    col : name of the column whose values are returned.
    n : number of rows to sample (must not exceed the number of rows).

    Returns
    -------
    The sampled column's `.values`.
    """
    sampled_rows = df.sample(n=n, random_state=1)
    return sampled_rows[col].values

def summary(df, n=1): 
    """Print the row and column counts of `df`, then run `interpret` on it.

    Parameters
    ----------
    df : DataFrame to describe.
    n : forwarded to `interpret` as its `n` argument.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    header_lines = [
        "> sample sizes: {}\n".format(n_rows),
        "> n(features):  {}\n".format(n_cols),
    ]
    print("".join(header_lines))

    # Per-column report; `interpret` comes from the project's analyzer module.
    interpret(df, n=n, verbose=True)

### Define patient cohort

In [3]:
# ICD-10-CM CODE, ICD-10-CM CODE DESCRIPTION, CCSR CATEGORY, CCSR CATEGORY DESCRIPTION
"""
1. Find rows whose column match a substring 
   https://davidhamann.de/2017/06/26/pandas-select-elements-by-string/
   
   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html

"""
import json 
from cohort_search import gen_code_set, gen_query_str
    
# test 
# load_mapping()
# search_by_str(condition='hepatitis c')

# ICD-10
# codes = name_to_codes(condition='hepatitis c')
# print(codes)

# ICD-9
"""
0700 0701 0702 07020 07021 07022 07023 0703 07030 07031
          07032 07033 0704 07041 07042 07043 07044 07049 0705 07051
          07052 07053 07054 07059 0706 07070 07071 0709 07271 57140
          57141 57142 57149 5731 5732 5733 
"""
# Build the row filter for the hepatitis C cohort. Per the cell output, the result is a chain of
# `lower(<column>) like '%<code>%'` clauses joined by OR over diagnosis_codes and
# billing_diagnosis_codes. It is only printed here — presumably for use in the Databricks job
# mentioned in the next section; TODO confirm.
q = gen_query_str(condition='hepatitis c')

print("> query:\n{}\n".format(q))

# Code set for the same condition. NOTE(review): `codes` is not referenced again in the
# visible part of the notebook.
codes = gen_code_set(condition='hepatitis c', verbose=1)


(gen_query_str) condition: hepatitis c -> codes:
["07041", "07044", "07051", "07054", "07070", "07071", "v0262", "b17.10", "b17.11", "b18.2", "b19.20", "b19.21", "z22.52"]

> query:
lower(diagnosis_codes) like '%07041%' OR lower(diagnosis_codes) like '%07044%' OR lower(diagnosis_codes) like '%07051%' OR lower(diagnosis_codes) like '%07054%' OR lower(diagnosis_codes) like '%07070%' OR lower(diagnosis_codes) like '%07071%' OR lower(diagnosis_codes) like '%v0262%' OR lower(diagnosis_codes) like '%b17.10%' OR lower(diagnosis_codes) like '%b17.11%' OR lower(diagnosis_codes) like '%b18.2%' OR lower(diagnosis_codes) like '%b19.20%' OR lower(diagnosis_codes) like '%b19.21%' OR lower(diagnosis_codes) like '%z22.52%' OR lower(billing_diagnosis_codes) like '%07041%' OR lower(billing_diagnosis_codes) like '%07044%' OR lower(billing_diagnosis_codes) like '%07051%' OR lower(billing_diagnosis_codes) like '%07054%' OR lower(billing_diagnosis_codes) like '%07070%' OR lower(billing_diagnosis_codes) like

### Run LoincCodePredictor2 on Databricks ... 

In [4]:
# lower(col("diag")).rlike("?i07041|")

### Load a disease-specific dataset from Andromeda

In [5]:
# Cohort tag: embedded in the input file name below and reused for output names in later cells.
cohort = 'hepatitis-c'
input_dir = os.path.join(os.getcwd(), 'data')  # data/ is resolved against the current working directory
input_file = f"andromeda-pond-{cohort}.csv" # "andromeda_pond-10p.csv"
input_path = os.path.join(input_dir, input_file)
# error_bad_lines=False skips rows whose field count differs from the header instead of raising
# (see the "Skipping line ..." messages in the output), so the loaded frame is missing those rows.
# NOTE(review): this keyword was deprecated in pandas 1.3 and removed in 2.0 in favour of
# on_bad_lines — confirm the pinned pandas version before upgrading.
df = pd.read_csv(input_path, sep=',', header=0, index_col=None, error_bad_lines=False)

# Row/column counts followed by the per-column report, with one sample value per column.
summary(df, n=1)

df.info()   


b'Skipping line 2834: expected 127 fields, saw 128\n'
b'Skipping line 9920: expected 127 fields, saw 128\nSkipping line 10806: expected 127 fields, saw 128\n'
b'Skipping line 16962: expected 127 fields, saw 128\nSkipping line 17264: expected 127 fields, saw 128\nSkipping line 20551: expected 127 fields, saw 129\nSkipping line 23730: expected 127 fields, saw 128\n'
b'Skipping line 27323: expected 127 fields, saw 128\nSkipping line 27766: expected 127 fields, saw 128\nSkipping line 30213: expected 127 fields, saw 128\n'
b'Skipping line 33681: expected 127 fields, saw 128\nSkipping line 34672: expected 127 fields, saw 128\nSkipping line 36828: expected 127 fields, saw 129\nSkipping line 38755: expected 127 fields, saw 128\nSkipping line 38985: expected 127 fields, saw 128\n'
b'Skipping line 43025: expected 127 fields, saw 128\nSkipping line 44345: expected 127 fields, saw 128\nSkipping line 45396: expected 127 fields, saw 128\nSkipping line 46299: expected 127 fields, saw 128\nSkipping li

> sample sizes: 71224
> n(features):  127

[meta_package_key] => n_uniq=10552/(n=71224,N=71224 | r_miss=0.0%), dtype=object
  :['80a18fb2461e79c91ef6a6e2bc5cfb77']

[input_filename] => n_uniq=15473/(n=71224,N=71224 | r_miss=0.0%), dtype=object
  :['SNC_PAL_Live_ca90d28e-cf1e-44b6-aed7-1375732213cd_06302017.csv']

[patient_date_of_birth] => n_uniq=100/(n=71224,N=71224 | r_miss=0.0%), dtype=int64
  :[2015]

[patient_gender] => n_uniq=2/(n=71175,N=71224 | r_miss=0.06999999999999999%), dtype=object
  :['M']

[patient_state] => n_uniq=57/(n=69916,N=71224 | r_miss=1.8399999999999999%), dtype=object
  :['VA']

[patient_bill_type] => n_uniq=15/(n=62552,N=71224 | r_miss=12.18%), dtype=object
  :['PI']

[diagnosis_codes] => n_uniq=14770/(n=34368,N=71224 | r_miss=51.74999999999999%), dtype=object
  :['ICD9/07070^ICD9/V021^ICD9/V013^ICD9/78079']

[diagnosis_descriptions] => n_uniq=4025/(n=8391,N=71224 | r_miss=88.22%), dtype=object
  :['GASTRITIS, UNSPECIFIED, WITH BLEEDING^CHRONIC VIRAL HEPATITIS

### Load Control Data (Optional)

In [6]:
def filter_by_diagnosis(df, condition='', codes=None, col='diagnosis_codes', col2='billing_diagnosis_codes', verbose=1):
    """Return `df` without the rows whose diagnosis columns mention any of the given codes.

    A row is removed when the text of `col` or `col2` contains one of the codes as a
    literal, case-insensitive substring. Missing cells never match. `df` itself is
    not modified.

    Parameters
    ----------
    df : DataFrame holding the diagnosis columns.
    condition : if non-empty, the code set is obtained from
        `gen_code_set(condition=..., verbose=...)` and `codes` is ignored.
    codes : iterable of diagnosis codes; used only when `condition` is empty.
    col, col2 : names of the columns to search. A column absent from `df` is skipped,
        but at least one of the two must exist.
    verbose : forwarded to `gen_code_set`.

    Returns
    -------
    DataFrame containing only the non-matching rows.

    Raises
    ------
    AssertionError : if no (non-empty) code is available.
    KeyError : if neither `col` nor `col2` is a column of `df`.
    """
    if condition:
        codes = gen_code_set(condition=condition, verbose=verbose)

    # Codes are matched as plain text: a code such as 'b17.10' contains '.', which would
    # act as a wildcard if the code were used as a regular expression.
    needles = [str(code).lower() for code in ([] if codes is None else codes) if str(code)]
    assert len(needles) > 0, "filter_by_diagnosis needs a condition or a non-empty list of codes"

    present = [c for c in (col, col2) if c in df.columns]
    if not present:
        raise KeyError("none of the diagnosis columns {} found in the dataframe".format([col, col2]))

    # Positional boolean mask, so rows are selected correctly even with a non-unique index.
    hit = np.zeros(df.shape[0], dtype=bool)
    for c in present:
        # Normalise once per column: NaN -> '' (never matches), everything lower-cased.
        haystack = df[c].fillna('').astype(str).str.lower()
        for needle in needles:
            hit |= np.asarray(haystack.str.contains(needle, regex=False), dtype=bool)

    print("(filter_by_diagnosis) Found {} matching rows".format(int(hit.sum())))

    # Keep only the rows that matched none of the codes.
    return df.loc[~hit]

# get target labels
tFilter = False  # if True, pass the control frame through filter_by_diagnosis for 'hepatitis c'
tSave = False  # if True, save the control data sampled from the source data (e.g. Andromeda)

col = 'test_result_loinc_code'
# Distinct target values in the cohort frame loaded earlier. Missing codes have not been
# filled at this point, so NaN (if present) is counted as one of the labels.
loinc_set = list(df[col].unique())
print("> Found {} unique loinc labels (some of which may be noises)".format(len(loinc_set)))

# The control file is read from the same data/ directory as the cohort file.
input_dir = os.path.join(os.getcwd(), 'data')
input_file = "andromeda_pond-10p.csv"  # a random sample of rows
input_path = os.path.join(input_dir, input_file)

# As with the cohort load, rows with an unexpected field count are skipped rather than
# raising (see the "Skipping line ..." messages in the output).
df_ctrl = pd.read_csv(input_path, sep=',', header=0, index_col=None, error_bad_lines=False)
# n0: rows as loaded; nctrl: rows remaining after the optional diagnosis filter.
n0 = nctrl = df_ctrl.shape[0]

if tFilter: 
    df_ctrl = filter_by_diagnosis(df_ctrl, condition='hepatitis c')
    nctrl = df_ctrl.shape[0]
print("> sample size | orig: {}, filtered(control): {}".format(n0, nctrl))

# summary(df_ctrl, n=1)
# df_ctrl.info()   

# save 
if tSave: 
    # Written next to the inputs, named after the cohort tag.
    output_file = f"andromeda-pond-{cohort}-ctrl.csv" 
    output_path = os.path.join(input_dir, output_file)
    df_ctrl.to_csv(output_path, sep=',', index=False, header=True)



> Found 931 unique loinc labels


b'Skipping line 256: expected 127 fields, saw 128\nSkipping line 511: expected 127 fields, saw 129\nSkipping line 5917: expected 127 fields, saw 133\nSkipping line 7086: expected 127 fields, saw 128\nSkipping line 7750: expected 127 fields, saw 128\n'
b'Skipping line 10545: expected 127 fields, saw 131\nSkipping line 14112: expected 127 fields, saw 128\nSkipping line 14113: expected 127 fields, saw 128\nSkipping line 15206: expected 127 fields, saw 135\n'
b'Skipping line 20277: expected 127 fields, saw 130\n'
b'Skipping line 25255: expected 127 fields, saw 136\nSkipping line 30914: expected 127 fields, saw 164\n'
b'Skipping line 34649: expected 127 fields, saw 130\nSkipping line 37064: expected 127 fields, saw 131\nSkipping line 39673: expected 127 fields, saw 128\nSkipping line 39674: expected 127 fields, saw 128\nSkipping line 39676: expected 127 fields, saw 128\nSkipping line 40260: expected 127 fields, saw 132\nSkipping line 40291: expected 127 fields, saw 129\nSkipping line 40614:

b'Skipping line 304466: expected 127 fields, saw 128\nSkipping line 306384: expected 127 fields, saw 128\nSkipping line 306385: expected 127 fields, saw 128\nSkipping line 306387: expected 127 fields, saw 128\nSkipping line 307786: expected 127 fields, saw 128\n'
b'Skipping line 312711: expected 127 fields, saw 132\nSkipping line 313105: expected 127 fields, saw 128\nSkipping line 314345: expected 127 fields, saw 128\nSkipping line 315877: expected 127 fields, saw 129\nSkipping line 317175: expected 127 fields, saw 129\n'
b'Skipping line 323634: expected 127 fields, saw 130\nSkipping line 323650: expected 127 fields, saw 131\n'
b'Skipping line 331833: expected 127 fields, saw 148\nSkipping line 332177: expected 127 fields, saw 142\nSkipping line 334586: expected 127 fields, saw 130\n'
b'Skipping line 336654: expected 127 fields, saw 128\nSkipping line 341873: expected 127 fields, saw 128\n'
b'Skipping line 345036: expected 127 fields, saw 129\nSkipping line 346459: expected 127 fields,

b'Skipping line 580198: expected 127 fields, saw 128\nSkipping line 581088: expected 127 fields, saw 130\n'
b'Skipping line 581983: expected 127 fields, saw 131\nSkipping line 586929: expected 127 fields, saw 128\nSkipping line 586968: expected 127 fields, saw 130\n'
b'Skipping line 590629: expected 127 fields, saw 130\nSkipping line 595449: expected 127 fields, saw 130\nSkipping line 595463: expected 127 fields, saw 130\nSkipping line 596527: expected 127 fields, saw 130\n'
b'Skipping line 599747: expected 127 fields, saw 131\nSkipping line 599973: expected 127 fields, saw 141\nSkipping line 600752: expected 127 fields, saw 128\n'
b'Skipping line 606853: expected 127 fields, saw 129\nSkipping line 608275: expected 127 fields, saw 128\n'
b'Skipping line 615975: expected 127 fields, saw 129\nSkipping line 619136: expected 127 fields, saw 128\nSkipping line 620929: expected 127 fields, saw 130\n'
b'Skipping line 628790: expected 127 fields, saw 128\nSkipping line 628793: expected 127 fie

b'Skipping line 877453: expected 127 fields, saw 136\nSkipping line 880547: expected 127 fields, saw 130\nSkipping line 881191: expected 127 fields, saw 128\nSkipping line 882488: expected 127 fields, saw 128\n'
b'Skipping line 885532: expected 127 fields, saw 128\nSkipping line 888688: expected 127 fields, saw 130\nSkipping line 890608: expected 127 fields, saw 130\nSkipping line 891400: expected 127 fields, saw 128\n'
b'Skipping line 893962: expected 127 fields, saw 134\nSkipping line 894471: expected 127 fields, saw 128\nSkipping line 897155: expected 127 fields, saw 131\nSkipping line 897175: expected 127 fields, saw 129\n'
b'Skipping line 905330: expected 127 fields, saw 128\nSkipping line 907408: expected 127 fields, saw 128\nSkipping line 907851: expected 127 fields, saw 131\n'
b'Skipping line 910890: expected 127 fields, saw 132\nSkipping line 910933: expected 127 fields, saw 129\nSkipping line 911173: expected 127 fields, saw 135\nSkipping line 911180: expected 127 fields, saw

b'Skipping line 1149200: expected 127 fields, saw 128\nSkipping line 1151607: expected 127 fields, saw 128\nSkipping line 1153131: expected 127 fields, saw 129\nSkipping line 1153303: expected 127 fields, saw 135\nSkipping line 1153807: expected 127 fields, saw 134\nSkipping line 1154441: expected 127 fields, saw 130\nSkipping line 1155012: expected 127 fields, saw 128\n'
b'Skipping line 1158360: expected 127 fields, saw 130\nSkipping line 1159215: expected 127 fields, saw 128\nSkipping line 1161997: expected 127 fields, saw 129\n'
b'Skipping line 1165521: expected 127 fields, saw 131\nSkipping line 1166351: expected 127 fields, saw 128\nSkipping line 1167166: expected 127 fields, saw 134\nSkipping line 1167592: expected 127 fields, saw 148\nSkipping line 1167916: expected 127 fields, saw 138\nSkipping line 1169383: expected 127 fields, saw 130\nSkipping line 1171852: expected 127 fields, saw 128\n'
b'Skipping line 1172217: expected 127 fields, saw 130\nSkipping line 1178310: expected 

b'Skipping line 1426387: expected 127 fields, saw 128\nSkipping line 1426397: expected 127 fields, saw 132\nSkipping line 1429280: expected 127 fields, saw 131\nSkipping line 1434262: expected 127 fields, saw 130\n'
b'Skipping line 1436489: expected 127 fields, saw 128\nSkipping line 1437192: expected 127 fields, saw 133\nSkipping line 1437854: expected 127 fields, saw 128\nSkipping line 1438348: expected 127 fields, saw 130\nSkipping line 1439732: expected 127 fields, saw 130\nSkipping line 1442219: expected 127 fields, saw 129\n'
b'Skipping line 1444192: expected 127 fields, saw 128\nSkipping line 1445893: expected 127 fields, saw 129\nSkipping line 1450159: expected 127 fields, saw 130\n'
b'Skipping line 1452365: expected 127 fields, saw 128\nSkipping line 1452367: expected 127 fields, saw 128\nSkipping line 1452368: expected 127 fields, saw 128\nSkipping line 1452369: expected 127 fields, saw 128\nSkipping line 1452370: expected 127 fields, saw 128\nSkipping line 1452371: expected 

b'Skipping line 1698347: expected 127 fields, saw 130\nSkipping line 1700766: expected 127 fields, saw 143\nSkipping line 1704391: expected 127 fields, saw 134\n'
b'Skipping line 1706227: expected 127 fields, saw 130\nSkipping line 1706285: expected 127 fields, saw 130\nSkipping line 1712181: expected 127 fields, saw 128\nSkipping line 1712397: expected 127 fields, saw 130\nSkipping line 1712578: expected 127 fields, saw 134\n'
b'Skipping line 1714574: expected 127 fields, saw 128\nSkipping line 1715874: expected 127 fields, saw 129\nSkipping line 1716176: expected 127 fields, saw 131\nSkipping line 1716986: expected 127 fields, saw 128\nSkipping line 1720583: expected 127 fields, saw 130\n'
b'Skipping line 1721845: expected 127 fields, saw 138\nSkipping line 1727001: expected 127 fields, saw 128\nSkipping line 1729172: expected 127 fields, saw 129\n'
b'Skipping line 1729847: expected 127 fields, saw 130\nSkipping line 1730161: expected 127 fields, saw 128\nSkipping line 1730162: expec

b'Skipping line 1994562: expected 127 fields, saw 128\nSkipping line 1998858: expected 127 fields, saw 131\nSkipping line 1999073: expected 127 fields, saw 129\n'
b'Skipping line 2005359: expected 127 fields, saw 130\n'
b'Skipping line 2008931: expected 127 fields, saw 130\nSkipping line 2010181: expected 127 fields, saw 128\nSkipping line 2011005: expected 127 fields, saw 129\nSkipping line 2011192: expected 127 fields, saw 134\nSkipping line 2012724: expected 127 fields, saw 132\nSkipping line 2013700: expected 127 fields, saw 128\nSkipping line 2014254: expected 127 fields, saw 130\nSkipping line 2014420: expected 127 fields, saw 130\n'
b'Skipping line 2016602: expected 127 fields, saw 128\nSkipping line 2019017: expected 127 fields, saw 128\nSkipping line 2019018: expected 127 fields, saw 128\nSkipping line 2019019: expected 127 fields, saw 128\nSkipping line 2019020: expected 127 fields, saw 128\nSkipping line 2020212: expected 127 fields, saw 128\nSkipping line 2020213: expected 

b'Skipping line 2280427: expected 127 fields, saw 128\nSkipping line 2280428: expected 127 fields, saw 128\nSkipping line 2280429: expected 127 fields, saw 128\nSkipping line 2281041: expected 127 fields, saw 130\nSkipping line 2282551: expected 127 fields, saw 128\nSkipping line 2282552: expected 127 fields, saw 128\nSkipping line 2282556: expected 127 fields, saw 128\nSkipping line 2282668: expected 127 fields, saw 128\nSkipping line 2282670: expected 127 fields, saw 128\nSkipping line 2282671: expected 127 fields, saw 128\nSkipping line 2284235: expected 127 fields, saw 132\nSkipping line 2284294: expected 127 fields, saw 128\nSkipping line 2284972: expected 127 fields, saw 130\nSkipping line 2285291: expected 127 fields, saw 132\n'
b'Skipping line 2287700: expected 127 fields, saw 129\nSkipping line 2290998: expected 127 fields, saw 128\nSkipping line 2291745: expected 127 fields, saw 128\nSkipping line 2292079: expected 127 fields, saw 144\n'
b'Skipping line 2296407: expected 127 

b'Skipping line 2559847: expected 127 fields, saw 128\nSkipping line 2559848: expected 127 fields, saw 128\nSkipping line 2559849: expected 127 fields, saw 128\nSkipping line 2559850: expected 127 fields, saw 128\nSkipping line 2559851: expected 127 fields, saw 128\nSkipping line 2561384: expected 127 fields, saw 128\nSkipping line 2561413: expected 127 fields, saw 130\nSkipping line 2565166: expected 127 fields, saw 130\nSkipping line 2565198: expected 127 fields, saw 129\n'
b'Skipping line 2566270: expected 127 fields, saw 132\nSkipping line 2567912: expected 127 fields, saw 130\nSkipping line 2568102: expected 127 fields, saw 144\nSkipping line 2569523: expected 127 fields, saw 130\n'
b'Skipping line 2574653: expected 127 fields, saw 143\nSkipping line 2575429: expected 127 fields, saw 130\nSkipping line 2577944: expected 127 fields, saw 128\nSkipping line 2579050: expected 127 fields, saw 128\n'
b'Skipping line 2585041: expected 127 fields, saw 138\nSkipping line 2585513: expected 

b'Skipping line 2836429: expected 127 fields, saw 130\nSkipping line 2836540: expected 127 fields, saw 135\nSkipping line 2838034: expected 127 fields, saw 128\nSkipping line 2842677: expected 127 fields, saw 128\nSkipping line 2842695: expected 127 fields, saw 131\nSkipping line 2843575: expected 127 fields, saw 130\n'
b'Skipping line 2845399: expected 127 fields, saw 128\nSkipping line 2845400: expected 127 fields, saw 128\nSkipping line 2845401: expected 127 fields, saw 128\nSkipping line 2845796: expected 127 fields, saw 128\nSkipping line 2847493: expected 127 fields, saw 130\nSkipping line 2848922: expected 127 fields, saw 128\nSkipping line 2849326: expected 127 fields, saw 128\n'
b'Skipping line 2852507: expected 127 fields, saw 128\nSkipping line 2854649: expected 127 fields, saw 131\nSkipping line 2858257: expected 127 fields, saw 130\nSkipping line 2858882: expected 127 fields, saw 130\n'
b'Skipping line 2861895: expected 127 fields, saw 130\nSkipping line 2864322: expected 

> sample size | orig: 2891340, filtered(control): 2891340


Data Transformation 
------------------------

In [7]:
# input: df, df_ctrl

# transform features 
# patient_date_of_birth => age 
# patient_gender => numeric
"""

Ref
   1. replace column values 
        http://pytolearn.csd.auth.gr/b4-pandas/40/moddfcols.html
   2. df.apply()
        http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/apply-a-function-to-every-row-in-a-pandas-dataframe/
"""
def to_age(df, col='patient_date_of_birth', new_col='age', add_new_col=True, throw_=False, default_val=None):
    """Derive an age column (current year minus the year stored in `col`), modifying `df` in place.

    Parameters
    ----------
    df : DataFrame to modify.
    col : column holding the birth year.
    new_col : name of the column that receives the age.
    add_new_col : if True, keep `col` (with its missing values filled) and add `new_col`;
        if False, replace `col` by `new_col`.
    throw_ : if True, raise when `col` is missing; otherwise return `df` untouched.
    default_val : value used for missing birth years; defaults to the integer part of
        the column mean.

    Returns
    -------
    The same `df` object.

    Raises
    ------
    ValueError : if `col` is missing and `throw_` is True, or if a default is needed but
        the column has no usable values to average.
    """
    import datetime

    if col not in df.columns:
        if throw_:
            raise ValueError("Error: Missing {}".format(col))
        # noop
        return df

    current_year = datetime.datetime.now().year

    # The birth year is rarely NaN, but it happens. The default is only computed when
    # something actually needs filling, so an empty column does not fail on int(nan).
    birth_year = df[col]
    if birth_year.isnull().any():
        if default_val is None:
            default_val = int(birth_year.mean())
        birth_year = birth_year.fillna(default_val)

    # Compute the ages before touching the frame: the source column may be dropped below.
    ages = current_year - birth_year.astype(int)

    if add_new_col:
        df[col] = birth_year
    else:
        df.drop(col, axis=1, inplace=True)
    df[new_col] = ages
    return df

def get_eff_values(df, col=''):
    """Return the non-missing ("effective") values as a list.

    Parameters
    ----------
    df : either a DataFrame, in which case the non-null values of column `col` are
        returned, or a numpy array, in which case its non-missing entries are returned.
    col : column name; required (and checked) only for DataFrame input.

    Raises
    ------
    AssertionError : if `col` is not a column of the DataFrame, or if `df` is neither a
        DataFrame nor a numpy array.
    """
    if isinstance(df, DataFrame):
        assert col in df.columns, "Missing column {}".format(col)
        return list(df.loc[df[col].notnull(), col].values)

    # df is a numpy array
    assert isinstance(df, np.ndarray), "Expected a DataFrame or a numpy array"
    # pd.isnull also handles object arrays, on which np.isnan raises TypeError.
    return list(df[~pd.isnull(df)])

def get_sample_sizes(y, sorted_=True, col='test_result_loinc_code'): 
    """Count how many times each label occurs.

    Parameters
    ----------
    y : a DataFrame (the values of its `col` column are counted) or any iterable of
        labels such as a list or numpy array (counted as given).
    sorted_ : accepted but currently has no effect on the result.
    col : column to count when `y` is a DataFrame.

    Returns
    -------
    collections.Counter mapping label -> number of occurrences.
    """
    from collections import Counter

    labels = y[col].values if isinstance(y, DataFrame) else y
    return Counter(labels)  # label/col -> sample size

def take(n, iterable):
    """Return first n items of the iterable as a list.

    If the iterable yields fewer than `n` items, all of them are returned.
    """
    # The docstring must be the first statement of the body to be attached to the function.
    from itertools import islice
    return list(islice(iterable, n))

def dehyphenate(df, col='test_result_loinc_code'): # 'LOINC_NUM'
    """Remove every '-' from the string values of `col`, in place, and return `df`."""
    without_hyphens = df[col].str.replace('-','')
    df[col] = without_hyphens
    return df

def trim_tail(df, col='test_result_loinc_code', delimit=('.', ';')):
    """Lower-case `col` and cut off each delimiter together with the alphanumerics that follow it.

    For example '75007.Z60' becomes '75007' and '57786;' becomes '57786'. Every
    occurrence is removed, so '1.2.3' becomes '1'. The column is modified in place.

    Parameters
    ----------
    df : DataFrame to modify.
    col : name of the string column to clean.
    delimit : delimiters that start a tail; each one is matched literally. With no
        delimiters the column is only lower-cased.

    Returns
    -------
    The same `df` object.
    """
    import re

    lowered = df[col].str.lower()
    # Escape the delimiters so characters such as '.' are not treated as regex wildcards.
    markers = [re.escape(str(d)) for d in delimit if str(d)]
    if markers:
        tail_pattern = r'(?:{})[a-zA-Z0-9]*'.format('|'.join(markers))
        lowered = lowered.replace(tail_pattern, '', regex=True)
    df[col] = lowered
    return df

def replace_values(df, values=('.', ), new_value='Unknown', col='test_result_loinc_code'):
    """Lower-case `col` and replace every entry equal to one of `values` by `new_value`.

    Matching is on the whole (lower-cased) entry, not on substrings, and string targets
    are compared case-insensitively. `new_value` is written exactly as given. The
    column is modified in place; with no `values` the frame is returned untouched.

    Parameters
    ----------
    df : DataFrame to modify.
    values : entries to replace.
    new_value : replacement written for every matching entry.
    col : name of the string column to clean.

    Returns
    -------
    The same `df` object.
    """
    targets = [v.lower() if isinstance(v, str) else v for v in values]
    if not targets:
        return df

    # Lower-case once and replace all targets in a single pass. Re-lowering the column
    # for every target would also lower-case replacements written by earlier targets.
    lowered = df[col].str.lower()
    df[col] = lowered.mask(lowered.isin(targets), new_value)
    return df
    

In [8]:
# remove rows where LOINC code is a null 
import sys
from analyzer import summarize_loinc, save_data
from transformer import dehyphenate, trim_tail, replace_values

tAddCtrl = True # if True, add additional control data
tSave = True  # if True, save the final dataframe/training data prior to further transformations

N0 = df.shape[0]
col_target = col = "test_result_loinc_code"
token_default = 'unknown'
# df = df[df[col_target].notnull()]
# Rows with a missing LOINC code are kept and labelled with token_default instead of dropped.
N_eff = int(df[col_target].notnull().sum())
# Assign the result back rather than calling fillna(inplace=True) on the column selection:
# the chained in-place form is not guaranteed to write through to df, and the warning
# about it would be hidden by the notebook-wide warnings filter.
df[col_target] = df[col_target].fillna(token_default)

print("> {} out of {} columns carry effective LOINC codes (ratio: {})".format(N_eff, N0, N_eff/(N0+0.0)))
assert np.sum(df[col_target].isnull()) == 0

# standardize LOINC codes: dehyphenate, replace, trim
#########################################
# NOTE: dehyphenate / replace_values / trim_tail resolve to the versions imported from
# `transformer` by this cell's import lines, not to the same-named functions defined in the
# previous cell. They are called for their in-place effect on df (the return value is unused).

print('> dehyphenate {} ...'.format(col_target))
dehyphenate(df, col=col_target)
 
### some LOINC codes seem to be malformed (75007.Z60, 57786;)

# replace noisy values 
noisy_values = ['request', 'no loinc needed', '.', ';', 'coumt', 'unloinc']
replace_values(df, values=noisy_values, new_value=token_default, col=col_target)
trim_tail(df, col=col_target, delimit=['.', ';', ])
# Trimming can leave an empty string (e.g. a code that was only a tail); treat it as unknown.
df[col_target] = df[col_target].replace('', token_default)

loinc_set = ucodes = df[col_target].unique()

# for code in loinc_set: 
#     print("  + {}".format(code))
# sys.exit(0)
    
Nc, Nu = len(df[col_target].values), len(ucodes)
print("> Number of unique loinc codes (n={} vs N={}) => ratio: {}".format(Nu, Nc, Nu/(Nc+0.0) ))
summarize_loinc(ucodes, df=df, n=15, codebook={}) 

# add control data (optional)
#########################################

if tAddCtrl: 
    Nctrl = df_ctrl.shape[0]
    assert Nctrl > N0, f"Control data is too small | n({cohort})={N0} > n(ctrl)={Nctrl}"
    # Draw as many control rows as there are cohort rows. The seed makes a kernel
    # restart pick the same rows.
    df_ctrl = df_ctrl.sample(n=N0, replace=False, random_state=1)
    
    # replace null codes
    df_ctrl[col_target] = df_ctrl[col_target].fillna(token_default)
    
    # standardize LOINC codes
    dehyphenate(df_ctrl, col=col_target)
    replace_values(df_ctrl, values=noisy_values, new_value=token_default, col=col_target)
    trim_tail(df_ctrl, col=col_target, delimit=['.', ';', ])
    # Empty strings left by trimming must be replaced in the control frame itself;
    # cleaning df here would leave '' labels in the control rows.
    df_ctrl[col_target] = df_ctrl[col_target].replace('', token_default)
    
    # Control rows first, cohort rows second, with a fresh 0..n-1 index.
    df = pd.concat([df_ctrl, df], ignore_index=True)
    
assert np.sum(df[col_target].isnull()) == 0

print("> Adding control data | N: {} -> {}".format(N0, df.shape[0]))

# balance classes from external dataset
#########################################
# NOTE: nothing is implemented under this heading yet.


# drop duplicates
#########################################

N0 = df.shape[0]
# drop duplicates # e.g. df.drop_duplicates(subset=['launched', 'code'], keep='last')
# Rows are compared on all columns; of each group of identical rows the last one is kept.
df = df.drop_duplicates(keep='last')  
N = df.shape[0]
print(f"> Drop dup | n(before): {N0} >? n(after): {N}")

#############################

# save the final dataframe/training data prior to further transformations
# if tSave: 
#    save_data(df, cohort=cohort, verbose=1) # sep/','

### birth date to age
# values = col_values(df, col='patient_date_of_birth', n=10)
# print("> age (prior): {}".format(values))
# to_age modifies df in place (by default it adds 'age' computed from patient_date_of_birth),
# so its return value is not needed here.
to_age(df)
# Spot-check: 10 ages from the seeded row sample that col_values draws.
values = col_values(df, col='age', n=10)
print("> age: {}".format(values))
# pandas.DataFrame.select_dtypes
# df._get_numeric_data()

# df.info()
# The target column must still be present after the transformations in this cell.
assert col_target in df.columns, "Missing target {}".format(col_target)


> 69710 out of 71224 columns carry effective LOINC codes (ratio: 0.9787431202965292)
> dehyphenate test_result_loinc_code ...
> Number of unique loinc codes (n=733 vs N=71224) => ratio: 0.010291474783780749
[205070] n=5
[139527] n=37
[728626] n=71
[205211] n=3
[908947] n=1
[149591] n=2
[51912] n=1
[611517] n=93
[433045] n=1
[44982] n=6
[75007] n=16
[7096] n=14
[80994] n=1
[60855] n=2
[330514] n=1
> Adding control data | N: 71224 -> 142448
> Drop dup | n(before): 142448 >? n(after): 142366
> age: [76 43 48 61 63 74 61 62 71 88]


Define Feature Set
----------------------

In [18]:
"""
Memo
----
1. medivo_test_result_type is a function of the following attributes: 
      "meta_sender_name",
      "receiving_organization_id",
      "test_order_code",
      "test_order_name",
      "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"
      
"""

# Categorical features used for modelling. Entries that are commented out were considered
# and left out; the trailing notes record the reason (cardinality, missing rate).
cat_cols = ['patient_gender', 
            'patient_state',  # n_uniq=199
            'patient_bill_type',  # n_uniq=31
            'fasting',   # n_uniq=5
            
            'performing_organization_id', # n_uniq=151, m=40%+, NOT part of medivo_test_result_type
            
            'receiving_organization_id', # n_uniq=43, m=50%+, part of medivo_test_result_type
            # 'receiving_organization_name', 
            
            # 'receiving_organization_state', 
            # 'receiving_organization_zip_code', 
            
            # 'ordering_practice_lab_account_name',  # high card
            # 'ordering_practice_lab_account_number', # high card
            
            # 'ordering_practice_city', # high card 
            # 'ordering_practice_state', # high card 124? 
            
            # 'ordering_practice_zip_code', # high card,  n_uniq=79392
            # 'ordering_provider_alternate_id_type',   # n_uniq=32
            
            # 'ordering_provider_alternate_id', # n_uniq=132768
            
            # ---------------------------------
            
            'test_result_status', # n_uniq=144
            # 'test_turnaround_time', # n_uniq=417, high missing
            
            'test_order_code',  # n_uniq=27668
            'test_order_name',  # n_uniq=20039
            
            'test_result_code', # n_uniq=23731 (2771052/2891340)
            'test_result_name',  # n_uniq=15581    # <<<< 
            
            'test_result_value',  # n_uniq=35441    # <<<< 
            'test_result_range',   # n_uniq=151, mostly missing   # <<<< 
            
            'test_result_abnormal_flag',  # n_uniq=524, high missing
            
            'test_result_reference_range',  # n_uniq=5735, moderate missing
            
            'test_result_units_of_measure',  # n_uniq=669, m=40%+
            
            # 'test_result_comment_source', # mostly missing
            
            'test_result_comments',  # mostly missing > 80%   # <<<< 
            
            # 'test_priority', 
            # 'test_specimen_collection_volume',
            
            # 'test_specimen_type',  # mostly missing
            
            # 'test_specimen_source', # n_uniq=15971
            # 'test_relevant_clinical_information', # n_uniq=26/
            
            'test_cpt_code',    # n_uniq=655
            
            # 'parent_test_order_code', # n_uniq=5088
            # 'parent_test_order_name', # high missing
            
            # --- datetime ---
            # 'test_specimen_draw_datetime',  # e.g. '2019-08-07T14:47:00.000Z'
            # 'test_specimen_receipt_datetime', #  e.g. '2016-10-06T10:54:00.000Z
            
            # 'test_specimen_analysis_datetime', # high missing
            # 'test_observation_datetime', 
            
            # 'test_observation_reported_datetime', 
            
            'panel_order_code',  # n_uniq=18018
            'panel_order_name',  # n_uniq=11663
            
            # 'parent_panel_order_code', # high missing
            # 'parent_panel_order_name', # high missing
            
            # 'datetime_of_processing',  # no year e.g. 'Jun 29 14:44:25'
            
            # 'meta_ingestion_datetime',
            
            'meta_sender_name',  #  n_uniq=7, m=0 # <<< 
            #'meta_sender_source',  # n_uniq=2
            # 'meta_sender_type',    # n_uniq=2
            # 'meta_sender_dataset',  # n_uniq=1
            
            'medivo_test_result_type',  # n_uniq=3493, <<<<
        
            ]

# Continuous features.
cont_cols = ['age',   # derived from patient_date_of_birth by to_age  # <<< 
     ]  

target_cols = ['test_result_loinc_code', ]

# cardinality < 100
low_card_cols = ['patient_gender', 'fasting', 'meta_sender_name' ]
# Every categorical column that is not low-cardinality, in cat_cols order. A set
# difference would give an order that can change between kernel sessions.
high_card_cols = [c for c in cat_cols if c not in low_card_cols]

# Maps a feature name to a coarse type tag; the tags used are 'numeric', 'cat', 'high_card',
# 'str', 'cond' and 'target'. The "nv" notes presumably give the number of distinct values
# seen when the entry was written — TODO confirm.
# NOTE(review): in the visible part of the notebook this dict is referenced only from
# commented-out code, and it covers just a subset of the columns listed in cat_cols.
feature_lookup = {
   'age': 'numeric', 
    
   'patient_gender': 'cat', 
   'patient_state': 'cat',   # nv: 199
    
   'test_result_status': 'cat',  # nv: 8 
 
   'test_order_code': 'high_card',    # nv=3024, high card  
   'test_result_name': 'str',   # nv=2085, high card
   'test_result_value': 'str',   # should be numeric but may contain strings as well (e.g. NOT APPLICABLE)
    
   # ... nv=3049, high card
  
   # 'test_result_range': 'cond',  # condition, high_miss
   # 'test_result_abnormal_flag': 'cond'   # condition, high_miss
   
   # 'test_result_reference_range': 'cond',   # condition
  
   'test_result_units_of_measure': 'cond',  # condition, high_miss 
    
   'test_result_loinc_code': 'target',
  
   'parent_test_order_code': 'high_card',  # high_miss, high_card
  
   # >>> timestamps
  
   # 'test_specimen_draw_datetime': 'time',  # timestamp; other time stamps: high_miss
   # 'test_observation_datetime': 'time',    # timestamp, high miss
   # 'test_observation_reported_datetime': 'time',   
  
   # 'datetime_of_processing': 'time'
  
   'panel_order_code': 'high_card',   # high_card
   # 'panel_order_name': 'str',   
  
   # >>> ID
   # 'token1': 'id'
  
   # >>> Meta 
   # 'meta_package_source_file': 'meta',   # meta data, str
}


In [10]:
# feature engineering utilities
import collections
from analyzer import save_data

def analyze_values(df, cols=None, topn=10): 
    """Print the most common values of each column together with the column's mode.

    Parameters
    ----------
    df : DataFrame to inspect.
    cols : columns to report on; all columns of `df` when None or empty.
    topn : how many of the most common values (with their counts) to print per column.

    For a column without any values the list is empty and the mode is printed as None.
    """
    if cols is None or len(cols) == 0:
        cols = df.columns.values
    for i, col in enumerate(cols): 
        # (value, count) pairs, most frequent first.
        mv = collections.Counter(df[col]).most_common(topn)
        mode = mv[0][0] if mv else None
        print("[{}] name: {} => values: \n{}\n ... mode: {}".format(i+1, col, mv, mode))
        
tSave = True  # if True, save the final dataframe/training data prior to further transformations

############################################################################
# datetime attributes
# df['test_specimen_draw_datetime'] = pd.to_datetime(df['test_specimen_draw_datetime'])

############################################################################

# Print the value distribution of every categorical feature.
analyze_values(df, cols=cat_cols, topn=10)  # topn: most common n feature values (and their counts)

# set default values for each cols
# NOTE(review): default_values is not referenced anywhere else in the visible part of the
# notebook (the later missing-value fill uses the literal 'unknown') — confirm it is still needed.
default_values = {'patient_gender': 'U', 
                  'patient_state': 'OO', 
                  'patient_bill_type': 'OO', 
                  'fasting': 'U', 
                   'test_result_status': 'O',
                   }

# save the final dataframe/training data prior to further transformations (
# ... dropping cols with missing values, encoding)
if tSave: 
    # Only the modelling columns are written: continuous + categorical features, then the target.
    V = cont_cols + cat_cols
    L = target_cols    
    save_data(df[V+L], cohort=cohort, verbose=1) # sep/','

[1] name: patient_gender => values: 
[('M', 73442), ('F', 68705), (nan, 219)]
 ... mode: M
[2] name: patient_state => values: 
[('CA', 18128), ('TX', 17501), ('FL', 16775), ('NY', 11341), (nan, 7817), ('NJ', 6739), ('PA', 6739), ('GA', 5308), ('MD', 4947), ('NC', 3427)]
 ... mode: CA
[3] name: patient_bill_type => values: 
[('PRIVATE INSURANCE', 35214), ('PI', 28038), (nan, 24006), ('CM', 15773), ('MEDICARE', 10709), ('MC', 7936), ('MANAGE CARE FFS', 5741), ('CLIENT', 5148), ('MEDICAID', 4244), ('MD', 2956)]
 ... mode: PRIVATE INSURANCE
[4] name: fasting => values: 
[(nan, 114256), ('Y', 9735), ('NOT FASTING', 7419), ('U', 4071), ('N', 4001), ('FASTING', 2884)]
 ... mode: nan
[5] name: test_result_status => values: 
[('F', 81549), (nan, 56772), ('Ordered', 1914), ('C', 1759), ('Final', 201), ('X', 133), ('Cancelled', 36), ('527116304030', 1), ('718868718781', 1)]
 ... mode: F
[6] name: test_order_code => values: 
[('322000', 14205), ('2600010231', 13001), ('005009', 11928), ('550123', 

In [11]:
# Look into the feature lookup file: variable_analysis.txt
"""
pass #1 

Note: 
1. 

"""
from tabulate import tabulate

tCategorify = False
tDropHighMissing = False # drop columns with high rate of missing values
p_null = 0.9

# print(df.head(10))
# print( tabulate(df.head(10), headers='keys', tablefmt='psql') )

# V = list(feature_lookup.keys())
V = cont_cols + cat_cols
L = target_cols
dfX = df[V]
dfy = df[L]

print("> Given features set:\n{}\n".format(V))

# null loinc is possible due to the control data
assert np.sum(dfy[target_cols[0]].isnull()) == 0
# dfy[col_target].fillna(value='Unknown', inplace=True)

# filter features based on types (e.g. removing meta features)
ftype_drop_list = ['meta', 'target']
ftype_target = ['target']

# fset = [var for var in V if not feature_lookup[var] in ftype_drop_list]
# target = [var for var in V if feature_lookup[var] in ftype_target]
# assert len(target) > 0
#################################################

# drop columns/vars with too many missing values 
N = dfX.shape[0]
n_thresh = int(N * p_null)
nf0 = dfX.shape[1]
fset0 = set(dfX.columns.values)

if tDropHighMissing: 
    # dfX = dfX.dropna(thresh=n_thresh, axis=1)
    print("> before dropping vars (nf={}):\n{}\n".format(nf0, dfX.columns.values))
    dfX = dfX[dfX.columns[dfX.isnull().mean() < p_null]]
    nf = dfX.shape[1]
    print("> Dropped n={} features:\n{}\n".format(nf-nf0, fset0-fset))
    print("> AFTER dropping vars (nf={}):\n{}\n".format(nf, dfX.columns.values))

fset = set(dfX.columns.values)

# fill in missing values (also see default_values)
dfX.fillna(value='unknown', inplace=True)
#################################################
# Convert our three categorical columns to category dtypes.

cat_cols = [cat for cat in cat_cols if cat in dfX.columns]
cont_cols = [c for c in cont_cols if c in dfX.columns]

# output: dfX, dfy

if tCategorify: 

    for cat in cat_cols:
        if cat in dfX.columns: 
            dfX[cat] = dfX[cat].astype('category')
    print("> dtypes: {}".format(dfX.dtypes))

    cat_cols_subset = ['test_result_status', ]
    for col in cat_cols_subset:
        print(f"> [{col}] =>\n  {dfX[col].head().cat.codes}\n")

    X_cats = np.stack([dfX[col].cat.codes.values for col in cat_cols], 1)
    X_conts = np.stack([dfX[col].values for col in cont_cols], 1)

    nf = X_cats.shape[1] + X_conts.shape[1]
    print("> Using n={} features ... #".format(nf))

    # to (X, y)
    X = np.hstack([X_cats, X_conts])
    y = dfy[target_cols[0]].values
    print('> dim(X): {}, dim(y): {}'.format(X.shape, y.shape))

    # non-consistent data types that require further processing
    # e.g. result value type 
    col = "test_result_value"
    print(df[df[col].notnull()][col].dtype)
    # print(list(df[df[col].notnull()][col].values)) # mostly numeric but with some presence of strings ... 
    # ... (e.g. NON-REACTIVE, Clear, Negative, Positive, ...)

    # col = 'patient_gender'
    # print(list(df[col].values))

    # dfX.head(10)

> Given features set:
['age', 'patient_gender', 'patient_state', 'patient_bill_type', 'fasting', 'test_result_status', 'test_order_code', 'test_order_name', 'test_result_code', 'test_result_name', 'test_result_value', 'test_result_range', 'test_result_abnormal_flag', 'test_result_reference_range', 'test_result_units_of_measure', 'test_result_comments', 'test_cpt_code', 'panel_order_code', 'panel_order_name', 'medivo_test_result_type']



Encode Variables
--------------------

In [12]:
# input (X, y) or df , fset, target
"""
With the variable set determined, now proceed to encode them with appropriate numeric values. 

Dependency
   pip install category_encoders

Ref
   http://contrib.scikit-learn.org/categorical-encoding/index.html
   
   
Memo
----
1. Use df.map()

   df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
   
"""
# dir(ce.OrdinalEncoder(cols=[col, ]))

def encode_vars_via_lookup(fset, feature_lookup, dfX=None, dfy=None): 
    """Encode the variables in `fset` according to their types declared in `feature_lookup`.

    The type of each variable selects the encoder:
      - 'ord'               -> ce.OrdinalEncoder
      - 'cat'               -> ce.OneHotEncoder
      - 'str' / 'high_card' -> ce.BinaryEncoder (categorical with high cardinality)
      - anything else       -> treated as numeric and left untouched

    Parameters
    ----------
    fset : iterable of str
        Names of the variables (columns of `dfX`) to consider.
    feature_lookup : dict
        Maps a variable name to its type; variables missing from the mapping default
        to 'numeric'.
    dfX : DataFrame
        Feature frame to encode. It used to be read implicitly from the enclosing scope,
        which could never work: the name is assigned inside this function, so it is local
        and every call failed with UnboundLocalError. It is now an explicit argument.
    dfy : optional
        Target values handed to the encoders' `fit_transform`; omitted when None.

    Returns
    -------
    DataFrame
        Result of applying the encoders one after another to `dfX`.

    Raises
    ------
    ValueError
        If `dfX` is not given.
    """
    if dfX is None:
        raise ValueError("encode_vars_via_lookup() needs the feature frame: pass dfX=...")

    import category_encoders as ce
    
    for var in fset: 

        encoder = None
        vtype = feature_lookup.get(var, 'numeric') # default numeric
        
        if vtype == 'ord': 
            encoder = ce.OrdinalEncoder(cols=[var, ])
        elif vtype == 'cat': 
            encoder = ce.OneHotEncoder(cols=[var, ])
        elif vtype in ('str', 'high_card'): # high_card: categorical but with high cardinality
            encoder = ce.BinaryEncoder(cols=[var, ])  # ... or use ce.HashingEncoder()
        else: 
            # assuming that the var is numeric
            pass 

        # data imputation [todo]

        if encoder is not None: 
            # Same calling convention as encode_vars(): the target is passed only when given.
            if dfy is not None: 
                dfX = encoder.fit_transform(dfX, dfy)
            else: 
                dfX = encoder.fit_transform(dfX)
    return dfX
            
def encode_vars(dfX, fset, high_card_cols=[], dfy=None):
    """Encode every variable in `fset`, one encoder per variable, applied in sequence.

    Variables listed in `high_card_cols` (categorical with high cardinality) are
    binary-encoded; all remaining variables of `fset` are one-hot encoded.

    Parameters
    ----------
    dfX : DataFrame
        Feature frame; each encoder is fit on and applied to the current frame.
    fset : list of str
        Names of the variables to encode.
    high_card_cols : list of str
        Subset of variables that get the binary encoder instead of one-hot.
    dfy : optional
        Target values forwarded to `fit_transform` when given.

    Returns
    -------
    DataFrame
        The frame after all encoders have been applied.

    Raises
    ------
    AssertionError
        If no variable was encoded (e.g. `fset` is empty).
    """
    import category_encoders as ce

    binary_vars = set(high_card_cols)
    onehot_vars = set(fset) - binary_vars

    encoded_count = 0
    for name in fset:
        if name in onehot_vars:
            enc = ce.OneHotEncoder(cols=[name])
        elif name in binary_vars:  # categorical but with high cardinality
            enc = ce.BinaryEncoder(cols=[name])  # or use ce.HashingEncoder()
        else:
            continue

        # data imputation

        encoded_count += 1
        print(f'... transforming var: {name} ...')
        fit_args = (dfX,) if dfy is None else (dfX, dfy)
        dfX = enc.fit_transform(*fit_args)

    assert encoded_count > 0
    return dfX
    
    # debug
#     if var in ['patient_gender', ]: 
#         print("> var: {} | type: {}".format(var, feature_lookup[var]))
#         print("> before:\n{}\n".format(dfX[var].head(100)))

#         dfX = encoder.fit_transform(dfX, dfy)

#         print("> after:\n{}\n".format(dfX.head(100)))
# Report the state of the feature frame before encoding.
print(
    "> prior to encoding | dim(dfX): {}".format(dfX.shape),
    "> n_cat_cols: {}".format(len(cat_cols)),
    "> high card vars:\n{}\n".format(high_card_cols),
    sep="\n",
)

# dfX['target'] = dfy
# Encode all categorical columns; the high-cardinality ones get the binary encoder.
# Note that dfX is replaced by its encoded version, so this cell is not idempotent.
dfX = encode_vars(dfX, fset=cat_cols, high_card_cols=high_card_cols)

# Report the resulting shape and columns, then preview the first rows.
print(
    "> After variable encoding we have dim(dfX): {}".format(dfX.shape),
    "> new feature set:\n{}\n".format(dfX.columns),
    sep="\n",
)
dfX.head(10)

> prior to encoding | dim(dfX): (142366, 20)
> n_cat_cols: 19
> high card vars:
['panel_order_code', 'test_cpt_code', 'test_result_range', 'medivo_test_result_type', 'test_result_status', 'patient_bill_type', 'test_order_code', 'test_result_name', 'test_result_units_of_measure', 'test_result_reference_range', 'test_result_abnormal_flag', 'test_result_comments', 'test_result_code', 'test_result_value', 'patient_state', 'panel_order_name', 'test_order_name']

... transforming var: patient_gender ...
... transforming var: patient_state ...
... transforming var: patient_bill_type ...
... transforming var: fasting ...
... transforming var: test_result_status ...
... transforming var: test_order_code ...
... transforming var: test_order_name ...
... transforming var: test_result_code ...
... transforming var: test_result_name ...
... transforming var: test_result_value ...
... transforming var: test_result_range ...
... transforming var: test_result_abnormal_flag ...
... transforming var: te

Unnamed: 0,age,patient_gender_1,patient_gender_2,patient_gender_3,patient_state_0,patient_state_1,patient_state_2,patient_state_3,patient_state_4,patient_state_5,...,medivo_test_result_type_2,medivo_test_result_type_3,medivo_test_result_type_4,medivo_test_result_type_5,medivo_test_result_type_6,medivo_test_result_type_7,medivo_test_result_type_8,medivo_test_result_type_9,medivo_test_result_type_10,medivo_test_result_type_11
0,34,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,72,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,60,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,48,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,70,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,69,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
6,72,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
7,29,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
8,69,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
9,69,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


Encode Labels
------------------

In [13]:
"""

Ref
---
   1. DataFrame with if condition: https://datatofish.com/if-condition-in-pandas-dataframe/
   
   2. Sort dictionary: 
      <link> https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value?rq=1
       
      sorted_x = sorted(x.items(), key=operator.itemgetter(1))
   
"""
from analyzer import encode_labels, summarize_dict
import collections, operator

# Sanity check: features and labels must describe the same rows.
assert len(dfX) == len(dfy)
print("> dim(dfX): {} | dfy.cols: {}".format(dfX.shape, dfy.columns.values))

# Symbolic label names -> numeric class labels.
codebook = dict(zip(('pos', 'neg', '+', '-'), (1, 0, 1, 0)))

# The label with a large sample size is taken as the 'positive' class.
col_label = 'test_result_loinc_code' # strings

topn = 5
# sizes: (loinc) label -> sample size
sizes = get_sample_sizes(dfy[col_label])

# (label, size) pairs ordered by increasing sample size
sizes_sorted = sorted(sizes.items(), key=lambda pair: pair[1])
summarize_dict(sizes, topn=15, sort_=True)

top_sample_sizes = sizes.most_common(topn)  # take(topn, sizes.items())
print(
    "> sizes: {}".format(sizes.most_common(20)),
    "> Top N={} codes:\n{}\n".format(topn, top_sample_sizes),
    sep="\n",
)

# Trial run: binarize the labels, using the most frequent label as the positive class.
target = top_sample_sizes[0][0]
y = encode_labels(dfy, pos_label=target, codebook=codebook, verbose=1)


> dim(dfX): (142366, 197) | dfy.cols: ['test_result_loinc_code']
[507582] -> 1
[353888] -> 1
[728337] -> 1
[204487] -> 1
[62422] -> 1
[54049] -> 1
[130476] -> 1
[269340] -> 1
[77933] -> 1
[214411] -> 1
[292573] -> 1
[64576] -> 1
[310813] -> 1
[325621] -> 1
[152850] -> 1
> sizes: [('unknown', 12033), ('67686', 5720), ('19752', 4454), ('17426', 4445), ('17517', 4399), ('21600', 4207), ('178616', 3870), ('7773', 3698), ('7187', 3688), ('486431', 3671), ('19208', 3627), ('30973', 3619), ('66902', 3263), ('28233', 3083), ('7310', 2989), ('7112', 2749), ('7138', 2728), ('110114', 2550), ('7708', 2438), ('7518', 2405)]
> Top N=5 codes:
[('unknown', 12033), ('67686', 5720), ('19752', 4454), ('17426', 4445), ('17517', 4399)]

(encode_labels) sample size: Counter({0: 130333, 1: 12033})


Initial Model Training
-------------------------

### Feature Selection

In [14]:
"""
Ref
---
1. pip install feature-selector

   https://github.com/WillKoehrsen/feature-selector
   
   possible dependency 
      brew install libomp
      
   <debug> 
       + RuntimeError: Python is not installed as a framework.
          > https://stackoverflow.com/questions/34977388/matplotlib-runtimeerror-python-is-not-installed-as-a-framework
   
"""
# import feature_selector 


'\nRef\n---\n1. pip install feature-selector\n\n   https://github.com/WillKoehrsen/feature-selector\n   \n   possible dependency \n      brew install libomp\n      \n   <debug> \n       + RuntimeError: Python is not installed as a framework.\n          > https://stackoverflow.com/questions/34977388/matplotlib-runtimeerror-python-is-not-installed-as-a-framework\n   \n'

### Model Training

In [15]:
import utils_tree, utils_sys, analyzer
import collections
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# data transformation
# X: encoded feature matrix; y: raw (string) LOINC labels, binarized per code in the loop below.
col = 'test_result_loinc_code'
X, y = dfX.values, dfy[col].values
print("> dim(X): {}".format(X.shape))
print("> y: {}".format(y))

# feature scaling
# NOTE(review): the scaler is fit on the full matrix before cross-validation, so the held-out
# folds contribute to the scaling statistics -- confirm this is acceptable for the reported scores.
scaler = MinMaxScaler() # MinMaxScaler(), StandardScaler()
X = scaler.fit_transform(X)

n_fold = 5
n_min = n_fold   # minimum number of positive examples required to evaluate a code
header = ['code', 'mean', 'std', 'n_pos']
sdict = {h:[] for h in header}

# One-vs-rest evaluation: each code in turn is the positive class, all others are negative.
for code in loinc_set: 
    y_eff = analyzer.encode_labels(y, pos_label=code)
    
    counter = collections.Counter(y_eff)
    n_pos, n_neg = counter[codebook['pos']], counter[codebook['neg']]
    print(f"> sample size | n(+): {n_pos}, n(-): {n_neg}")
    
    if n_pos >= n_min: 
        scores = analyzer.eval_performance(X, y_eff, model=None, cv=n_fold, random_state=53, verbose=1)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print("> average: {}, std: {}".format(mean_score, std_score))
    else: 
        # Too few positives to evaluate: record -1 as a sentinel for "not scored".
        print("> (positive) sample size too small, n={}".format(n_pos))
        mean_score = -1 
        std_score = -1
    sdict['code'].append(code)
    sdict['mean'].append(mean_score)
    sdict['std'].append(std_score)
    sdict['n_pos'].append(n_pos)

# --------------------------------------------------
# save performance dataframe
df_perf = DataFrame(sdict, columns=header)
df_perf = df_perf.sort_values(by=['mean', ]) # ascending=False

cohort = 'hepatitis-c'
output_dir = os.path.join(os.getcwd(), 'result')
# Create the output directory if needed; otherwise to_csv() fails on a fresh checkout and the
# results of the whole (long) evaluation loop above are lost.
os.makedirs(output_dir, exist_ok=True)
output_file = f"performance-{cohort}.csv" 
output_path = os.path.join(output_dir, output_file)
df_perf.to_csv(output_path, sep='|', index=False, header=True)

for code, score in zip(df_perf['code'], df_perf['mean']):
    print(f"[{code}] -> {score}")


> dim(X): (142366, 197)
> y: ['265116' '21600' '151522' ... '486431' '67686' 'unknown']
(encode_labels) sample size: Counter({0: 137921, 1: 4445})
> sample size | n(+): 4445, n(-): 137921
> 0 of KFold 5
> Fmax: 0.9955156950672646 p_th: 0.6885719072452219 | F1: 0.9949579831932773, AUC: 0.9999078826402817
> 1 of KFold 5
> Fmax: 0.9977553310886644 p_th: 0.9169662198521715 | F1: 0.9971957375210321, AUC: 0.9999807928953777
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.8111425878465537 | F1: 0.998876404494382, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.996078431372549 p_th: 0.5789812124109107 | F1: 0.996078431372549, AUC: 0.9999370773621394
> 4 of KFold 5
> Fmax: 0.998876404494382 p_th: 0.8771446380784739 | F1: 0.9966367713004484, AUC: 0.9999980018086486
> average: 0.9976451724045721, std: 0.001677058829640654
(encode_labels) sample size: Counter({0: 142085, 1: 281})
> sample size | n(+): 281, n(-): 142085
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9891350079648591 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0

> average: 0.9992241368863637, std: 0.0006343431496662645
(encode_labels) sample size: Counter({0: 138739, 1: 3627})
> sample size | n(+): 3627, n(-): 138739
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9507270621452104 | F1: 0.9993117687543015, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9993108201240524 p_th: 0.5819078623416882 | F1: 0.9986225895316805, AUC: 0.9999748325245986
> 2 of KFold 5
> Fmax: 0.9993108201240524 p_th: 0.8937708500607291 | F1: 0.9979353062629044, AUC: 0.999999602332321
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9736775214454112 | F1: 0.9993108201240524, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.997932460372157 p_th: 0.9812799693087966 | F1: 0.9965588437715072, AUC: 0.999971963418226
> average: 0.9993108201240524, std: 0.0007549587284703007
(encode_labels) sample size: Counter({0: 139816, 1: 2550})
> sample size | n(+): 2550, n(-): 139816
> 0 of KFold 5
> Fmax: 0.9970674486803519 p_th: 0.7356688956720392 | F1: 0.9951219512195122, AUC: 0.9999995091728581
> 1 of KFold 5
> Fmax: 0.9980430528375733 p_t

(encode_labels) sample size: Counter({0: 141729, 1: 637})
> sample size | n(+): 637, n(-): 141729
> 0 of KFold 5
> Fmax: 0.9808429118773946 p_th: 0.9886895135107929 | F1: 0.9624060150375939, AUC: 0.9999790534819728
> 1 of KFold 5
> Fmax: 0.9844961240310077 p_th: 0.8724185032909126 | F1: 0.9694656488549618, AUC: 0.9999837388873211
> 2 of KFold 5
> Fmax: 0.9883268482490272 p_th: 0.40400056694062714 | F1: 0.9843749999999999, AUC: 0.999998333306481
> 3 of KFold 5
> Fmax: 0.9921875 p_th: 0.9905426301568176 | F1: 0.9731800766283525, AUC: 0.9999949999194432
> 4 of KFold 5
> Fmax: 0.980544747081712 p_th: 0.8128631100481778 | F1: 0.9655172413793104, AUC: 0.9999830546847547
> average: 0.9852796262478283, std: 0.004465908027419081
(encode_labels) sample size: Counter({0: 141951, 1: 415})
> sample size | n(+): 415, n(-): 141951
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9749350143767448 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9940119760479043 p_th: 0.99758560468035 | F1: 0.988095238095238, AUC: 0.999

> Fmax: 0.9880952380952381 p_th: 0.578302036560012 | F1: 0.9764705882352942, AUC: 0.9999958067059158
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9958219762729044 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9931856311824522 | F1: 0.9940119760479043, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9878048780487805 p_th: 0.9082983493404707 | F1: 0.9820359281437125, AUC: 0.9999668982375433
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9892382382519659 | F1: 1.0, AUC: 1.0
> average: 0.9951800232288036, std: 0.005903955877880985
(encode_labels) sample size: Counter({0: 141203, 1: 1163})
> sample size | n(+): 1163, n(-): 141203
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9170620597361667 | F1: 0.9957264957264957, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.95710215424813 | F1: 0.9936034115138593, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9762772719055678 | F1: 0.9914893617021276, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9978494623655914 p_th: 0.9624931986072789 | F1: 0.9914529914529915, AUC: 0.9999998473673928
> 4 of KFold 5
> F

> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9967539301931166 | F1: 0.9841269841269841, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9945945945945946 p_th: 0.9912289846620786 | F1: 0.9735449735449735, AUC: 0.999979918514637
> average: 0.9978494002023414, std: 0.0026339995169354516
(encode_labels) sample size: Counter({0: 142352, 1: 14})
> sample size | n(+): 14, n(-): 142352
> 0 of KFold 5
> Fmax: 0.8571428571428571 p_th: 0.8522838897817848 | F1: 0.8571428571428571, AUC: 0.9999882921803472
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999371087612073 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.5 p_th: 0.9998527611888222 | F1: 0.5, AUC: 0.9994614213792296
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9971982029164488 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.9999192436058656 | F1: 0.5, AUC: 0.9999297506146821
> average: 0.8047619047619048, std: 0.19541223360650659
(encode_labels) sample size: Counter({0: 142320, 1: 46})
> sample size | n(+): 46, n(-): 142320
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.547907

> Fmax: 1.0 p_th: 0.9806570921436495 | F1: 0.9090909090909091, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.8926601470670542 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.957674231368317 | F1: 0.9523809523809523, AUC: 1.0
> average: 0.9904761904761905, std: 0.019047619047619067
(encode_labels) sample size: Counter({0: 142226, 1: 140})
> sample size | n(+): 140, n(-): 142226
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9998812867873661 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9995319186853359 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9431832389144594 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9990341909239963 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9973753412608882 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 142323, 1: 43})
> sample size | n(+): 43, n(-): 142323
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.983230639297474 | F1: 0.9473684210526316, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9411764705882353 p_th: 

> Fmax: 1.0 p_th: 0.9570470940052006 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9992389823752622 | F1: 0.6666666666666666, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9841983423267805 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.999429168453836 | F1: 1.0, AUC: 1.0
> average: 0.9333333333333332, std: 0.13333333333333336
(encode_labels) sample size: Counter({0: 142341, 1: 25})
> sample size | n(+): 25, n(-): 142341
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9799369487766404 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999246620047512 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9845167089350049 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.999945084257652 | F1: 1.0, AUC: 0.9999999999999999
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9998719383480243 | F1: 1.0, AUC: 0.9999999999999999
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 142357, 1: 9})
> sample size | n(+): 9, n(-): 142357
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9921605630048627 |

> Fmax: 1.0 p_th: 0.9937006885364028 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.5633642070935678 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9997722681437016 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 142326, 1: 40})
> sample size | n(+): 40, n(-): 142326
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9997082847706488 | F1: 0.9411764705882353, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9997331611624959 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9936161081864452 | F1: 0.9411764705882353, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.9333333333333333 p_th: 0.9952130926124982 | F1: 0.7777777777777777, AUC: 0.9999824345687687
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9894392321999622 | F1: 0.888888888888889, AUC: 1.0
> average: 0.9866666666666667, std: 0.02666666666666666
(encode_labels) sample size: Counter({0: 142363, 1: 3})
> sample size | n(+): 3, n(-): 142363
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0

> average: 0.9269841269841269, std: 0.06073373482447298
(encode_labels) sample size: Counter({0: 142318, 1: 48})
> sample size | n(+): 48, n(-): 142318
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9992109032116543 | F1: 0.9090909090909091, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9523809523809523 p_th: 0.9873431540255388 | F1: 0.8695652173913044, AUC: 0.9999859471613265
> 2 of KFold 5
> Fmax: 0.9523809523809523 p_th: 0.9321001613374169 | F1: 0.8695652173913044, AUC: 0.9999964867903317
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7503475883784204 | F1: 0.9473684210526316, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.9473684210526316 p_th: 0.9113164934360268 | F1: 0.9, AUC: 0.9999960962965566
> average: 0.9704260651629072, std: 0.02421628537179181
(encode_labels) sample size: Counter({0: 142342, 1: 24})
> sample size | n(+): 24, n(-): 142342
> 0 of KFold 5
> Fmax: 0.888888888888889 p_th: 0.5494823819978559 | F1: 0.888888888888889, AUC: 0.9997119674031403
> 1 of KFold 5
> Fmax: 0.8000000000000002 p_th: 0.2708408972217722 | F1

(encode_labels) sample size: Counter({0: 142331, 1: 35})
> sample size | n(+): 35, n(-): 142331
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.999975084661321 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.923076923076923 p_th: 0.9998262872481752 | F1: 0.923076923076923, AUC: 0.9776374823097229
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999352298582106 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.999630543449133 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9996984774643323 | F1: 1.0, AUC: 1.0
> average: 0.9846153846153847, std: 0.030769230769230795
(encode_labels) sample size: Counter({0: 142362, 1: 4})
> sample size | n(+): 4, n(-): 142362
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 142334, 1: 32})
> sample size | n(+): 32, n(-): 142334
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9700208192096257 | F1: 0.9333333333333333, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.923076923076923 p_th: 0.8286508122852101 | F1: 0.923076923076923, AUC: 0.9999297432114379
> 2 of KFold 5

> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 142365, 1: 1})
> sample size | n(+): 1, n(-): 142365
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 142356, 1: 10})
> sample size | n(+): 10, n(-): 142356
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9818836777641144 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9993620012327133 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9897288760714951 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.9999769678961444 | F1: 0.6666666666666666, AUC: 0.9999473148115626
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9560262903542955 | F1: 1.0, AUC: 1.0
> average: 0.9333333333333332, std: 0.13333333333333336
(encode_labels) sample size: Counter({0: 142361, 1: 5})
> sample size | n(+): 5, n(-): 142361
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.999868490840995 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9995008377527014 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> 

> sample size | n(+): 9, n(-): 142357
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9980579320353921 | F1: 0.8, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.7976128390312089 | F1: 0.8, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9980480722950906 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9991063098246472 | F1: 0.8, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9981683569998926 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 142335, 1: 31})
> sample size | n(+): 31, n(-): 142335
> 0 of KFold 5
> Fmax: 0.923076923076923 p_th: 0.998854938548792 | F1: 0.7999999999999999, AUC: 0.9999849449738796
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9985281936383124 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.8 p_th: 0.5521712645854486 | F1: 0.8, AUC: 0.997663961780307
> 3 of KFold 5
> Fmax: 0.923076923076923 p_th: 0.9535325785884244 | F1: 0.7499999999999999, AUC: 0.9999941452676199
> 4 of KFold 5
> Fmax: 0.9090909090909091 p_th: 0.9590968247734278 | F1: 0.7692307692307692,

> Fmax: 1.0 p_th: 0.8066055617262257 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9999078200796343 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999796670900997 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.986994437229782 | F1: 0.6666666666666666, AUC: 0.9999999999999999
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999837489846234 | F1: 0.6666666666666666, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 142355, 1: 11})
> sample size | n(+): 11, n(-): 142355
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9995005968993693 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9998492979562232 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9869295376570234 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.9999637623012061 | F1: 0.6666666666666666, AUC: 0.9995082715745847
> 4 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.36480422582025634 | F1: 0.4, AUC: 0.9999648765410417
> average: 0.8666666666666666, std: 0.1

> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9989534671807051 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9998842429479489 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9998305228042308 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999077392482243 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999704931647236 | F1: 1.0, AUC: 1.0
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 142362, 1: 4})
> sample size | n(+): 4, n(-): 142362
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 142335, 1: 31})
> sample size | n(+): 31, n(-): 142335
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9725587132132828 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.9090909090909091 p_th: 0.9994551106178599 | F1: 0.9090909090909091, AUC: 0.9998302127609747
> 2 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.999725340275312 | F1: 0.6666666666666666, AUC: 0.998975421833468
> 3 of KFold 5
> Fmax: 0.9090909090909091 p_th: 0.9996051551393187 | F1: 0.555555

> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 142358, 1: 8})
> sample size | n(+): 8, n(-): 142358
> 0 of KFold 5
> Fmax: 0.8 p_th: 0.5856193341586962 | F1: 0.6666666666666666, AUC: 0.9999824388873279
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.999123262051939 | F1: 0.8, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9982269323947061 | F1: 0.8, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.4 p_th: 0.18730012424922252 | F1: 0.0, AUC: 0.9998946296231253
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9999485944092522 | F1: 0.6666666666666666, AUC: 1.0
> average: 0.8399999999999999, std: 0.233238075793812
(encode_labels) sample size: Counter({0: 142361, 1: 5})
> sample size | n(+): 5, n(-): 142361
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9943917853708018 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.918803312044074 | F1: 0.6666666666666666, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.0005346164127238706 p_th: 2.5803468171338018e-05 | F1: 0.0, AUC: 0.8686779994380444
> 3 of KFold 5
> Fmax: 0.666666

(encode_labels) sample size: Counter({0: 142359, 1: 7})
> sample size | n(+): 7, n(-): 142359
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9999318875117207 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9998282972409047 | F1: 1.0, AUC: 0.9999999999999999
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999477750126271 | F1: 1.0, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9999664360716092 | F1: 1.0, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.999959922840475 | F1: 1.0, AUC: 0.9999999999999999
> average: 1.0, std: 0.0
(encode_labels) sample size: Counter({0: 142358, 1: 8})
> sample size | n(+): 8, n(-): 142358
> 0 of KFold 5
> Fmax: 1.0 p_th: 0.9897686324503202 | F1: 1.0, AUC: 1.0
> 1 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.4346993855737485 | F1: 0.0, AUC: 0.9958028940713683
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.788883169688769 | F1: 0.8, AUC: 1.0
> 3 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.3869375620000656 | F1: 0.0, AUC: 0.9999648765410418
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9979035556540127

> Fmax: 0.8571428571428571 p_th: 0.2962061421327581 | F1: 0.6666666666666666, AUC: 0.9993853178784685
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9870317152633965 | F1: 0.8, AUC: 1.0
> 2 of KFold 5
> Fmax: 0.8571428571428571 p_th: 0.9403669806692129 | F1: 0.8571428571428571, AUC: 0.9868633649455566
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.9901247181457095 | F1: 0.8571428571428571, AUC: 1.0
> 4 of KFold 5
> Fmax: 1.0 p_th: 0.9626208076283213 | F1: 0.8571428571428571, AUC: 1.0
> average: 0.9428571428571428, std: 0.06998542122237654
(encode_labels) sample size: Counter({0: 142364, 1: 2})
> sample size | n(+): 2, n(-): 142364
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 142365, 1: 1})
> sample size | n(+): 1, n(-): 142365
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 142364, 1: 2})
> sample size | n(+): 2, n(-): 142364
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 142362, 1: 4})
> sample size | n(+):

> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 142363, 1: 3})
> sample size | n(+): 3, n(-): 142363
> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 142355, 1: 11})
> sample size | n(+): 11, n(-): 142355
> 0 of KFold 5
> Fmax: 0.5 p_th: 0.9999495290376824 | F1: 0.5, AUC: 0.9643848126163465
> 1 of KFold 5
> Fmax: 1.0 p_th: 0.9984785042644874 | F1: 1.0, AUC: 1.0
> 2 of KFold 5
> Fmax: 1.0 p_th: 0.9999417143235245 | F1: 0.8, AUC: 1.0
> 3 of KFold 5
> Fmax: 1.0 p_th: 0.7468592899609766 | F1: 0.8, AUC: 1.0
> 4 of KFold 5
> Fmax: 0.6666666666666666 p_th: 0.3434100531919855 | F1: 0.5, AUC: 0.9999648765410418
> average: 0.8333333333333334, std: 0.21081851067789195
(encode_labels) sample size: Counter({0: 142364, 1: 2})
> sample size | n(+): 2, n(-): 142364
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 142363, 1: 3})
> sample size | n(+): 3, n(-): 142363
> (positive) sample size too small, n=3
(

> (positive) sample size too small, n=3
(encode_labels) sample size: Counter({0: 142365, 1: 1})
> sample size | n(+): 1, n(-): 142365
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 142365, 1: 1})
> sample size | n(+): 1, n(-): 142365
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 142365, 1: 1})
> sample size | n(+): 1, n(-): 142365
> (positive) sample size too small, n=1
(encode_labels) sample size: Counter({0: 142364, 1: 2})
> sample size | n(+): 2, n(-): 142364
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 142362, 1: 4})
> sample size | n(+): 4, n(-): 142362
> (positive) sample size too small, n=4
(encode_labels) sample size: Counter({0: 142364, 1: 2})
> sample size | n(+): 2, n(-): 142364
> (positive) sample size too small, n=2
(encode_labels) sample size: Counter({0: 142365, 1: 1})
> sample size | n(+): 1, n(-): 142365
> (positive) sample size too small, n=1
(encode_labels) sample

[7401] -> 0.9333333333333332
[122358] -> 0.9333333333333332
[205070] -> 0.9333333333333332
[181826] -> 0.9333333333333332
[296095] -> 0.9333333333333332
[20008] -> 0.9333333333333332
[142778] -> 0.9333333333333332
[142786] -> 0.9333333333333332
[107011] -> 0.9333333333333332
[490243] -> 0.9333333333333332
[7690] -> 0.9333333333333332
[44982] -> 0.9333333333333332
[178194] -> 0.9333333333333332
[453530] -> 0.9333333333333332
[25148] -> 0.9333333333333332
[223149] -> 0.9333333333333332
[823799] -> 0.9333333333333332
[384453] -> 0.9333333333333332
[243311] -> 0.9333333333333332
[303859] -> 0.9333333333333332
[139519] -> 0.9333333333333333
[143149] -> 0.9333333333333333
[823773] -> 0.9333333333333333
[148049] -> 0.9333333333333333
[24588] -> 0.9341991341991343
[539627] -> 0.9346638655462185
[241133] -> 0.9377777777777778
[312082] -> 0.9404305667010584
[28688] -> 0.9425925925925925
[602797] -> 0.9428571428571428
[79053] -> 0.9428571428571428
[483784] -> 0.9428571428571428
[369165] -> 0.9428

### Visualize Results

In [16]:
# Memo
# ----
# 1. Performance plot
#    perfplot: https://pypi.org/project/perfplot/
#
# This cell loads the per-code performance table for one cohort, prints a few
# summary statistics, and draws a horizontal bar chart of the best- and
# worst-scoring codes.
import seaborn as sns
import matplotlib.pyplot as plt
from analyzer import load_performance

sns.set(style="whitegrid")

# Initialize the matplotlib figure (tall canvas: one horizontal bar per code).
f, ax = plt.subplots(figsize=(6, 20))
sns.set_color_codes("pastel")

#---------------------------------------------

# Load performance data.
cohort = 'hepatitis-c'
df_perf = load_performance(input_dir='result', cohort=cohort)
print("> dim(performance matrix): {}".format(df_perf.shape))

# Columns seen in the printed table; kept as a module-level name in case
# later cells refer to it (it is not used in this cell).
header = ['code', 'mean', 'std', 'n_pos']
n_codes = len(df_perf)

# Thresholds for the summary statistics (score_low is not used in this cell).
score_high = 0.90
score_low = 0.50

# A negative mean marks a code that received no score (low sample size);
# anything >= 0 counts as scored.
is_scored = df_perf['mean'] >= 0
codes_low_sz = df_perf.loc[df_perf['mean'] < 0, 'code']
codes_scored = df_perf.loc[is_scored, 'code']
codes_high_score = df_perf.loc[df_perf['mean'] >= score_high, 'code']
# NaN means satisfy neither comparison and would trip this check.
assert n_codes == len(codes_low_sz) + len(codes_scored), \
    "codes with a NaN mean are neither 'scored' nor 'low sample'"

print("1. Total number of codes: {} | n(low_sample): {}, n(scored):{}, n(high scored):{}".format(n_codes, 
   len(codes_low_sz), len(codes_scored), len(codes_high_score)))
r_scored = len(codes_scored) / float(n_codes)
rh = len(codes_high_score) / float(n_codes)
print("2. Fraction of scored codes: {}".format(r_scored))
print("3. Fraction of highly scored codes: {}".format(rh))

# Effective performance dataframe, ruling out those codes without scores (due to low sample sizes)
df_eff = df_perf.loc[is_scored]

n_offset = 25
df_topn = df_eff.sort_values(['mean', ], ascending=False).head(n_offset)
df_botn = df_eff.sort_values(['mean', ], ascending=True).head(n_offset)

# Top n + bottom n, best score first.
# When fewer than 2 * n_offset codes are scored the two slices overlap, so
# fully identical rows are dropped to keep each bar from being listed twice.
dfe = (
    pd.concat([df_topn, df_botn], ignore_index=True)
      .drop_duplicates()
      .sort_values(by=['mean', ], ascending=False)
)
codes = [str(c) for c in dfe['code'].values]
scores = dfe['mean'].values
print(dfe)

# Draw on the axes created above instead of relying on pyplot's implicit
# "current axes".
sns.barplot(x='mean', y='code', data=dfe, order=dfe['code'], # order has to be specified; even if already sorted!!!
            label="LOINC", color="b", orient='h', ax=ax)

# Axis labels taken from the author's previously commented-out calls.
# NOTE(review): 'mean' is presumably the mean Fmax score per code -- confirm.
ax.set_xlabel('Fmax Score')
ax.set_ylabel('LOINC')

# Render the figure without echoing a matplotlib object repr as cell output.
plt.show()


> dim(performance matrix): (733, 4)
> dim(performance matrix): (733, 4)
1. Total number of codes: 733 | n(low_sample): 292, n(scored):441, n(high scored):371
2. Fraction of scored codes: 0.6016371077762619
3. Fraction of highly scored codes: 0.5061391541609823
      code      mean       std  n_pos
0   327767  1.000000  0.000000     31
13  112599  1.000000  0.000000    175
23  487967  1.000000  0.000000      6
22  191395  1.000000  0.000000     79
21  111567  1.000000  0.000000      7
20  427682  1.000000  0.000000     20
19  301804  1.000000  0.000000    235
18  460980  1.000000  0.000000     12
17  203943  1.000000  0.000000    490
16  204081  1.000000  0.000000     37
15   20750  1.000000  0.000000    362
14  332569  1.000000  0.000000      5
12  736553  1.000000  0.000000     72
1   632117  1.000000  0.000000      5
11  882944  1.000000  0.000000    625
10  264648  1.000000  0.000000    166
9    63016  1.000000  0.000000    828
8     7765  1.000000  0.000000   1193
7   264531  1.000

<matplotlib.axes._subplots.AxesSubplot at 0x1a5758c438>

## LOINC mapping

In [17]:
"""
Relevant features 
   "LOINC_NUM","COMPONENT","PROPERTY","TIME_ASPCT",
   "METHOD_TYP","CLASS",
   "DefinitionDescription","STATUS"
   
- A LOINC name has 6 parts 

  <component/analyte>:<kind of property>:<time aspect>:<system type>:<scale>:<method>
"""
import inspect

# Location of the LOINC table, relative to the notebook's working directory.
input_file = "Loinc.csv"
input_dir = "LoincTable"

input_path = os.path.join(input_dir, input_file)

# Malformed rows are skipped rather than aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`, where 'warn' matches the old
# `error_bad_lines=False` behaviour (skip the row and report it).
# Choose whichever keyword the installed pandas accepts.
if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
    bad_line_kwargs = {'on_bad_lines': 'warn'}
else:
    bad_line_kwargs = {'error_bad_lines': False}

df = pd.read_csv(input_path, sep=',', header=0, index_col=None, **bad_line_kwargs)

# Print the table dimensions and a per-column overview with one sampled value each.
summary(df, n=1)


> sample sizes: 92369
> n(features):  46

[LOINC_NUM] => n_uniq=92369/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['16852-6']

[COMPONENT] => n_uniq=47670/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['Tobramycin induced platelet Ab.IgM']

[PROPERTY] => n_uniq=211/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['Time']

[TIME_ASPCT] => n_uniq=94/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['1W']

[SYSTEM] => n_uniq=2473/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['Leg.right']

[SCALE_TYP] => n_uniq=10/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['Set']

[METHOD_TYP] => n_uniq=1904/(n=49133,N=92369 | r_miss=46.81%), dtype=object
  :['Vascular surgery.attending']

[CLASS] => n_uniq=379/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['EYE.EOG.NEI']

[VersionLastChanged] => n_uniq=83/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['2.56']

[CHNG_TYPE] => n_uniq=7/(n=92369,N=92369 | r_miss=0.0%), dtype=object
  :['NAM']

[DefinitionDescription] => n_uniq=8813/