In [1]:
import pandas as pd 
from pandas import DataFrame, Series
import os, sys, re
import numpy as np

from decimal import Decimal
from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')  # action='once'

%matplotlib inline

## Utility Functions

In [2]:
from analyzer import interpret

def col_values(df, col='age', n=10):
    """Return the values of one column from a reproducible random sample of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    col : str
        Name of the column whose values are returned.
    n : int
        Number of rows to sample. Capped at the number of rows in `df`, because
        sampling without replacement cannot return more rows than exist.

    Returns
    -------
    numpy.ndarray
        Values of `col` for the sampled rows (at most `n` of them).
    """
    # Without the cap, df.sample raises ValueError on frames shorter than n.
    n = min(n, df.shape[0])
    df_subset = df.sample(n=n, random_state=1)  # fixed seed -> same rows on every call
    return df_subset[col].values

def summary(df, n=1):
    """Print the row and column counts of `df`, then run `interpret` on it.

    `n` is forwarded unchanged to `interpret`, which is called in verbose mode.
    """
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    report = "".join([
        f"> sample sizes: {n_rows}\n",
        f"> n(features):  {n_cols}\n",
    ])
    print(report)

    interpret(df, n=n, verbose=True)
def show_dict(adict, topn=-1, by='', header=[], n_samples=-1, ascending=False, print_=False): 
    """Format a dictionary as a two-column text table (tabulate 'psql' style).

    Parameters
    ----------
    adict : dict
        Mapping to display; keys go into the first column, values into the second.
    topn : int
        If > 0, sort by column `by` and show only the first `topn` rows.
    by : str
        Column to sort on when `topn > 0`. When left empty, the value column
        (second header entry) is used.
    header : list of two str
        Column names; defaults to ['key', 'value'] when empty.
    n_samples : int
        Only used when `topn <= 0`: if >= 0, show a random sample of at most
        this many rows; if negative, show every row.
    ascending : bool
        Sort direction used with `topn`.
    print_ : bool
        If True, also print the table.

    Returns
    -------
    str
        The formatted table.
    """
    # `header` is only read and rebound, never mutated, so the shared default list is safe.
    if not header: 
        header = ['key', 'value']
    else: 
        assert len(header) == 2
    key_col, value_col = header

    # convert to two-column dataframe format
    df = DataFrame({key_col: list(adict.keys()), value_col: list(adict.values())}, columns=header)

    msg = ''
    if topn > 0: 
        # Previously the default by='' could never pass the check below;
        # ranking by the value column is the natural fallback.
        if not by:
            by = value_col
        assert by in df.columns
        df = df.sort_values([by, ], ascending=ascending)
        msg = tabulate(df[:topn], headers='keys', tablefmt='psql')
    elif n_samples < 0: 
        msg = tabulate(df, headers='keys', tablefmt='psql')
    else: 
        # never ask for more rows than the table holds
        n = min(df.shape[0], n_samples)
        msg = tabulate(df.sample(n=n), headers='keys', tablefmt='psql')
    if print_: print(msg)
    return msg

### Determine and Retrieve Patient Cohort(s)

In [3]:
"""
1. Find rows whose column match a substring 
   https://davidhamann.de/2017/06/26/pandas-select-elements-by-string/
   
   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html

"""
import json 
from cohort_search import gen_code_set, gen_query_str

# use module cohort_search to generate the desired query strings 
# run CohortRetrieval on Databricks (https://dbc-25924283-13f6.cloud.databricks.com/#notebook/1971793/command/1973735)

### Load Base/Positive Dataset

In [4]:
from analyzer import load_src_data, load_data, save_data, load_performance, stratify
# from transformer import canonicalize  # ... obsolete
from transformer import resolve_duplicate
import loinc as lc
# from loinc import canonicalize

######################################
# params
# `cohort` is reused by later cells to build input/output file names.
cohort = domain = 'hepatitis-c'
# Passed as `token_missing` to lc.canonicalize() in later cells.
token_default = 'unknown'
# Name of the label column (LOINC code of the test result).
col_target = 'test_result_loinc_code'
######################################

# load source data
ts0 = load_src_data(cohort=cohort, warn_bad_lines=False, canonicalized=True, processed=False)

print("> dim(ts0): {}".format(ts0.shape))  
# Distinct label values found in the cohort data; later cells use `loinc_set` as the set of target labels.
loinc_set = codes0 = ts0[col_target].unique()
# Display-only sample. np.random.choice draws with replacement by default and no seed is set,
# so the printed examples may repeat and differ between runs.
codes0_subset = np.random.choice(codes0, 30)
print("> N(loinc_set): {} | example codes (source):\n{}\n".format(len(loinc_set), list(codes0_subset)))

# stratify the data by class labels (e.g. LOINC codes)
# `ds` is consumed elsewhere in this notebook as a sequence of (code, size) pairs.
ds = stratify(ts0, col=col_target)
# Size of the first stratum.
# NOTE(review): assumes stratify() orders strata largest-first by default — confirm.
Nmax0 = ds[0][1]  
# ... at this point no extra control data has been added yet (i.e. patients without the target disease, such as hepatitis C)

# compare to the previously generated performance dataframe
# (requires a performance file for this cohort to already exist under 'result')
df_perf = load_performance(input_dir='result', cohort=cohort)
df_perf = df_perf.sort_values(by=['n_pos', ], ascending=False)
Nmax = np.max(df_perf['n_pos'].values)   # largest per-code positive sample size
Nm = np.median(df_perf['n_pos'].values)  # median per-code positive sample size

# code -> positive sample size
ss_dict = dict(zip(df_perf['code'].values, df_perf['n_pos'].values))
print("> Set the baseline class sample size: Nmax: {} (Nmax0: {}, median: {})".format(Nmax, Nmax0, Nm))

topn = 10
print("> top {} sample sizes:\n{}\n".format(topn, show_dict(ss_dict, topn=topn, 
        by='n_pos', header=['code', 'n_pos'])))

######################################
# NOTE: the bare string below is the last expression of the cell, so Jupyter echoes it as the cell's output.
# NOTE(review): the figures in it were written by hand and may not match the latest run — re-check before relying on them.
"""
Conclusion 
----------

Cohort: hepatitis-c

1. max sample size with known code: 67686 => 57202
2. loinc_set: target loinc codes
   size(loinc_set): 733
3. ts0: dim=(71224, 127)


"""

[load] Loading default input data: andromeda-pond-hepatitis-c-processed.csv
(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)
> dim(ts0): (71224, 127)
> N(loinc_set): 733 | example codes (source):
['104661', '162289', '505586', '23390', '7732', '20933', '823781', '339101', '825216', '7732', '7708', '505552', '7351', '7765', '45443', '36830', '297713', '35436', '486423', '269712', '161281', '169334', '30973', '204552', '28621', '80945', '25718', '33894', '622910', '93179']

> dim(performance matrix): (733, 5)
> Set the baseline class sample size: Nmax: 5593 (Nmax0: 5593, median: 3.0)
> top 10 sample sizes:
+----+---------+---------+
|    | code    |   n_pos |
|----+---------+---------|
|  0 | unknown |    5593 |
|  1 | 67686   |    3120 |
|  2 | 110114  |    2521 |
|  3 | 19752   |    2423 |
|  4 | 17426   |    2405 |
|  5 | 17517   |    2368 |
|  6 | 7187    |    2037 |
|  7 | 7773    |    2037 |
|  8 | 21600   |    2029 |

'\nConclusion \n----------\n\nCohort: hepatitis-c\n\n1. max sample size with known code: 67686 => 57202\n2. loinc_set: target loinc codes\n   size(loinc_set): 733\n3. ts0: dim=(71224, 127)\n\n\n'

### Load Control Data

In [5]:
# patients not in given domain/disease
from cohort_search import filter_by_diagnosis
from analyzer import save_data
from transformer import resolve_duplicate

######################################
# params
tFilter = False    # if True, pass the control data through filter_by_diagnosis()
tSave = True       # if True, write the control data to CSV
tAddCtrl = True    # if True, append a cohort-sized random subset of the control data to ts0
output_dir = 'data'  # NOTE(review): not used in this cell; save_data() is called without it
######################################

# Size of the cohort data before any control rows are added.
# NOTE: this cell rebinds ts0, so re-running it without re-running the previous
# cell appends control rows a second time.
N0 = ts0.shape[0]

# load source data
ts_ctrl = load_src_data(input_file='andromeda_pond-10p.csv', canonicalized=True)
# ts_ctrl = ts_ctrl.drop_duplicates(keep='last')  # drop duplicates
# ts_ctrl = resolve_duplicate(ts_ctrl)
# ... generate a new variable: 'count'
# ts_ctrl = lc.canonicalize(ts_ctrl, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
# ... now included by default within load_data()

print("> dim(ts_ctrl): {}".format(ts_ctrl.shape))  

if tFilter: 
    # Remember the unfiltered size so the report below can show before/after.
    # (The original code printed an undefined name `n0` here and raised NameError.)
    nctrl0 = ts_ctrl.shape[0]
    ts_ctrl = filter_by_diagnosis(ts_ctrl, condition='hepatitis c')
    nctrl = ts_ctrl.shape[0]
    print("> sample size | orig: {}, filtered(control): {}".format(nctrl0, nctrl))

# summary(df_ctrl, n=1)
# df_ctrl.info()   

# save 
if tSave: 
    # Saved before subsampling, so the file holds the full (optionally filtered) control set.
    output_file = f"andromeda-pond-{cohort}-ctrl.csv" 
    save_data(ts_ctrl,  output_file=output_file, sep=',')
    
# mix-in 
if tAddCtrl: 
    Nctrl = ts_ctrl.shape[0]
    assert Nctrl > N0, f"Control data is too small | n({cohort})={N0} > n(ctrl)={Nctrl}"
    # Draw as many control rows as there are cohort rows (unseeded, so the subset varies per run).
    ts_ctrl = ts_ctrl.sample(n=N0, replace=False)
    
    ts0 = pd.concat([ts0, ts_ctrl], ignore_index=True)
    
# every row must carry a label after the mix-in
assert np.sum(ts0[col_target].isnull()) == 0
print("> Adding control data | N: {} -> {}".format(N0, ts0.shape[0]))

[load] Loading default input data: andromeda_pond-10p.csv


b'Skipping line 256: expected 127 fields, saw 128\nSkipping line 511: expected 127 fields, saw 129\nSkipping line 5917: expected 127 fields, saw 133\nSkipping line 7086: expected 127 fields, saw 128\nSkipping line 7750: expected 127 fields, saw 128\n'
b'Skipping line 10545: expected 127 fields, saw 131\nSkipping line 14112: expected 127 fields, saw 128\nSkipping line 14113: expected 127 fields, saw 128\nSkipping line 15206: expected 127 fields, saw 135\n'
b'Skipping line 20277: expected 127 fields, saw 130\n'
b'Skipping line 25255: expected 127 fields, saw 136\nSkipping line 30914: expected 127 fields, saw 164\n'
b'Skipping line 34649: expected 127 fields, saw 130\nSkipping line 37064: expected 127 fields, saw 131\nSkipping line 39673: expected 127 fields, saw 128\nSkipping line 39674: expected 127 fields, saw 128\nSkipping line 39676: expected 127 fields, saw 128\nSkipping line 40260: expected 127 fields, saw 132\nSkipping line 40291: expected 127 fields, saw 129\nSkipping line 40614:

b'Skipping line 312711: expected 127 fields, saw 132\nSkipping line 313105: expected 127 fields, saw 128\nSkipping line 314345: expected 127 fields, saw 128\nSkipping line 315877: expected 127 fields, saw 129\nSkipping line 317175: expected 127 fields, saw 129\n'
b'Skipping line 323634: expected 127 fields, saw 130\nSkipping line 323650: expected 127 fields, saw 131\n'
b'Skipping line 331833: expected 127 fields, saw 148\nSkipping line 332177: expected 127 fields, saw 142\nSkipping line 334586: expected 127 fields, saw 130\n'
b'Skipping line 336654: expected 127 fields, saw 128\nSkipping line 341873: expected 127 fields, saw 128\n'
b'Skipping line 345036: expected 127 fields, saw 129\nSkipping line 346459: expected 127 fields, saw 131\nSkipping line 346788: expected 127 fields, saw 134\nSkipping line 348949: expected 127 fields, saw 130\nSkipping line 348994: expected 127 fields, saw 128\n'
b'Skipping line 353627: expected 127 fields, saw 130\nSkipping line 355166: expected 127 fields,

b'Skipping line 590629: expected 127 fields, saw 130\nSkipping line 595449: expected 127 fields, saw 130\nSkipping line 595463: expected 127 fields, saw 130\nSkipping line 596527: expected 127 fields, saw 130\n'
b'Skipping line 599747: expected 127 fields, saw 131\nSkipping line 599973: expected 127 fields, saw 141\nSkipping line 600752: expected 127 fields, saw 128\n'
b'Skipping line 606853: expected 127 fields, saw 129\nSkipping line 608275: expected 127 fields, saw 128\n'
b'Skipping line 615975: expected 127 fields, saw 129\nSkipping line 619136: expected 127 fields, saw 128\nSkipping line 620929: expected 127 fields, saw 130\n'
b'Skipping line 628790: expected 127 fields, saw 128\nSkipping line 628793: expected 127 fields, saw 128\nSkipping line 628794: expected 127 fields, saw 128\nSkipping line 629569: expected 127 fields, saw 128\n'
b'Skipping line 631936: expected 127 fields, saw 128\nSkipping line 632185: expected 127 fields, saw 128\nSkipping line 632331: expected 127 fields,

b'Skipping line 885532: expected 127 fields, saw 128\nSkipping line 888688: expected 127 fields, saw 130\nSkipping line 890608: expected 127 fields, saw 130\nSkipping line 891400: expected 127 fields, saw 128\n'
b'Skipping line 893962: expected 127 fields, saw 134\nSkipping line 894471: expected 127 fields, saw 128\nSkipping line 897155: expected 127 fields, saw 131\nSkipping line 897175: expected 127 fields, saw 129\n'
b'Skipping line 905330: expected 127 fields, saw 128\nSkipping line 907408: expected 127 fields, saw 128\nSkipping line 907851: expected 127 fields, saw 131\n'
b'Skipping line 910890: expected 127 fields, saw 132\nSkipping line 910933: expected 127 fields, saw 129\nSkipping line 911173: expected 127 fields, saw 135\nSkipping line 911180: expected 127 fields, saw 144\nSkipping line 913057: expected 127 fields, saw 129\nSkipping line 914740: expected 127 fields, saw 128\nSkipping line 917499: expected 127 fields, saw 128\nSkipping line 917500: expected 127 fields, saw 130

b'Skipping line 1149200: expected 127 fields, saw 128\nSkipping line 1151607: expected 127 fields, saw 128\nSkipping line 1153131: expected 127 fields, saw 129\nSkipping line 1153303: expected 127 fields, saw 135\nSkipping line 1153807: expected 127 fields, saw 134\nSkipping line 1154441: expected 127 fields, saw 130\nSkipping line 1155012: expected 127 fields, saw 128\n'
b'Skipping line 1158360: expected 127 fields, saw 130\nSkipping line 1159215: expected 127 fields, saw 128\nSkipping line 1161997: expected 127 fields, saw 129\n'
b'Skipping line 1165521: expected 127 fields, saw 131\nSkipping line 1166351: expected 127 fields, saw 128\nSkipping line 1167166: expected 127 fields, saw 134\nSkipping line 1167592: expected 127 fields, saw 148\nSkipping line 1167916: expected 127 fields, saw 138\nSkipping line 1169383: expected 127 fields, saw 130\nSkipping line 1171852: expected 127 fields, saw 128\n'
b'Skipping line 1172217: expected 127 fields, saw 130\nSkipping line 1178310: expected 

b'Skipping line 1436489: expected 127 fields, saw 128\nSkipping line 1437192: expected 127 fields, saw 133\nSkipping line 1437854: expected 127 fields, saw 128\nSkipping line 1438348: expected 127 fields, saw 130\nSkipping line 1439732: expected 127 fields, saw 130\nSkipping line 1442219: expected 127 fields, saw 129\n'
b'Skipping line 1444192: expected 127 fields, saw 128\nSkipping line 1445893: expected 127 fields, saw 129\nSkipping line 1450159: expected 127 fields, saw 130\n'
b'Skipping line 1452365: expected 127 fields, saw 128\nSkipping line 1452367: expected 127 fields, saw 128\nSkipping line 1452368: expected 127 fields, saw 128\nSkipping line 1452369: expected 127 fields, saw 128\nSkipping line 1452370: expected 127 fields, saw 128\nSkipping line 1452371: expected 127 fields, saw 128\nSkipping line 1452723: expected 127 fields, saw 134\nSkipping line 1452913: expected 127 fields, saw 128\nSkipping line 1453171: expected 127 fields, saw 133\nSkipping line 1455382: expected 127 

b'Skipping line 1714574: expected 127 fields, saw 128\nSkipping line 1715874: expected 127 fields, saw 129\nSkipping line 1716176: expected 127 fields, saw 131\nSkipping line 1716986: expected 127 fields, saw 128\nSkipping line 1720583: expected 127 fields, saw 130\n'
b'Skipping line 1721845: expected 127 fields, saw 138\nSkipping line 1727001: expected 127 fields, saw 128\nSkipping line 1729172: expected 127 fields, saw 129\n'
b'Skipping line 1729847: expected 127 fields, saw 130\nSkipping line 1730161: expected 127 fields, saw 128\nSkipping line 1730162: expected 127 fields, saw 128\nSkipping line 1730163: expected 127 fields, saw 128\nSkipping line 1730164: expected 127 fields, saw 128\nSkipping line 1731539: expected 127 fields, saw 130\nSkipping line 1734799: expected 127 fields, saw 151\nSkipping line 1735282: expected 127 fields, saw 131\nSkipping line 1735356: expected 127 fields, saw 128\nSkipping line 1736705: expected 127 fields, saw 128\n'
b'Skipping line 1744730: expected 

b'Skipping line 2026787: expected 127 fields, saw 128\nSkipping line 2032171: expected 127 fields, saw 130\n'
b'Skipping line 2033667: expected 127 fields, saw 130\nSkipping line 2035036: expected 127 fields, saw 133\nSkipping line 2035362: expected 127 fields, saw 130\nSkipping line 2035967: expected 127 fields, saw 150\nSkipping line 2038089: expected 127 fields, saw 130\nSkipping line 2040247: expected 127 fields, saw 130\n'
b'Skipping line 2043810: expected 127 fields, saw 128\nSkipping line 2047649: expected 127 fields, saw 130\nSkipping line 2048458: expected 127 fields, saw 128\n'
b'Skipping line 2049835: expected 127 fields, saw 128\nSkipping line 2049836: expected 127 fields, saw 130\nSkipping line 2055149: expected 127 fields, saw 132\nSkipping line 2055230: expected 127 fields, saw 130\n'
b'Skipping line 2060945: expected 127 fields, saw 143\nSkipping line 2062724: expected 127 fields, saw 130\n'
b'Skipping line 2065803: expected 127 fields, saw 130\nSkipping line 2072015: e

b'Skipping line 2296407: expected 127 fields, saw 131\n'
b'Skipping line 2303428: expected 127 fields, saw 134\nSkipping line 2309310: expected 127 fields, saw 128\nSkipping line 2309382: expected 127 fields, saw 151\n'
b'Skipping line 2312483: expected 127 fields, saw 128\nSkipping line 2312484: expected 127 fields, saw 128\nSkipping line 2312485: expected 127 fields, saw 128\nSkipping line 2313675: expected 127 fields, saw 133\nSkipping line 2316809: expected 127 fields, saw 128\nSkipping line 2317913: expected 127 fields, saw 132\nSkipping line 2319562: expected 127 fields, saw 128\n'
b'Skipping line 2324118: expected 127 fields, saw 132\nSkipping line 2324522: expected 127 fields, saw 136\nSkipping line 2324909: expected 127 fields, saw 128\n'
b'Skipping line 2328812: expected 127 fields, saw 128\nSkipping line 2329812: expected 127 fields, saw 134\nSkipping line 2329813: expected 127 fields, saw 138\n'
b'Skipping line 2336944: expected 127 fields, saw 146\nSkipping line 2343343: e

b'Skipping line 2574653: expected 127 fields, saw 143\nSkipping line 2575429: expected 127 fields, saw 130\nSkipping line 2577944: expected 127 fields, saw 128\nSkipping line 2579050: expected 127 fields, saw 128\n'
b'Skipping line 2585041: expected 127 fields, saw 138\nSkipping line 2585513: expected 127 fields, saw 130\nSkipping line 2585955: expected 127 fields, saw 128\nSkipping line 2586274: expected 127 fields, saw 134\nSkipping line 2586416: expected 127 fields, saw 151\nSkipping line 2587052: expected 127 fields, saw 130\nSkipping line 2589807: expected 127 fields, saw 130\nSkipping line 2589815: expected 127 fields, saw 128\n'
b'Skipping line 2591189: expected 127 fields, saw 133\nSkipping line 2592749: expected 127 fields, saw 132\nSkipping line 2592775: expected 127 fields, saw 132\nSkipping line 2594980: expected 127 fields, saw 128\nSkipping line 2594981: expected 127 fields, saw 128\nSkipping line 2594982: expected 127 fields, saw 128\nSkipping line 2595528: expected 127 

b'Skipping line 2852507: expected 127 fields, saw 128\nSkipping line 2854649: expected 127 fields, saw 131\nSkipping line 2858257: expected 127 fields, saw 130\nSkipping line 2858882: expected 127 fields, saw 130\n'
b'Skipping line 2861895: expected 127 fields, saw 130\nSkipping line 2864322: expected 127 fields, saw 128\nSkipping line 2864328: expected 127 fields, saw 128\nSkipping line 2868424: expected 127 fields, saw 128\n'
b'Skipping line 2873548: expected 127 fields, saw 128\nSkipping line 2874146: expected 127 fields, saw 136\nSkipping line 2874575: expected 127 fields, saw 128\nSkipping line 2875027: expected 127 fields, saw 128\nSkipping line 2876078: expected 127 fields, saw 131\n'
b'Skipping line 2880032: expected 127 fields, saw 129\nSkipping line 2882199: expected 127 fields, saw 129\nSkipping line 2883429: expected 127 fields, saw 129\nSkipping line 2883920: expected 127 fields, saw 134\n'
b'Skipping line 2886960: expected 127 fields, saw 130\nSkipping line 2889404: expec

(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)
> dim(ts_ctrl): (2891340, 127)
(save_data) Saved dataframe (dim=(2891340, 127)) to:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c-ctrl.csv

> Adding control data | N: 71224 -> 142448


### Define Feature Set 

Note: Subsetting columns could potentially reduce the size of the data

In [6]:
"""
Memo
----
1. medivo_test_result_type is a function of the following attributes: 
      "meta_sender_name",
      "receiving_organization_id",
      "test_order_code",
      "test_order_name",
      "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"
      
"""
from transformer import to_age
from analyzer import sample_col_values
from loinc import FeatureSet

# Categorical feature names, taken from the project-wide FeatureSet definition.
cat_cols = FeatureSet.cat_cols

# Hand-copied listing of the categorical columns, kept for reference
# (NOTE(review): may be out of date with FeatureSet.cat_cols — confirm):
# ['patient_gender',
#  'patient_state',
#  'patient_bill_type',
#  'fasting',
#  'performing_organization_id',
#  'receiving_organization_id',
#  'test_result_status',
#  'test_order_code',
#  'test_order_name',
#  'test_result_code',
#  'test_result_name',
#  'test_result_value',
#  'test_result_range',
#  'test_result_abnormal_flag',
#  'test_result_reference_range',
#  'test_result_units_of_measure',
#  'test_result_comments',
#  'test_cpt_code',
#  'panel_order_code',
#  'panel_order_name',
#  'meta_sender_name',
#  'medivo_test_result_type']

# Continuous feature names.
cont_cols = FeatureSet.cont_cols  

target_cols = FeatureSet.target_cols # ['test_result_loinc_code', ]

derived_cols = FeatureSet.derived_cols
# ['count',  # due to resolve_duplicate()
# ]

# cardinality < 100
low_card_cols = ['patient_gender', 'fasting', 'meta_sender_name' ]
# Every categorical column not listed as low-cardinality. Built in cat_cols order
# (duplicates removed) so the list is identical on every run; the previous
# set-difference produced an order that could change between kernel sessions.
high_card_cols = [c for c in dict.fromkeys(cat_cols) if c not in low_card_cols]

# Small subset of columns printed next to the encoded features for orientation.
representative_cols = ["meta_sender_name",
      # "receiving_organization_id",
      # "test_order_code",
      "test_order_name",
      # "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"]

# Full list of columns of interest: features followed by the target(s).
target_columns = cat_cols + cont_cols + target_cols

# feature transformation
#####################################################
# to_age(ts0)
# values = sample_col_values(ts0, col='age', n=10)
# print("> age: {}".format(values))

### Balance Classes 

Note: under-represented classes are topped up with samples drawn from an external dataset.

In [None]:
from analyzer import balance_data_incr, load_data_incr, stratify
from transformer import resolve_duplicate
import loinc as lc
from loinc import LoincTSet

# run here or just load the curated data generated by analyzer.t_stratify()
tLoad = True          # True: load the previously balanced CSV; False: rebuild it from the external data
tSave = not tLoad     # only save when the balanced set was rebuilt in this run

col_target = LoincTSet.col_target
token_default = token_missing = 'unknown'

# Class-size distribution of the current (cohort + control) data.
ts = ts0
ds = stratify(ts, col='test_result_loinc_code', ascending=False)
print("(balance_classes) data size distribution:\n{}\n".format(ds[:25]))
# Ignore placeholder labels (e.g. 'unknown') when picking the reference class size.
ds = [(code, size) for code, size in ds if code not in lc.LoincTSet.non_codes]
max_size = ds[0][1] # ds[2][1]
codes_low_sz = set([code for code, size in ds if size < max_size])
print("(balance_classes) We have n={} with low sample size (< {})".format(len(codes_low_sz), max_size))

if tLoad: 
    input_file=f"andromeda-pond-{cohort}-balanced.csv"
    ts = load_src_data(input_file=input_file, warn_bad_lines=False)
    # Labels are compared as strings throughout the notebook.
    ts[col_target] = ts[col_target].astype(str)
    # ts = ts.drop_duplicates(keep='last')  # drop duplicates 
    # ts = lc.canonicalize(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
    if not lc.is_canonicalized(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set): 
        ts = ts.drop_duplicates(keep='last')  # drop duplicates 
        ts = lc.canonicalize(ts, col_target=col_target, token_missing=token_default) # noisy_values/[]
    
else: 
    # max_size = 1000
    input_file = f"andromeda-pond-{cohort}-loinc.csv"
    codes = set(loinc_set)
    codes_hit = set([])   # codes for which the external data supplied extra rows
    for i, tsi in enumerate(load_data_incr(input_file=input_file, chunksize=1000000, warn_bad_lines=False)): 
        N0 = ts.shape[0]
        # tsi = tsi.drop_duplicates(keep='last')  # drop duplicates 
        tsi = resolve_duplicate(tsi)
        tsi = lc.canonicalize(tsi, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
        print("[{}] Processing chunk #{} | n(ts): {}, n(tsi): {} ...".format(i, i+1, N0, tsi.shape[0]))

        ts_incr, hit, missed = balance_data_incr(df=ts, df_extern=tsi, n_samples=max_size, col=col_target)
        if not ts_incr.empty: ts = pd.concat([ts, ts_incr])
            
        # --- analysis --- 
        N = ts.shape[0]
        # Accumulate matched codes in place. set.union() returns a new set and leaves
        # codes_hit untouched, which made every low-size code look unmatched below.
        codes_hit.update(hit)

        print(f"[{i}] size: {N0} -> {N}")
        ds = stratify(ts, col='test_result_loinc_code', ascending=False)
        ds = [(code, size) for code, size in ds if size < max_size]
        print("[{}] size(codes) < {} (n={}): \n{}\n".format(i, max_size, len(ds), ds[:20]))

    # Low-size codes that no external chunk could top up.
    codes_missed = codes_low_sz - codes_hit  
    print("(t_stratify) At last, we could not find a match for n={} codes among nl={} low-sample-size labels:\n{}\n".format(
        len(codes_missed), len(codes_low_sz), codes_missed))  
    
    # down sample control data (very often the negative examples are too huge)

    if tSave: 
        
        # should not have duplicates at this point (due to resolve_duplicate() call)
        # ts = ts.drop_duplicates(keep='last')  # drop duplicates 
        
        output_file = f"andromeda-pond-{cohort}-balanced.csv"
        save_data(ts, output_file=output_file)

"""
Output
------
   ts: balanced training data; well... as balaned as possible because external data may not be able to supply sufficient 
       data for a subset of the classes/loincs
"""

(balance_classes) data size distribution:
[('unknown', 11981), ('67686', 5639), ('17517', 4457), ('17426', 4451), ('19752', 4333), ('21600', 4179), ('178616', 3862), ('7773', 3757), ('7187', 3755), ('486431', 3745), ('30973', 3605), ('19208', 3582), ('66902', 3329), ('7310', 3035), ('28233', 3016), ('7138', 2767), ('7112', 2766), ('110114', 2554), ('7518', 2411), ('7708', 2404), ('45443', 2355), ('134577', 2251), ('25718', 2240), ('20933', 2215), ('30940', 2170)]

(balance_classes) We have n=1517 with low sample size (< 5639)
[load] Loading default input data: andromeda-pond-hepatitis-c-balanced.csv
(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)
(is_canonicalized) Found n=1 codes not in target set:
{'nan'}

(canonicalize) Operations: fill n/a + dehyphenate + replace_values + trim_tail + fill others (non-target classes)


### Feature Transformation

Note: `patient_date_of_birth` is converted to `age`.

In [None]:
from transformer import to_age
from analyzer import col_values
# from loinc import FeatureSet

# to_age() is called for its effect on `ts`; its return value is not used here.
# NOTE(review): presumably it adds an 'age' column in place — the next line reads one.
to_age(ts)
# Spot-check: print the 'age' values of 10 sampled rows.
values = col_values(ts, col='age', n=10)
print("> age: {}".format(values))

# resolve_duplicate() call adds a new column: count (of duplicates)
# assert 'count' in ts.columns

# datetime columns (TODO: nothing is done with them yet)

### Subset Features and Handle Missing Values

In [None]:
tCategorify = False   # NOTE(review): not used in this cell
tDropHighMissing = False # drop columns with high rate of missing values
p_null = 0.9          # a column is dropped when its fraction of missing values is >= p_null
token_default = token_missing = 'unknown'

df = ts # ... :)

# V = list(feature_lookup.keys())
V = cont_cols + cat_cols # + derived_cols
L = target_cols
dfX = df[V]   # feature frame
dfy = df[L]   # label frame

print("> Given features set:\n{}\n".format(V))

# every row must carry a label
assert np.sum(dfy[target_cols[0]].isnull()) == 0

# drop columns/vars with too many missing values 
N = dfX.shape[0]
n_thresh = int(N * p_null)
nf0 = nf = dfX.shape[1]
fset0 = set(dfX.columns.values)

if tDropHighMissing: 
    dfX = dfX[dfX.columns[dfX.isnull().mean() < p_null]]
    fset = set(dfX.columns.values)
    nf = dfX.shape[1]
    # nf0 - nf is the number of removed columns (the reverse difference printed a negative count)
    print("> Dropped n={} features:\n{}\n".format(nf0-nf, fset0-fset))
    
fset = set(dfX.columns.values)
print("> Final feature set (nf={}):\n{}\n".format(nf, fset))

# fill in missing values (also see default_values)
# Rebind instead of filling in place: dfX was obtained by selecting columns from df,
# and an in-place fill on such a derived frame triggers pandas' SettingWithCopy warning.
# NOTE(review): the same string token is used for continuous columns too — confirm that
# the encoding step copes with a non-numeric filler there.
dfX = dfX.fillna(value=token_default)
#################################################
# Keep only the feature names that survived the (optional) column drop.

cat_cols = [cat for cat in cat_cols if cat in dfX.columns]
cont_cols = [c for c in cont_cols if c in dfX.columns]

"""
Output
------
   dfX 
   dfy
"""

### Encode Variables

In [None]:
from transformer import encode_vars 
from loinc import FeatureSet
# high_card_cols = FeatureSet.high_card_cols
# Feature count before encoding, for the before/after report below.
nf0 = dfX.shape[1]
# encode_vars() returns the transformed feature frame plus an encoder object.
# NOTE: dfX is rebound to the encoded frame, so re-running this cell alone would feed
# already-encoded data back in; re-run the previous cell first.
dfX, encoder = encode_vars(dfX, fset=cat_cols, high_card_cols=high_card_cols)
print("> After variable encoding we have dim(dfX): {} | nf: {} -> {}".format(dfX.shape, nf0, dfX.shape[1]))
print("> New feature set:\n{}\n".format(dfX.columns))
# For orientation, show a few key columns from `df` (the frame the features were selected from).
print(df[representative_cols].head(10).to_string(index=False))

### Encode Labels 

In [None]:
from analyzer import encode_labels, summarize_dict, get_sample_sizes
import collections, operator

# Sanity check: features and labels must describe the same rows.
assert dfX.shape[0] == dfy.shape[0], f"> dim(dfX): {dfX.shape} | dfy.cols: {dfy.columns.values}"

# Symbolic label -> numeric label.
codebook={'pos': 1, 'neg': 0, '+': 1, '-': 0}

# The class with the largest sample size is used as 'positive' in the test at the bottom.
col_label = 'test_result_loinc_code' # strings

topn = 5
# sizes: (loinc) label -> sample size
sizes = get_sample_sizes(dfy[col_label])
# print("> n(sizes): {}".format(len(sizes)))  # 734 for cohort='hepatitis-c'

# Q: How many classes/codes have fewer than N instances?
N_low = 1000
n_low = len([label for label, count in sizes.items() if count < N_low])
print(f"> Low sample size classes | n={n_low} (< {N_low})")

N_elow = 10
n_elow = len([label for label, count in sizes.items() if count < N_elow])
print(f"> Extreme low sample size classes | n={n_elow} (< {N_elow})")

# Q: How many classes/codes come within `eps` of the most enriched class size (e.g. 5707)?
eps = 10
n_matched = len([label for label, count in sizes.items() if count >= max_size-eps])
print(f"> n_matched: {n_matched} | max_size: {max_size}")

# (label, size) pairs ordered by size, smallest first
sizes_sorted = sorted(sizes.items(), key=lambda pair: pair[1])
summarize_dict(sizes, topn=15, sort_=True)

print(f"> sizes: {sizes.most_common(20)}")
most_sample_sizes = sizes.most_common(topn)  # take(topn, sizes.items())
print(f"> Sample sizes | Top N={topn} codes:\n{most_sample_sizes}\n")
# the `topn` rarest codes, rarest first
least_sample_sizes = list(reversed(sizes.most_common()))[:topn]
print(f"> Sample sizss | Last N={topn} codes:\n{least_sample_sizes}\n")

# Smoke test: binarise the labels against the most frequent code.
target = most_sample_sizes[0][0]
y = encode_labels(dfy, pos_label=target, codebook=codebook, verbose=1)

## Initial Model Training

### 1. Feature Selection

In [None]:
"""
Ref
---
1. pip install feature-selector

   https://github.com/WillKoehrsen/feature-selector
   
   possible dependency 
      brew install libomp
      
   <debug> 
       + RuntimeError: Python is not installed as a framework.
          > https://stackoverflow.com/questions/34977388/matplotlib-runtimeerror-python-is-not-installed-as-a-framework
   
"""
import feature_selector 

### 2. Save a copy of encoded training set (optional)

In [None]:
# Switch for writing the training set to '<...>-encoded.csv'.
tSaveEncodedTSet = True 

if tSaveEncodedTSet: 
    output_file = f"andromeda-pond-{cohort}-encoded.csv"
    # NOTE(review): this writes `ts`, whereas the encoded features and labels live in
    # `dfX` / `dfy`. Confirm that `ts` is really what should go into the "-encoded" file.
    save_data(ts, output_file=output_file)
    

### 3. Model Training

In [None]:
import utils_tree, utils_sys, analyzer
import collections
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from analyzer import balance_by_downsampling

# data transformation
col = 'test_result_loinc_code'
X, y = dfX.values, dfy[col].values
print("> dim(X): {}, sample(y): {}".format(X.shape, np.random.choice(np.unique(y),20) ))

# feature scaling
scaler = MinMaxScaler() # MinMaxScaler(), StandardScaler()
X = scaler.fit_transform(X)

n_fold = 5
n_min = n_fold   # need at least one positive example per CV fold

# to save performance data
header = ['code', 'mean', 'std', 'n_pos']
sdict = {h:[] for h in header}
# One-vs-rest evaluation: each LOINC code in turn is treated as the positive class.
for code in loinc_set: 
    # NOTE(review): no codebook is passed here, yet the counts below are looked up via
    # `codebook` — confirm that encode_labels() defaults to the same 1/0 coding.
    y_eff = analyzer.encode_labels(y, pos_label=code)
    
    counter = collections.Counter(y_eff)
    n_pos, n_neg = counter[codebook['pos']], counter[codebook['neg']]
    print("> sample size | n(+): {}, n(-): {} | dim(y_encoded): {}".format(n_pos, n_neg, y_eff.shape))
    
    if n_pos >= n_min: 
        # downsampling majority classes (very often we have too many negative instances)
        X_eff, y_eff = balance_by_downsampling(X, y_eff, method='multiple', majority_max=3)
        # Recount on the downsampled labels; reusing the earlier counter just repeated the
        # pre-downsampling numbers. Separate names keep `n_pos` (recorded below) unchanged.
        counter_eff = collections.Counter(y_eff)
        n_pos_eff, n_neg_eff = counter_eff[codebook['pos']], counter_eff[codebook['neg']]
        print(f"> sample size (after downsampling majority) | n(+): {n_pos_eff}, n(-): {n_neg_eff}")
        
        scores = analyzer.eval_performance(X_eff, y_eff, model=None, cv=n_fold, random_state=53, verbose=1)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print("> average: {}, std: {}".format(mean_score, std_score))
    else: 
        print("> (positive) sample size too small, n={}".format(n_pos))
        # -1 marks "not evaluated"; the visualisation cell filters on mean < 0
        mean_score = -1 
        std_score = -1
    sdict['code'].append(code)
    sdict['mean'].append(mean_score)
    sdict['std'].append(std_score)
    sdict['n_pos'].append(n_pos)

# --------------------------------------------------
# save performance dataframe
df_perf = DataFrame(sdict, columns=header)
df_perf = df_perf.sort_values(by=['mean', ]) # ascending=False
# Save the performance table itself (the original passed `df`, i.e. the training data).
analyzer.save_performnace(df_perf, output_dir='result') # output_file/'' (performance-<cohort>.csv)

cohort = 'hepatitis-c'
output_dir = os.path.join(os.getcwd(), 'result')
os.makedirs(output_dir, exist_ok=True)   # to_csv does not create missing directories
output_file = f"performance-{cohort}-0.csv" 
output_path = os.path.join(output_dir, output_file)
df_perf.to_csv(output_path, sep='|', index=False, header=True)

for code, score in zip(df_perf['code'], df_perf['mean']):
    print(f"[{code}] -> {score}")

### Visualize Results

In [None]:
"""

Memo
---- 
1. performance plot

   perplot: https://pypi.org/project/perfplot/
"""
import seaborn as sns
import matplotlib.pyplot as plt
from analyzer import load_performance

sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 20))
sns.set_color_codes("pastel")

#---------------------------------------------

# load performance data
cohort = 'hepatitis-c'
df_perf = load_performance(input_dir='result', cohort=cohort)
print("> dim(performance matrix): {}".format(df_perf.shape))

# sort ~ performance scores 
# df_perf = df_perf.sort_values(by=['mean', ], ascending=False)

header = ['code', 'mean', 'std', 'n_pos']
codes = df_perf['code']
n_codes = len(codes)
scores = df_perf['mean']

# some statistics
score_high = 0.90
score_low = 0.50   # NOTE(review): not used in this cell

# A negative mean marks codes skipped during training for lack of positive examples.
codes_low_sz = df_perf.loc[df_perf['mean'] < 0]['code']
codes_scored = df_perf.loc[df_perf['mean'] >= 0]['code']
codes_high_score = df_perf.loc[df_perf['mean'] >= score_high]['code']
# every code is either skipped or scored (this also catches NaN means)
assert n_codes == len(codes_low_sz) + len(codes_scored)

print("1. Total number of codes: {} | n(low_sample): {}, n(scored):{}, n(high scored):{}".format(n_codes, 
   len(codes_low_sz), len(codes_scored), len(codes_high_score)))
r_scored = len(codes_scored)/(n_codes+0.0)
rh = len(codes_high_score)/(n_codes+0.0)
print("2. Fraction of scored codes: {}".format(r_scored))
print("3. Fraction of highly scored codes: {}".format(rh))

# Effective performance dataframe, ruling out those codes without scores (due to low sample sizes)
df_eff = df_perf.loc[df_perf['mean'] >= 0.0]

n_offset = 25
df_topn = df_eff.sort_values(['mean', ], ascending=False).head(n_offset)
df_botn = df_eff.sort_values(['mean', ], ascending=True).head(n_offset)
# print(df_botn)

# codes = [str(c) for c in df_botn['code'].values]
# print('lower codes: {}'.format(codes))
# scores = df_botn['mean'].values
# print('scores: {}'.format(scores))

# top n + bottom n
dfe = pd.concat([df_topn, df_botn], ignore_index=True)
# With fewer than 2*n_offset scored codes the two slices overlap; keep each code once
# so the bar order passed to seaborn has no repeated categories.
dfe = dfe.drop_duplicates(subset='code')
dfe = dfe.sort_values(by=['mean', ], ascending=False)
codes = [str(c) for c in dfe['code'].values]
scores = dfe['mean'].values
# print('lower(n)+higher codes(n): {}'.format(codes))
# print('scores: {}'.format(scores))
print(dfe)

# sns.barplot(x="total", y="abbrev", data=crashes,
#             label="Total", color="b")

# --------------------
# ax = sns.barplot(x='mean', y='code', data=df_botn)
# print("-------------------------\n\n")
# print("> dtype: {}".format(df_botn.dtypes))
# print(df_botn.head(10))

# dfe = dfe[['mean', 'code']]
# dfe.plot(kind='bar')

# Draw on the axes created above rather than relying on the implicit current axes.
sns.barplot(x='mean', y='code', data=dfe, order=dfe['code'], # order has to be specified; even if already sorted!!!
            label="LOINC", color="b", orient='h', ax=ax)

# ax = sns.barplot(x='mean', y='code', data=df)

# Label the axes so the figure can be read on its own.
ax.set_xlabel('Mean cross-validation score')
ax.set_ylabel('LOINC code');
