In [1]:
import pandas as pd 
from pandas import DataFrame, Series
import os, sys, re
import numpy as np

from decimal import Decimal
from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')  # action='once'

%matplotlib inline

## Utility Functions

In [2]:
from analyzer import interpret

def col_values(df, col='age', n=10):
    """Return a reproducible random sample of values from one column.

    Parameters
    ----------
    df : DataFrame to sample rows from.
    col : name of the column whose values are returned.
    n : maximum number of rows to sample. If `df` has fewer than `n` rows,
        all rows are returned (in sampled order) instead of raising.

    Returns
    -------
    numpy array holding the sampled values of `col`.
    """
    # sampling without replacement raises ValueError when n exceeds the number of rows
    n_eff = min(n, df.shape[0])
    df_subset = df.sample(n=n_eff, random_state=1)  # fixed seed: same rows on every call
    return df_subset[col].values

def summary(df, n=1): 
    """Print the row and column counts of `df`, then run `interpret` on it.

    Parameters
    ----------
    df : object exposing a 2-tuple `shape` (rows, columns), e.g. a DataFrame.
    n : forwarded unchanged to `interpret` as its `n` argument.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    report_lines = [
        "> sample sizes: {}\n".format(n_rows),
        "> n(features):  {}\n".format(n_cols),
    ]
    print("".join(report_lines))

    # per-column report is delegated to the analyzer module
    interpret(df, n=n, verbose=True)
def show_dict(adict, topn=-1, by='', header=None, n_samples=-1, ascending=False, print_=False): 
    """Render a dictionary as a two-column text table.

    Parameters
    ----------
    adict : mapping to display; keys go to the first column, values to the second.
    topn : if > 0, sort by column `by` and show only the first `topn` rows.
    by : column to sort on when `topn > 0`; must be one of the `header` names.
         When left empty, the value column (second header entry) is used.
    header : two column names; defaults to ['key', 'value'].
    n_samples : used only when `topn <= 0`; if >= 0, show a random sample of at
         most this many rows, otherwise show every row.
    ascending : sort direction used with `topn`.
    print_ : also print the table when True.

    Returns
    -------
    The table produced by `tabulate` (psql format).
    """
    if not header: 
        header = ['key', 'value']
    else: 
        assert len(header) == 2
    key_col, value_col = header

    # convert to two-column dataframe format
    df = DataFrame({key_col: list(adict.keys()), value_col: list(adict.values())}, columns=header)

    if topn > 0: 
        if not by:
            # previously the default by='' always failed the assertion below
            by = value_col
        assert by in df.columns
        ranked = df.sort_values([by, ], ascending=ascending)
        msg = tabulate(ranked[:topn], headers='keys', tablefmt='psql')
    elif n_samples < 0: 
        msg = tabulate(df, headers='keys', tablefmt='psql')
    else: 
        n = min(df.shape[0], n_samples)  # never request more rows than exist
        msg = tabulate(df.sample(n=n), headers='keys', tablefmt='psql')

    if print_: 
        print(msg)
    return msg

### Determine and Retrieve Patient Cohort(s)

In [3]:
"""
1. Find rows whose column match a substring 
   https://davidhamann.de/2017/06/26/pandas-select-elements-by-string/
   
   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html

"""
import json 
from cohort_search import gen_code_set, gen_query_str

# use module cohort_search to generate the desired query strings 
# run CohortRetrieval on Databricks (https://dbc-25924283-13f6.cloud.databricks.com/#notebook/1971793/command/1973735)

### Load Base/Positive Dataset

In [4]:
from analyzer import load_data, save_data, load_performance, stratify
# from transformer import canonicalize  # ... obsolete
from transformer import resolve_duplicate
import loinc as lc
# from loinc import canonicalize

######################################
# params
cohort = domain = 'hepatitis-c'
token_default = 'unknown'
col_target = 'test_result_loinc_code'
######################################

# Load the positive-cohort source data, resolve duplicates and canonicalize the target column.
ts0 = load_data(input_file='andromeda-pond-hepatitis-c.csv', warn_bad_lines=False)
# ts0 = ts0.drop_duplicates(keep='last')  # drop duplicates 
ts0 = resolve_duplicate(ts0)  # add_count/True: adding extra column: 'count'
ts0 = lc.canonicalize(ts0, col_target=col_target, token_missing=token_default)

print("> dim(ts0): {}".format(ts0.shape))  
# loinc_set: the distinct target labels present in the cohort; reused by later cells
loinc_set = codes0 = ts0[col_target].unique()
# display-only sample; np.random.choice draws with replacement by default and is unseeded here,
# so the printed examples may repeat and differ between runs
codes0_subset = np.random.choice(codes0, 30)
print("> N(loinc_set): {} | example codes (source):\n{}\n".format(len(loinc_set), list(codes0_subset)))

# stratify the data by class labels (e.g. LOINC codes)
ds = stratify(ts0, col=col_target) # presumably a sequence of (code, size) pairs -- verify against analyzer.stratify
Nmax0 = ds[0][1]  # second field of the first entry, reported below as Nmax0
# ... here we have not added extra control data yet (i.e. patients without the target disease such as hepatitis C)

# compare to the previously generated performance dataframe
# NOTE(review): requires a performance file from an earlier run under 'result' -- confirm it exists on a fresh checkout
df_perf = load_performance(input_dir='result', cohort=cohort)
df_perf = df_perf.sort_values(by=['n_pos', ], ascending=False)
Nmax = np.max(df_perf['n_pos'].values)
Nm = np.median(df_perf['n_pos'].values)

# code -> n_pos lookup taken from the performance dataframe
ss_dict = dict(zip(df_perf['code'].values, df_perf['n_pos'].values))
print("> Set the baseline class sample size: Nmax: {} (Nmax0: {}, median: {})".format(Nmax, Nmax0, Nm))

topn = 10
print("> top {} sample sizes:\n{}\n".format(topn, show_dict(ss_dict, topn=topn, 
        by='n_pos', header=['code', 'n_pos'])))

######################################
"""
Conclusion 
----------

Cohort: hepatitis-c

1. max sample size with known code: 67686 => 57202
2. loinc_set: target loinc codes
   size(loinc_set): 733
3. ts0: dim=(71224, 127)


"""

(load_data) Loaded dataframe (dim=(71224, 127)) from:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c.csv

(resolve_duplicate) n0: 71224 =?= n1: 71224
(resolve_duplicate) Found 596 duplicate rows wrt cols: ['test_result_loinc_code']
(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
> dim(ts0): (69710, 128)
> N(loinc_set): 733 | example codes (source):
['51953', '755082', '885178', '98426', '7310', '63313', '483453', '425959', '178640', '30741', '139956', '204552', '161901', '30130', '154120', '204966', '191395', '133629', '111567', '349993', '351684', '19869', '332551', '505511', '7518', '7047', '28902', '20008', '237610', '58214']

> dim(performance matrix): (733, 4)
> Set the baseline class sample size: Nmax: 12033 (Nmax0: 4079, median: 7.0)
> top 10 sample sizes:
+----+---------+---------+
|    | code    |   n_pos |
|----+---------+---------|
|  0 | unknown |   12033 |
|  1 | 67686   |    5720 |
|  2 | 19752   |    4

'\nConclusion \n----------\n\nCohort: hepatitis-c\n\n1. max sample size with known code: 67686 => 57202\n2. loinc_set: target loinc codes\n   size(loinc_set): 733\n3. ts0: dim=(71224, 127)\n\n\n'

### Load Control Data

In [5]:
# patients not in given domain/disease
from cohort_search import filter_by_diagnosis
from analyzer import save_data
from transformer import resolve_duplicate

######################################
# params
tFilter = False   # if True, pass the control data through filter_by_diagnosis()
tSave = True      # if True, write the canonicalized control data to CSV
tAddCtrl = True   # if True, append a random control sample (same size as ts0) to ts0
output_dir = 'data'  # not referenced in this cell
######################################

# size of the positive cohort before any control data is mixed in
N0 = ts0.shape[0]

# load source data
ts_ctrl = load_data(input_file='andromeda_pond-10p.csv', warn_bad_lines=False)
# ts_ctrl = ts_ctrl.drop_duplicates(keep='last')  # drop duplicates
ts_ctrl = resolve_duplicate(ts_ctrl)
# restrict labels to the cohort's loinc_set (target_labels)
ts_ctrl = lc.canonicalize(ts_ctrl, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
print("> dim(ts_ctrl): {}".format(ts_ctrl.shape))  

if tFilter: 
    # Record the control size BEFORE filtering. The original code printed an
    # undefined name `n0` here, which raised NameError whenever tFilter was True.
    n_ctrl_orig = ts_ctrl.shape[0]
    ts_ctrl = filter_by_diagnosis(ts_ctrl, condition='hepatitis c')
    nctrl = ts_ctrl.shape[0]
    print("> sample size | orig: {}, filtered(control): {}".format(n_ctrl_orig, nctrl))

# summary(df_ctrl, n=1)
# df_ctrl.info()   

# save 
if tSave: 
    output_file = f"andromeda-pond-{cohort}-ctrl.csv" 
    save_data(ts_ctrl,  output_file=output_file, sep=',')
    
# mix-in 
if tAddCtrl: 
    Nctrl = ts_ctrl.shape[0]
    assert Nctrl > N0, f"Control data is too small | n({cohort})={N0} > n(ctrl)={Nctrl}"
    # NOTE(review): unseeded sample -- the control subset differs between runs
    ts_ctrl = ts_ctrl.sample(n=N0, replace=False)
    
    # NOTE(review): rebinding ts0 makes this cell non-idempotent; re-running it
    # without re-running the base-data cell appends control rows a second time
    ts0 = pd.concat([ts0, ts_ctrl], ignore_index=True)
    
# every row must carry a label after canonicalization
assert np.sum(ts0[col_target].isnull()) == 0
print("> Adding control data | N: {} -> {}".format(N0, ts0.shape[0]))

(load_data) Loaded dataframe (dim=(2891340, 127)) from:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda_pond-10p.csv

(resolve_duplicate) n0: 2891340 =?= n1: 2891340
(resolve_duplicate) Found 4442 duplicate rows wrt cols: ['test_result_loinc_code']
(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
> dim(ts_ctrl): (2639054, 128)
(save_data) Saved dataframe (dim=(2639054, 128)) to:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c-ctrl.csv

> Adding control data | N: 69710 -> 139420


### Define Feature Set 

Note: Subsetting columns could potentially reduce the size of the data

In [6]:
"""
Memo
----
1. medivo_test_result_type is a function of the following attributes: 
      "meta_sender_name",
      "receiving_organization_id",
      "test_order_code",
      "test_order_name",
      "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"
      
"""
from transformer import to_age
from analyzer import sample_col_values
from loinc import FeatureSet

# Categorical features. Trailing notes record observations made while exploring the
# data (n_uniq = number of distinct values, m = missing rate). Commented-out entries
# are candidate columns that are currently excluded.
cat_cols = ['patient_gender', 
            'patient_state',  # n_uniq=199
            'patient_bill_type',  # n_uniq=31
            'fasting',   # n_uniq=5
            
            'performing_organization_id', # n_uniq=151, m=40%+, NOT part of medivo_test_result_type
            
            'receiving_organization_id', # n_uniq=43, m=50%+, part of medivo_test_result_type
            # 'receiving_organization_name', 
            
            # 'receiving_organization_state', 
            # 'receiving_organization_zip_code', 
            
            # 'ordering_practice_lab_account_name',  # high card
            # 'ordering_practice_lab_account_number', # high card
            
            # 'ordering_practice_city', # high card 
            # 'ordering_practice_state', # high card 124? 
            
            # 'ordering_practice_zip_code', # high card,  n_uniq=79392
            # 'ordering_provider_alternate_id_type',   # n_uniq=32
            
            # 'ordering_provider_alternate_id', # n_uniq=132768
            
            # ---------------------------------
            
            'test_result_status', # n_uniq=144
            # 'test_turnaround_time', # n_uniq=417, high missing
            
            'test_order_code',  # n_uniq=27668
            'test_order_name',  # n_uniq=20039
            
            'test_result_code', # n_uniq=23731 (2771052/2891340)
            'test_result_name',  # n_uniq=15581    # <<<< 
            
            'test_result_value',  # n_uniq=35441    # <<<< 
            'test_result_range',   # n_uniq=151, mostly missing   # <<<< 
            
            'test_result_abnormal_flag',  # n_uniq=524, high missing
            
            'test_result_reference_range',  # n_uniq=5735, moderate missing
            
            'test_result_units_of_measure',  # n_uniq=669, m=40%+
            
            # 'test_result_comment_source', # mostly missing
            
            'test_result_comments',  # mostly missing > 80%   # <<<< 
            
            # 'test_priority', 
            # 'test_specimen_collection_volume',
            
            # 'test_specimen_type',  # mostly missing
            
            # 'test_specimen_source', # n_uniq=15971
            # 'test_relevant_clinical_information', # n_uniq=26/
            
            'test_cpt_code',    # n_uniq=655
            
            # 'parent_test_order_code', # n_uniq=5088
            # 'parent_test_order_name', # high missing
            
            # --- datetime ---
            # 'test_specimen_draw_datetime',  # e.g. '2019-08-07T14:47:00.000Z'
            # 'test_specimen_receipt_datetime', #  e.g. '2016-10-06T10:54:00.000Z'
            
            # 'test_specimen_analysis_datetime', # high missing
            # 'test_observation_datetime', 
            
            # 'test_observation_reported_datetime', 
            
            'panel_order_code',  # n_uniq=18018
            'panel_order_name',  # n_uniq=11663
            
            # 'parent_panel_order_code', # high missing
            # 'parent_panel_order_name', # high missing
            
            # 'datetime_of_processing',  # no year e.g. 'Jun 29 14:44:25'
            
            # 'meta_ingestion_datetime',
            
            'meta_sender_name',  #  n_uniq=7, m=0 # <<< 
            'medivo_test_result_type',  # n_uniq=3493, <<<<
        
            ]

# Continuous features.
cont_cols = ['age',   # patient_gender -> age  # <<< 
     ]  

# Prediction target.
target_cols = ['test_result_loinc_code', ]

# Columns created by preprocessing rather than present in the raw data.
derived_cols = ['count',  # due to resolve_duplicate()
               ]

# cardinality < 100
# NOTE(review): some columns annotated above with n_uniq < 100 (e.g. patient_bill_type)
# are not listed here and therefore land in high_card_cols -- confirm this is intended.
low_card_cols = ['patient_gender', 'fasting', 'meta_sender_name' ]
# Keep cat_cols order. The previous list(set(...) - set(...)) produced an order that
# depends on string hashing and can change from one kernel session to the next.
high_card_cols = [c for c in cat_cols if c not in low_card_cols]

# Columns printed side by side as a human-readable view of a record.
representative_cols = ["meta_sender_name",
      # "receiving_organization_id",
      # "test_order_code",
      "test_order_name",
      # "test_result_code",
      "test_result_name",
      "test_result_loinc_code",
      "test_result_units_of_measure"]

target_columns = cat_cols + cont_cols + target_cols

# feature transformation
#####################################################
# to_age(ts0)
# values = sample_col_values(ts0, col='age', n=10)
# print("> age: {}".format(values))

### Balance Classes 

Note: classes are balanced by drawing additional samples for under-represented codes from an external dataset.

In [7]:
from analyzer import balance_data_incr, load_data_incr, stratify
from transformer import resolve_duplicate
import loinc as lc

# run here or just load the curated data generated by analyzer.t_stratify()
tLoad = False   # True: load a previously saved balanced dataset instead of rebuilding it
tSave = True    # True: write the rebuilt balanced dataset to CSV

ts = ts0
ds = stratify(ts, col='test_result_loinc_code', ascending=False)
print("(balance_classes) data size distribution:\n{}\n".format(ds[:25]))
# ignore placeholder labels when choosing the baseline class size
ds = [(code, size) for code, size in ds if code not in lc.LoincTSet.non_codes]
max_size = ds[0][1] # ds[2][1]
# every real code smaller than the largest class is a candidate for enrichment
codes_low_sz = set([code for code, size in ds if size < max_size])
print("(balance_classes) We have n={} with low sample size (< {})".format(len(codes_low_sz), max_size))

if tLoad: 
    input_file=f"andromeda-pond-{cohort}-balanced.csv"
    ts = load_data(input_file=input_file, warn_bad_lines=False)
    # ts = ts.drop_duplicates(keep='last')  # drop duplicates 
    # ts = lc.canonicalize(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
    assert lc.is_canonicalized(ts, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
    
else: 
    # max_size = 1000
    input_file = f"andromeda-pond-{cohort}-loinc.csv"
    codes = set(loinc_set)
    codes_hit = set([])   # codes that received extra samples from at least one chunk
    # The external file is read in chunks; each chunk is cleaned the same way as the
    # base data and then offered to balance_data_incr().
    for i, tsi in enumerate(load_data_incr(input_file=input_file, chunksize=1000000, warn_bad_lines=False)): 
        N0 = ts.shape[0]
        # tsi = tsi.drop_duplicates(keep='last')  # drop duplicates 
        tsi = resolve_duplicate(tsi)
        tsi = lc.canonicalize(tsi, col_target=col_target, token_missing=token_default, target_labels=loinc_set)
        print("[{}] Processing chunk #{} | n(ts): {}, n(tsi): {} ...".format(i, i+1, N0, tsi.shape[0]))

        ts_incr, hit, missed = balance_data_incr(df=ts, df_extern=tsi, n_samples=max_size, col=col_target)
        # NOTE(review): no ignore_index here, so index labels may repeat in ts -- confirm downstream code does not rely on a unique index
        if not ts_incr.empty: ts = pd.concat([ts, ts_incr])
            
        # --- analysis --- 
        N = ts.shape[0]
        # set.union() returns a new set and leaves codes_hit untouched; the original call
        # discarded that result, so codes_missed below always equalled codes_low_sz
        codes_hit.update(hit)

        print(f"[{i}] size: {N0} -> {N}")
        ds = stratify(ts, col='test_result_loinc_code', ascending=False)
        ds = [(code, size) for code, size in ds if size < max_size]
        print("[{}] size(codes) < {} (n={}): \n{}\n".format(i, max_size, len(ds), ds[:20]))

    # low-sample-size codes that never received extra samples from any chunk
    codes_missed = codes_low_sz - codes_hit  
    print("(t_stratify) At last, we could not find a match for n={} codes among nl={} low-sample-size labels:\n{}\n".format(
        len(codes_missed), len(codes_low_sz), codes_missed))  
    
    # down sample control data (very often the negative examples are too huge)

    if tSave: 
        
        # should not have duplicates at this point (due to resolve_duplicate() call)
        # ts = ts.drop_duplicates(keep='last')  # drop duplicates 
        
        output_file = f"andromeda-pond-{cohort}-balanced.csv"
        save_data(ts, output_file=output_file)

"""
Output
------
   ts: balanced training data; well... as balaned as possible because external data may not be able to supply sufficient 
       data for a subset of the classes/loincs
"""

(balance_classes) data size distribution:
[('67686', 5872), ('17426', 4647), ('17517', 4602), ('19752', 4562), ('21600', 4379), ('unknown', 4267), ('178616', 4001), ('7773', 3938), ('7187', 3917), ('486431', 3861), ('30973', 3851), ('19208', 3726), ('66902', 3405), ('28233', 3142), ('7310', 3104), ('7112', 2848), ('7138', 2827), ('110114', 2566), ('7708', 2503), ('7518', 2471), ('45443', 2452), ('20859', 2373), ('25718', 2372), ('20933', 2323), ('134577', 2262)]

(balance_classes) We have n=731 with low sample size (< 5872)
(resolve_duplicate) n0: 1000000 =?= n1: 1000000
(resolve_duplicate) Found 840 duplicate rows wrt cols: ['test_result_loinc_code']
(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[0] Processing chunk #1 | n(ts): 139420, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5872
(balance_data_incr) Found n=734 unique codes from source | nc=476 unique code

(resolve_duplicate) n0: 1000000 =?= n1: 1000000
(resolve_duplicate) Found 804 duplicate rows wrt cols: ['test_result_loinc_code']
(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[3] Processing chunk #4 | n(ts): 629564, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5872
(balance_data_incr) Found n=734 unique codes from source | nc=482 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 482 common codes from the df_extern:
['30130', '823799', '384834', '52472', '594192', '7021', '304469', '130682', '58115', '45484', '191395', '7898', '40865', '7120', '33902', '63313', '7328', '62372', '557520', '31427']

(balance_data_incr) Added n=9 cases to code=122861
(balance_data_incr) Added n=1 cases to code=143149
(balance_data_incr) Added n=112 cases to code=205070
(balance_data_incr) Added n=5 cases to code=23366
(balance_data_incr) Added

(balance_data_incr) Added n=8 cases to code=122861
(balance_data_incr) Added n=1 cases to code=143149
(balance_data_incr) Added n=28 cases to code=17442
(balance_data_incr) Added n=330 cases to code=206243
(balance_data_incr) Added n=183 cases to code=237610
(balance_data_incr) Added n=20 cases to code=264507
(balance_data_incr) Added n=27 cases to code=304337
(balance_data_incr) Added n=442 cases to code=327312
(balance_data_incr) Added n=4 cases to code=381772
(balance_data_incr) Added n=49 cases to code=45765
(balance_data_incr) Added n=5 cases to code=51268
(balance_data_incr) Added n=284 cases to code=57703
(balance_data_incr) Added n=22 cases to code=62372
(balance_data_incr) Added n=257 cases to code=7146
(balance_data_incr) Added n=1 cases to code=772020
(balance_data_incr) Added n=1 cases to code=82206
(balance_data_incr) Added n=474 cases in total | n_miss:258 ... #
... missed (n=258):
['451765', '115659', '384834', '110064', '744441', '424838', '487967', '426171', '303842', 

(balance_data) N_extra: 34558
[9] size: 949211 -> 983769
[9] size(codes) < 5872 (n=626): 
[('28688', 5867), ('182626', 5700), ('303768', 5651), ('178566', 5169), ('7328', 5135), ('622910', 5122), ('339440', 5052), ('16493', 4878), ('81232', 4835), ('24729', 4770), ('21626', 4768), ('7641', 4629), ('244673', 4532), ('191130', 4517), ('265074', 4501), ('327312', 4430), ('514356', 4416), ('unknown', 4267), ('130682', 4132), ('21329', 4039)]

(resolve_duplicate) n0: 1000000 =?= n1: 1000000
(resolve_duplicate) Found 780 duplicate rows wrt cols: ['test_result_loinc_code']
(canonicalize) Operations> fillna, dehyphenate, replace_values, trim_tail, fill_others
(canonicalize) Focus only on target labels (n=733), labeling the rest as other
[10] Processing chunk #11 | n(ts): 983769, n(tsi): 1000000 ...
(balance_data_incr) n_baseline=5872
(balance_data_incr) Found n=734 unique codes from source | nc=476 unique codes from external
... found 0=?=0 extra codes from the df_extern:
[]

... found 476 com

(balance_data_incr) Added n=9 cases to code=122861
(balance_data_incr) Added n=10 cases to code=143081
(balance_data_incr) Added n=108 cases to code=205070
(balance_data_incr) Added n=139 cases to code=28712
(balance_data_incr) Added n=61 cases to code=322156
(balance_data_incr) Added n=1 cases to code=351684
(balance_data_incr) Added n=85 cases to code=437277
(balance_data_incr) Added n=4 cases to code=490243
(balance_data_incr) Added n=52 cases to code=54031
(balance_data_incr) Added n=6 cases to code=602797
(balance_data_incr) Added n=17 cases to code=7021
(balance_data_incr) Added n=158 cases to code=81224
(balance_data_incr) Added n=1 cases to code=96644
(balance_data_incr) Added n=478 cases in total | n_miss:254 ... #
... missed (n=254):
['451765', '115659', '384834', '744441', '487967', '303842', '397786', '530170', '204537', '151522', '305524', '111567', '422410', '505628', '142364', '30741', '345355', '320184', '461285', '67702']

... hit (n=478):
['30130', '823799', '52472', 

(save_data) Saved dataframe (dim=(1123499, 128)) to:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c-balanced.csv



'\nOutput\n------\n   ts: balanced training data; well... as balaned as possible because external data may not be able to supply sufficient \n       data for a subset of the classes/loincs\n'

### Feature Transformation

note: patient_date_of_birth => age

In [8]:
from transformer import to_age
from analyzer import col_values
# from loinc import FeatureSet

# The return value is discarded and 'age' is read from ts right after, so to_age
# presumably adds the 'age' column to ts in place -- verify against transformer.to_age.
to_age(ts)
# NOTE(review): col_values here is the one imported from analyzer in this cell, which
# shadows the notebook-level helper of the same name defined earlier.
values = col_values(ts, col='age', n=10)
print("> age: {}".format(values))

# resolve_duplicate() call adds a new column: count (of duplicates)
assert 'count' in ts.columns

# datetime columns

> age: [60 34 21 84 69 78 45 56 43 24]


### Subset Features and Handle Missing Values

In [9]:
tCategorify = False
tDropHighMissing = False # drop columns with high rate of missing values
p_null = 0.9   # a column is dropped when its fraction of nulls is >= p_null
token_default = token_missing = 'unknown'

df = ts # ... :)

# V = list(feature_lookup.keys())
V = cont_cols + cat_cols + derived_cols   # feature columns
L = target_cols                           # label column(s)
dfX = df[V]
dfy = df[L]

print("> Given features set:\n{}\n".format(V))

# every row must have a label
assert np.sum(dfy[target_cols[0]].isnull()) == 0

# drop columns/vars with too many missing values 
N = dfX.shape[0]
n_thresh = int(N * p_null)
nf0 = nf = dfX.shape[1]
fset0 = set(dfX.columns.values)

if tDropHighMissing: 
    dfX = dfX[dfX.columns[dfX.isnull().mean() < p_null]]
    fset = set(dfX.columns.values)
    nf = dfX.shape[1]
    # nf0 - nf: count of removed columns (the original printed nf - nf0, a negative number)
    print("> Dropped n={} features:\n{}\n".format(nf0-nf, fset0-fset))
    
fset = set(dfX.columns.values)
print("> Final feature set (nf={}):\n{}\n".format(nf, fset))

# fill in missing values (also see default_values)
# Assignment instead of inplace=True: dfX is a column subset of df, and in-place
# modification of such a frame triggers pandas' SettingWithCopyWarning.
# NOTE(review): the string token is also written into the numeric columns ('age', 'count')
# wherever they are null -- confirm that this is intended before scaling.
dfX = dfX.fillna(value=token_default)
#################################################
# Keep only the feature names that survived the column filter above.

cat_cols = [cat for cat in cat_cols if cat in dfX.columns]
cont_cols = [c for c in cont_cols if c in dfX.columns]

"""
Output
------
   dfX 
   dfy
"""

> Given features set:
['age', 'patient_gender', 'patient_state', 'patient_bill_type', 'fasting', 'performing_organization_id', 'receiving_organization_id', 'test_result_status', 'test_order_code', 'test_order_name', 'test_result_code', 'test_result_name', 'test_result_value', 'test_result_range', 'test_result_abnormal_flag', 'test_result_reference_range', 'test_result_units_of_measure', 'test_result_comments', 'test_cpt_code', 'panel_order_code', 'panel_order_name', 'meta_sender_name', 'medivo_test_result_type', 'count']

> Final feature set (nf=24):
{'test_result_status', 'count', 'test_result_comments', 'test_result_reference_range', 'test_result_value', 'panel_order_name', 'test_result_name', 'test_result_range', 'receiving_organization_id', 'test_order_code', 'panel_order_code', 'meta_sender_name', 'patient_gender', 'patient_state', 'medivo_test_result_type', 'test_result_abnormal_flag', 'fasting', 'performing_organization_id', 'test_order_name', 'test_result_units_of_measure', 'te

'\nOutput\n------\n   dfX \n   dfy\n'

### Encode Variables

In [10]:
from transformer import encode_vars 
from loinc import FeatureSet
# high_card_cols = FeatureSet.high_card_cols
# number of feature columns before encoding, for the before/after report below
nf0 = dfX.shape[1]
# Encode the categorical columns; high_card_cols tells encode_vars which of them are
# high-cardinality (presumably handled by a different encoder -- see transformer.encode_vars).
dfX = encode_vars(dfX, fset=cat_cols, high_card_cols=high_card_cols)
print("> After variable encoding we have dim(dfX): {} | nf: {} -> {}".format(dfX.shape, nf0, dfX.shape[1]))
print("> New feature set:\n{}\n".format(dfX.columns))
# show a few un-encoded records (from df, not dfX) for reference
print(df[representative_cols].head(10).to_string(index=False))

(encoder_vars) low card vars (n=['fasting', 'meta_sender_name', 'patient_gender']):
3
 ... high card vars (n=['test_result_status', 'patient_state', 'medivo_test_result_type', 'test_result_abnormal_flag', 'test_result_comments', 'test_result_reference_range', 'performing_organization_id', 'test_order_name', 'test_result_units_of_measure', 'test_result_value', 'test_result_code', 'panel_order_name', 'test_result_name', 'test_result_range', 'receiving_organization_id', 'test_order_code', 'patient_bill_type', 'panel_order_code', 'test_cpt_code']):
19

... transforming var: patient_gender ...
... transforming var: patient_state ...
... transforming var: patient_bill_type ...
... transforming var: fasting ...
... transforming var: performing_organization_id ...
... transforming var: receiving_organization_id ...
... transforming var: test_result_status ...
... transforming var: test_order_code ...
... transforming var: test_order_name ...
... transforming var: test_result_code ...
... trans

### Encode Labels 

In [11]:
from analyzer import encode_labels, summarize_dict, get_sample_sizes
import collections, operator

# verify: features and labels must have the same number of rows
assert dfX.shape[0] == dfy.shape[0], "> dim(dfX): {} | dfy.cols: {}".format(dfX.shape, dfy.columns.values)

# label name -> integer class; also read by the model-training cell
codebook={'pos': 1, 'neg': 0, '+': 1, '-': 0}

# choose the one with a large sample size as 'positive'
col_label = 'test_result_loinc_code' # strings

topn = 5
sizes = get_sample_sizes(dfy[col_label])
# ... sizes: (loinc) label -> sample size
# (used below via .items() and .most_common(), so presumably a collections.Counter)
# print("> n(sizes): {}".format(len(sizes)))  # 734 for cohort='hepatitis-c'

# Q: How many classes/codes have less than N instances? 
N_low = 1000
n_low = sum(1 for l, c in sizes.items() if c < N_low)
print("> Low sample size classes | n={} (< {})".format(n_low, N_low))

N_elow = 10
n_elow = sum(1 for l, c in sizes.items() if c < N_elow)
print("> Extreme low sample size classes | n={} (< {})".format(n_elow, N_elow))

# Q: How many classes/codes were able to match the most enriched class in terms of sample size (e.g. 5707)? 
# max_size is set in the class-balancing cell; eps is the tolerance for "matched"
eps = 10
n_matched = sum(1 for l, c in sizes.items() if c >= max_size-eps)
print("> n_matched: {} | max_size: {}".format(n_matched, max_size))

# sort by values
# NOTE(review): sizes_sorted is not read anywhere else in this cell
sizes_sorted = sorted(sizes.items(), key=operator.itemgetter(1))
summarize_dict(sizes, topn=15, sort_=True)

print("> sizes: {}".format(sizes.most_common(20)))
most_sample_sizes = sizes.most_common(topn)  # take(topn, sizes.items())
print("> Sample sizes | Top N={} codes:\n{}\n".format(topn, most_sample_sizes))
# reversed slice of the full ranking: the topn smallest classes, smallest first
least_sample_sizes = sizes.most_common()[:-topn-1:-1]
print("> Sample sizss | Last N={} codes:\n{}\n".format(topn, least_sample_sizes))

# test: binary-encode the labels with the largest class as the positive label
target = most_sample_sizes[0][0]
y = encode_labels(dfy, pos_label=target, codebook=codebook, verbose=1)

> Low sample size classes | n=489 (< 1000)
> Extreme low sample size classes | n=176 (< 10)
> n_matched: 128 | max_size: 5872
[356188] -> 1
[206649] -> 1
[223123] -> 1
[156679] -> 1
[530170] -> 1
[264663] -> 1
[154120] -> 1
[162511] -> 1
[196618] -> 1
[302505] -> 1
[724864] -> 1
[634642] -> 1
[336677] -> 1
[161265] -> 1
[451765] -> 1
> sizes: [('17426', 5872), ('19752', 5872), ('7047', 5872), ('30973', 5872), ('28233', 5872), ('7369', 5872), ('7898', 5872), ('21600', 5872), ('7708', 5872), ('20933', 5872), ('339143', 5872), ('67686', 5872), ('20859', 5872), ('7773', 5872), ('7112', 5872), ('7187', 5872), ('7138', 5872), ('24984', 5872), ('28571', 5872), ('66902', 5872)]
> Sample sizes | Top N=5 codes:
[('17426', 5872), ('19752', 5872), ('7047', 5872), ('30973', 5872), ('28233', 5872)]

> Sample sizss | Last N=5 codes:
[('206060', 1), ('162339', 1), ('64204', 1), ('80945', 1), ('162057', 1)]

(encode_labels) sample size: Counter({0: 1117627, 1: 5872})


## Initial Model Training

### 1. Feature Selection

In [12]:
"""
Ref
---
1. pip install feature-selector

   https://github.com/WillKoehrsen/feature-selector
   
   possible dependency 
      brew install libomp
      
   <debug> 
       + RuntimeError: Python is not installed as a framework.
          > https://stackoverflow.com/questions/34977388/matplotlib-runtimeerror-python-is-not-installed-as-a-framework
   
"""
import feature_selector 

### 2. Save a copy of encoded training set (optional)

In [13]:
# Toggle: write a CSV snapshot of the training data under the "-encoded" file name.
tSaveEncodedTSet = True 

if tSaveEncodedTSet: 
    output_file = f"andromeda-pond-{cohort}-encoded.csv"
    # NOTE(review): this saves `ts` (the balanced frame), not the encoded feature
    # matrix `dfX` / labels `dfy` -- confirm which one the "-encoded" file should hold.
    save_data(ts, output_file=output_file)
    

(save_data) Saved dataframe (dim=(1123499, 129)) to:
/Users/barnett/Documents/work/loinc_predictor/data/andromeda-pond-hepatitis-c-encoded.csv



### 3. Model Training

In [14]:
import utils_tree, utils_sys, analyzer
import collections
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from analyzer import balance_by_downsampling

# data transformation
col = 'test_result_loinc_code'
X, y = dfX.values, dfy[col].values
print("> dim(X): {}, sample(y): {}".format(X.shape, np.random.choice(np.unique(y),20) ))

# feature scaling
scaler = MinMaxScaler() # MinMaxScaler(), StandardScaler()
X = scaler.fit_transform(X)

n_fold = 5
n_min = n_fold   # need at least one positive example per CV fold

# to save performance data
header = ['code', 'mean', 'std', 'n_pos']
sdict = {h:[] for h in header}
# One-vs-rest evaluation: every LOINC code in turn is treated as the positive class.
for code in loinc_set: 
    y_eff = analyzer.encode_labels(y, pos_label=code)
    
    counter = collections.Counter(y_eff)
    n_pos, n_neg = counter[codebook['pos']], counter[codebook['neg']]
    print("> sample size | n(+): {}, n(-): {} | dim(y_encoded): {}".format(n_pos, n_neg, y_eff.shape))
    
    if n_pos >= n_min: 
        # downsampling majority classes (very often we have too many negative instances)
        X_eff, y_eff = balance_by_downsampling(X, y_eff, method='multiple', majority_max=3)
        # Recount from the downsampled labels. The original re-read the stale `counter`
        # and therefore printed the pre-downsampling sizes again. n_pos is deliberately
        # left untouched so the saved 'n_pos' column keeps its original meaning.
        counter_eff = collections.Counter(y_eff)
        n_pos_eff, n_neg_eff = counter_eff[codebook['pos']], counter_eff[codebook['neg']]
        print(f"> sample size (after downsampling majority) | n(+): {n_pos_eff}, n(-): {n_neg_eff}")
        
        scores = analyzer.eval_performance(X_eff, y_eff, model=None, cv=n_fold, random_state=53, verbose=1)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print("> average: {}, std: {}".format(mean_score, std_score))
    else: 
        print("> (positive) sample size too small, n={}".format(n_pos))
        # -1 marks "not evaluated"
        mean_score = -1 
        std_score = -1
    sdict['code'].append(code)
    sdict['mean'].append(mean_score)
    sdict['std'].append(std_score)
    sdict['n_pos'].append(n_pos)

# --------------------------------------------------
# save performance dataframe
df_perf = DataFrame(sdict, columns=header)
df_perf = df_perf.sort_values(by=['mean', ]) # ascending=False
# Pass the performance table; the original passed `df`, which is the training frame.
# NOTE(review): `save_performnace` is spelled as in the original call -- confirm it matches the name defined in analyzer.
analyzer.save_performnace(df_perf, output_dir='result') # output_file/'' (performance-<cohort>.csv)

# second copy under an explicit file name, '|'-separated
cohort = 'hepatitis-c'
output_dir = os.path.join(os.getcwd(), 'result')
output_file = f"performance-{cohort}-2.csv" 
output_path = os.path.join(output_dir, output_file)
df_perf.to_csv(output_path, sep='|', index=False, header=True)

for code, score in zip(df_perf['code'], df_perf['mean']):
    print(f"[{code}] -> {score}")

> dim(X): (1123499, 243), sample(y): ['28233' '28571' '326231' '45443' '7856' '60855' '51797' '10058' '28571'
 '162297' '7138' '728600' '30163' '264531' '311472' '204081' '542183'
 '303503' '139808' '33944']
(encode_labels) sample size: Counter({0: 1117627, 1: 5872})
> sample size | n(+): 5872, n(-): 1117627 | dim(y_encoded): (1123499,)
balance_by_downsampling) nl: 2, labels: [0 1] | nf=243
... After merging (X, y) => dim(X): (1123499, 244)


KeyError: 244

### Visualize Results

In [None]:
"""

Memo
---- 
1. performance plot

   perplot: https://pypi.org/project/perfplot/
"""
import seaborn as sns
import matplotlib.pyplot as plt
from analyzer import load_performance

sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 20))
sns.set_color_codes("pastel")

#---------------------------------------------

# load performance data
cohort = 'hepatitis-c'
df_perf = load_performance(input_dir='result', cohort=cohort)
print("> dim(performance matrix): {}".format(df_perf.shape))

# sort ~ performance scores 
# df_perf = df_perf.sort_values(by=['mean', ], ascending=False)

header = ['code', 'mean', 'std', 'n_pos']
codes = df_perf['code']
n_codes = len(codes)
scores = df_perf['mean']

# some statistics
score_high = 0.90
score_low = 0.50

# a negative mean marks codes that were not evaluated (sample size too low)
codes_low_sz = df_perf.loc[df_perf['mean'] < 0]['code']
codes_scored = df_perf.loc[df_perf['mean'] >= 0]['code']
codes_high_score = df_perf.loc[df_perf['mean'] >= score_high]['code']
# NOTE(review): this fails if any 'mean' is NaN, since NaN satisfies neither comparison
assert n_codes == len(codes_low_sz) + len(codes_scored)

print("1. Total number of codes: {} | n(low_sample): {}, n(scored):{}, n(high scored):{}".format(n_codes, 
   len(codes_low_sz), len(codes_scored), len(codes_high_score)))
r_scored = len(codes_scored)/(n_codes+0.0)
rh = len(codes_high_score)/(n_codes+0.0)
print("2. Fraction of scored codes: {}".format(r_scored))
print("3. Fraction of highly scored codes: {}".format(rh))

# Effective performance dataframe, ruling out those codes without scores (due to low sample sizes)
df_eff = df_perf.loc[df_perf['mean'] >= 0.0]

n_offset = 25
df_topn = df_eff.sort_values(['mean', ], ascending=False).head(n_offset)
df_botn = df_eff.sort_values(['mean', ], ascending=True).head(n_offset)

# top n + bottom n
# With fewer than 2*n_offset scored codes the two slices overlap; drop the repeated
# rows so that no code is drawn twice in the bar plot.
dfe = pd.concat([df_topn, df_botn], ignore_index=True).drop_duplicates()
dfe = dfe.sort_values(by=['mean', ], ascending=False)
codes = [str(c) for c in dfe['code'].values]
scores = dfe['mean'].values
print(dfe)

# Draw on the axes created above rather than on whichever axes happen to be current.
sns.barplot(x='mean', y='code', data=dfe, order=dfe['code'], # order has to be specified; even if already sorted!!!
            label="LOINC", color="b", orient='h', ax=ax)

# ax = sns.barplot(x='mean', y='code', data=df)

# ax.set_xlabel('Fmax Score')
# ax.set_ylabel('LOINC')
