© 2018 Institute for Clinical Evaluative Sciences. All rights reserved.

TERMS OF USE:
##Not for distribution.## This code and data is provided to the user solely for its own non-commercial use by individuals and/or not-for-profit corporations. User shall not distribute without express written permission from the Institute for Clinical Evaluative Sciences.

##Not-for-profit.## This code and data may not be used in connection with profit generating activities.

##No liability.## The Institute for Clinical Evaluative Sciences makes no warranty or representation regarding the fitness, quality or reliability of this code and data.

##No Support.## The Institute for Clinical Evaluative Sciences will not provide any technological, educational or informational support in connection with the use of this code and data.

##Warning.## By receiving this code and data, user accepts these terms, and uses the code and data, solely at its own risk.

In [None]:
# Notebook setup (IPython magics).
# Move two directories up before the imports below run -- presumably to the
# project root so that the `src.*` imports resolve; confirm against the repo layout.
%cd ../../
# Enable autoreload so edited modules are re-imported before each cell executes
# (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2

In [None]:
import json
import pandas as pd
import plotly.graph_objects as go

from src.config import (
    root_path, can_folder, split_date, 
    DATE,
    SCr_rise_threshold, SCr_rise_threshold2
)
from src.utility import numpy_ffill
from src.prep_data import PrepDataCAN

In [None]:
# Directory of this project; the CSV and PNG outputs written further down
# are placed under it.
main_dir = '{}/projects/{}'.format(root_path, can_folder)

# Data-preparation helper configured for the 'ckd' adverse event, and the
# data it loads (used below as the cohort before exclusions).
prep = PrepDataCAN(adverse_event='ckd')
chemo_df = prep.load_data()

# Cohort Numbers Before and After Exclusions

In [None]:
# Cohort AFTER exclusions (treatments without one or more of the
# baseline/target blood values are excluded), split at split_date.
model_data = prep.get_data()
dev_cohort, test_cohort = prep.create_cohort(model_data, split_date, verbose=False)

# Cohort BEFORE exclusions: a patient's sessions all go to the development
# side when that patient's earliest DATE is on or before split_date, and to
# the testing side otherwise.
first_visit_date = chemo_df.groupby('ikn')[DATE].min()
started_by_split = chemo_df['ikn'].map(first_visit_date) <= split_date
dev_cohort2 = chemo_df[started_by_split]
test_cohort2 = chemo_df[~started_by_split]

In [None]:
def show(x):
    """Describe a cohort by its number of rows (sessions) and unique 'ikn' values (patients)."""
    return f"NSessions={len(x)}. NPatients={x['ikn'].nunique()}"


# Each split maps to (cohort after exclusions, cohort before exclusions).
cohorts = {
    'Development': (dev_cohort, dev_cohort2),
    'Testing': (test_cohort, test_cohort2),
}
for name, (post_exc_cohort, pre_exc_cohort) in cohorts.items():
    print(
        f'{name} cohort',
        f'Before exclusions: {show(pre_exc_cohort)}',
        f'After exclusions: {show(post_exc_cohort)}\n',
        sep='\n',
    )

# Create Alluvial Plot

In [None]:
def get_combined_ckd_and_aki_data():
    """Build the union of the CKD and AKI cohorts.

    For each adverse event ('ckd', then 'aki') a fresh ``PrepDataCAN`` is
    created, the event-specific columns are selected from ``get_data()`` and
    the 'visit_date' / 'first_visit_date' columns of its ``event_dates`` are
    appended column-wise.

    The two frames are then outer-aligned; values missing on the AKI side are
    filled from the CKD side, the result is cast back to the dtypes of the
    aligned AKI frame and sorted by ('ikn', 'visit_date').

    Returns:
        pandas.DataFrame: the combined data.
    """
    date_cols = ['visit_date', 'first_visit_date']
    event_cols = {
        'ckd': ['ikn', 'baseline_eGFR', 'next_eGFR', 'next_SCr_value', 'dialysis'],
        'aki': ['ikn', 'baseline_eGFR', 'SCr_peak', 'SCr_rise', 'SCr_fold_increase'],
    }

    # load each cohort (ckd first, then aki) together with its event dates
    cohort = {}
    for event, cols in event_cols.items():
        prep_event = PrepDataCAN(adverse_event=event)
        event_data = prep_event.get_data()
        cohort[event] = pd.concat(
            [event_data[cols], prep_event.event_dates[date_cols]], axis=1
        )

    # get the union of aki and ckd cohort
    # WARNING: 2% of first visit dates differ between ckd and aki cohorts
    # TODO: fix this issue
    aki_aligned, ckd_aligned = cohort['aki'].align(cohort['ckd'], join='outer')
    combined = aki_aligned.fillna(ckd_aligned).astype(aki_aligned.dtypes)

    return combined.sort_values(['ikn', 'visit_date'])

def compute_ckd_stages(df):
    """Label every row with its pre- and post-treatment CKD stage.

    Two columns are added to ``df`` in place:
    'Pre-treatment CKD' (derived from 'baseline_eGFR') and
    'Post-treatment CKD' (derived from 'next_eGFR').

    Stage bins (lower bound inclusive, upper bound exclusive):
        eGFR >= 60        -> '1-2'
        45 <= eGFR < 60   -> '3a'
        30 <= eGFR < 45   -> '3b'
        eGFR < 30         -> '4-5'
        eGFR missing      -> 'No Information'

    An eGFR of exactly 60 belongs to '1-2'; the bins are contiguous, so
    every row receives a label.

    Args:
        df: DataFrame with 'baseline_eGFR' and 'next_eGFR' columns.

    Returns:
        The same DataFrame object, with the two stage columns set.
    """
    name_map = {
        'Pre-treatment CKD': 'baseline_eGFR',
        'Post-treatment CKD': 'next_eGFR',
    }
    for name, eGFR_col in name_map.items():
        eGFR = df[eGFR_col]

        # Missing values fail every comparison below and keep this default.
        df[name] = 'No Information'

        # `>=` (not `>`): otherwise eGFR == 60 falls between the '1-2' and
        # '3a' bins and is left unlabelled.
        df.loc[eGFR >= 60, name] = '1-2'
        df.loc[(eGFR >= 45) & (eGFR < 60), name] = '3a'
        df.loc[(eGFR >= 30) & (eGFR < 45), name] = '3b'
        df.loc[eGFR < 30, name] = '4-5'

    return df

def compute_worst_aki_stages(df):
    """Return the worst (highest) AKI stage observed for each patient.

    Every row of ``df`` (one treatment session) is staged from 'SCr_rise'
    and 'SCr_fold_increase'. The rules are applied in order, so a later
    rule overrides an earlier one:

        0   rise <  SCr_rise_threshold  and fold <  1.5
        1   rise >= SCr_rise_threshold  or  fold >= 1.5
        2   fold >= 2
        3   rise >= SCr_rise_threshold2 or  fold >= 3
        -1  'SCr_peak' is missing (no information for that session)

    Args:
        df: DataFrame with 'ikn', 'SCr_peak', 'SCr_rise' and
            'SCr_fold_increase' columns.

    Returns:
        pandas.Series indexed by 'ikn' holding the maximum stage per
        patient, with -1 replaced by the string 'No Information'.

    Raises:
        AssertionError: if any session matches none of the rules.
    """
    rise = df['SCr_rise']
    fold = df['SCr_fold_increase']

    # (stage, sessions it applies to) -- order matters, later entries win
    stage_rules = [
        (0, (rise < SCr_rise_threshold) & (fold < 1.5)),
        (1, (rise >= SCr_rise_threshold) | (fold >= 1.5)),
        (2, fold >= 2),
        (3, (rise >= SCr_rise_threshold2) | (fold >= 3)),
        (-1, df['SCr_peak'].isnull()),
    ]

    # compute aki stages for each treatment session
    session_stage = pd.Series(index=df.index, dtype=int)
    for stage, applies in stage_rules:
        session_stage[applies] = stage

    assert session_stage.notnull().all()

    # compute the worst aki stages for each patient; -1 only survives the
    # max when none of the patient's sessions had any information
    per_patient_worst = session_stage.groupby(df['ikn']).max()
    return per_patient_worst.replace(-1, 'No Information')

In [None]:
# Union of the AKI and CKD cohorts, sorted by patient ('ikn') and visit date.
df = get_combined_ckd_and_aki_data()
# Overwrite the event dates held by `prep` with those of the combined data
# before splitting -- presumably create_cohort reads prep.event_dates and they
# must match the rows of `df`; confirm against PrepDataCAN.
prep.event_dates = df[['visit_date', 'first_visit_date']] # hotfix
dev, test = prep.create_cohort(df, split_date, verbose=False)
# 'All' is deliberately last: the cells below reuse the `result` left over
# from the final iteration over this dict.
cohorts = {'Development': dev, 'Testing': test, 'All': df}

In [None]:
# For every cohort, tabulate how many patients fall into each combination of
# (pre-treatment CKD stage, post-treatment CKD stage, worst AKI stage) and
# write the table to CSV, once raw and once with small counts masked.
# NOTE: `result` from the LAST iteration ('All') is reused by the cells below.
for cohort, data in cohorts.items():
    # compute pre-treatment and post-treatment CKD stages
    # NOTE(review): GroupBy.first() returns the first NON-NULL value of each
    # column per patient, not necessarily the values of the first row.
    result = data.groupby('ikn').first()
    result = compute_ckd_stages(result)
    # compute worst AKI stages (aligned on the 'ikn' index)
    result['Worst AKI'] = compute_worst_aki_stages(data)

    postfix = {'All': '', 'Development': '_dev', 'Testing': '_test'}[cohort]

    # Name the counts explicitly instead of pd.DataFrame(..., columns=['Freq']):
    # since pandas 2.0 value_counts() returns a Series named 'count', and the
    # DataFrame constructor would then look up a non-existent 'Freq' column.
    stage_counts = result[['Pre-treatment CKD', 'Post-treatment CKD', 'Worst AKI']].value_counts()
    count = stage_counts.rename('Freq').to_frame()
    count = count.sort_index()
    count.to_csv(f'{main_dir}/data/alluvial/alluvial_raw{postfix}.csv')

    # Mask counts below 6 (presumably small-cell suppression). Cast to object
    # first so writing a string into the integer column is an explicit choice.
    mask = count['Freq'] < 6
    count['Freq'] = count['Freq'].astype(object)
    count.loc[mask, 'Freq'] = '<6'
    count.to_csv(f'{main_dir}/data/alluvial/alluvial{postfix}.csv')

In [None]:
# populate and prepare source, target, value for alluvial plot
# Two layers of links are produced:
#   pre-treatment CKD stage -> worst AKI stage
#   worst AKI stage         -> post-treatment CKD stage
# `result` is the per-patient table left by the last iteration of the cohort
# loop above.
source, target, value = [], [], []
for pre_trt_ckd_stage, pre_trt_ckd_group in result.groupby('Pre-treatment CKD'):
    pre_trt_ckd_name = f'Pre-treatment CKD (stage{pre_trt_ckd_stage})'

    for aki_stage, aki_group in pre_trt_ckd_group.groupby('Worst AKI'):
        aki_name = f'Worst AKI (stage{aki_stage})'
        if aki_stage == 'No Information': aki_name = 'No AKI Information'
        if aki_stage == 0: aki_name = 'No AKI'
        source.append(pre_trt_ckd_name)
        target.append(aki_name)
        value.append(len(aki_group))

        for post_trt_ckd_stage, post_trt_ckd_group in aki_group.groupby('Post-treatment CKD'):
            post_trt_ckd_name = f'Post-treatment CKD (stage{post_trt_ckd_stage})'
            if post_trt_ckd_stage == 'No Information': post_trt_ckd_name = 'No CKD Information'
            source.append(aki_name)
            target.append(post_trt_ckd_name)
            value.append(len(post_trt_ckd_group))

# Unique node labels in order of first appearance. A plain set() would order
# the strings by their (per-process randomised) hash, so node indices -- and
# with them the layout of the figure -- could change from run to run.
label = list(dict.fromkeys(source + target))
label_map = {name: i for i, name in enumerate(label)}
# the plot expects node indices rather than names
source, target = [label_map[s] for s in source], [label_map[t] for t in target]

In [None]:
# Draw the alluvial (Sankey) diagram from the node labels and the indexed
# links prepared above, display it, and save it as a PNG.
sankey = go.Sankey(
    node={'label': label, 'align': 'left'},
    link={'source': source, 'target': target, 'value': value},
)
fig = go.Figure(data=[sankey], layout={'height': 800})
fig.show()
fig.write_image(f'{main_dir}/data/alluvial_v2.png')

In [None]:
# display the numbers
# Nested mapping: pre-treatment CKD stage -> worst AKI stage -> post-treatment
# CKD stage -> number of patients. The first two levels carry their own group
# size in the key as "[N=...]".
count = {}
for ckd_before, by_ckd_before in result.groupby('Pre-treatment CKD'):
    aki_breakdown = {}

    for worst_aki, by_worst_aki in by_ckd_before.groupby('Worst AKI'):
        if worst_aki == 'No Information':
            aki_label = 'No AKI Information'
        elif worst_aki == 0:
            aki_label = 'No AKI'
        else:
            aki_label = f'Worst AKI (stage{worst_aki})'

        ckd_after_breakdown = {}
        for ckd_after, by_ckd_after in by_worst_aki.groupby('Post-treatment CKD'):
            ckd_after_label = (
                'No CKD Information'
                if ckd_after == 'No Information'
                else f'Post-treatment CKD (stage{ckd_after})'
            )
            ckd_after_breakdown[ckd_after_label] = len(by_ckd_after)

        aki_breakdown[f'{aki_label} [N={len(by_worst_aki)}]'] = ckd_after_breakdown

    count[f'Pre-treatment CKD (stage{ckd_before}) [N={len(by_ckd_before)}]'] = aki_breakdown
print(json.dumps(count, indent=2))

In [None]:
# Summarise, among patients without pre-treatment CKD (stage 1-2), how many
# experienced AKI and how many of those went on to develop CKD.
count = pd.read_csv(f'{main_dir}/data/alluvial/alluvial_raw.csv')
count = count[count['Post-treatment CKD'] != 'No Information']
count = count[count['Pre-treatment CKD'] == '1-2']

# Parse the AKI stage numerically instead of comparing against the string
# '0.0': the string test (a) treats 'No Information' rows as AKI-affected and
# (b) matches nothing if read_csv infers a numeric column or the stage was
# written as '0'. Unparseable stages ('No Information') become NaN.
count = count.assign(**{'Worst AKI': pd.to_numeric(count['Worst AKI'], errors='coerce')})
# Patients whose AKI status is unknown are excluded, mirroring the exclusion
# of unknown post-treatment CKD status above.
count = count[count['Worst AKI'].notnull()]
N = count['Freq'].sum()

count = count[count['Worst AKI'] > 0]
n1 = count['Freq'].sum()
count = count[count['Post-treatment CKD'] != '1-2']
n2 = count['Freq'].sum()
# NOTE: both percentages are relative to N (all patients without
# pre-treatment CKD), not to the AKI-affected subgroup n1.
print(f'AKI affected {n1} ({n1/N*100:0.1f}%) patients without pre-treatment CKD, '
      f'of whom {n2} ({n2/N*100:0.1f}%) subsequently developed CKD.')