# $\fbox{2-2}$ レセプトの擬似生成

## 擬似生成に必要なデータの用意

In [1]:
# グラフを移動拡大縮小したい場合
%matplotlib notebook
# グラフをインラインで表示
# %matplotlib inline

import gc

import mylibs.mylib1 as mylib1  # 本書 1 章の関数群を含む自作ライブラリ
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm

# 表示する行・列の数を設定
pd.options.display.max_rows = 8
pd.options.display.max_columns = 10

# matplotlib のフォントを設定
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['mathtext.fontset'] = 'stix'

### 加入者情報データ

In [2]:
df_info = pd.read_csv('./pseudo_medical/records/excl_bp/info.csv')

df_info.shape

(4815, 7)

### 月次入院発生率

In [3]:
df_admission = pd.read_csv('./public_stats/processed/ps_admission.csv')

df_admission

Unnamed: 0,sex,alb_min,alb_max,A00,C00,...,O00,P00,Q00,R00,S00
0,M,0.0,0.0,0.000000,0.000000,...,0.0,0.015865,0.000000,0.000000,0.000000
1,M,1.0,4.0,0.000000,0.000000,...,0.0,0.000000,0.001291,0.000000,0.001291
2,M,5.0,9.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000969
3,M,10.0,14.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000937
...,...,...,...,...,...,...,...,...,...,...,...
34,F,70.0,74.0,0.000000,0.003797,...,0.0,0.000000,0.000000,0.000000,0.001266
35,F,75.0,79.0,0.000000,0.004194,...,0.0,0.000000,0.000000,0.000000,0.002796
36,F,80.0,84.0,0.000831,0.004155,...,0.0,0.000000,0.000000,0.000831,0.004155
37,F,85.0,130.0,0.000690,0.003448,...,0.0,0.000000,0.000000,0.000690,0.006206


In [4]:
less_age_75 = (df_admission.alb_max < 75)
df_admission = df_admission.loc[less_age_75]
cols = df_admission.columns[3:]
df_admission_cumrate = df_admission.copy()
df_admission_cumrate[cols] = df_admission[cols].cumsum(axis=1)

df_admission_cumrate

Unnamed: 0,sex,alb_min,alb_max,A00,C00,...,O00,P00,Q00,R00,S00
0,M,0.0,0.0,0.000000,0.000000,...,0.005288,0.021153,0.021153,0.021153,0.021153
1,M,1.0,4.0,0.000000,0.000000,...,0.002581,0.002581,0.003872,0.003872,0.005163
2,M,5.0,9.0,0.000000,0.000000,...,0.000969,0.000969,0.000969,0.000969,0.001938
3,M,10.0,14.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000937
...,...,...,...,...,...,...,...,...,...,...,...
31,F,55.0,59.0,0.000685,0.002740,...,0.005480,0.005480,0.005480,0.005480,0.006165
32,F,60.0,64.0,0.000000,0.002635,...,0.007246,0.007246,0.007246,0.007246,0.008563
33,F,65.0,69.0,0.000000,0.003053,...,0.010687,0.010687,0.010687,0.010687,0.011705
34,F,70.0,74.0,0.000000,0.003797,...,0.016453,0.016453,0.016453,0.016453,0.017718


In [5]:
df_admission_cumrate[cols[-1]].max()

0.025144669399999993

### 月次外来発生率

In [6]:
df_outpatient = pd.read_csv('./public_stats/processed/ps_outpatient.csv')

df_outpatient

Unnamed: 0,sex,alb_min,alb_max,A00,C00,...,O00,P00,Q00,R00,S00
0,M,0.0,0.0,0.068748,0.010577,...,0.0,0.052883,0.063460,0.042307,0.037018
1,M,1.0,4.0,0.101963,0.002581,...,0.0,0.006453,0.019360,0.019360,0.064533
2,M,5.0,9.0,0.092074,0.004846,...,0.0,0.000969,0.006784,0.014538,0.066875
3,M,10.0,14.0,0.051562,0.003750,...,0.0,0.000000,0.005625,0.011250,0.105936
...,...,...,...,...,...,...,...,...,...,...,...
34,F,70.0,74.0,0.045562,0.095553,...,0.0,0.000000,0.001266,0.026578,0.081631
35,F,75.0,79.0,0.044035,0.093661,...,0.0,0.000000,0.001398,0.030056,0.100651
36,F,80.0,84.0,0.044048,0.084772,...,0.0,0.000000,0.000831,0.032413,0.093083
37,F,85.0,130.0,0.035166,0.057921,...,0.0,0.000000,0.000690,0.027582,0.093777


In [7]:
less_age_75 = (df_outpatient.alb_max < 75)
df_outpatient = df_outpatient.loc[less_age_75]
cols = df_outpatient.columns[3:]

df_outpatient_cumrate = df_outpatient.copy()
df_outpatient_cumrate[cols] = df_outpatient[cols].cumsum(axis=1)

df_outpatient_cumrate

Unnamed: 0,sex,alb_min,alb_max,A00,C00,...,O00,P00,Q00,R00,S00
0,M,0.0,0.0,0.068748,0.079325,...,1.200449,1.253332,1.316792,1.359099,1.396117
1,M,1.0,4.0,0.101963,0.104544,...,1.383593,1.390046,1.409406,1.428766,1.493299
2,M,5.0,9.0,0.092074,0.096920,...,0.961445,0.962414,0.969198,0.983736,1.050611
3,M,10.0,14.0,0.051562,0.055312,...,0.562490,0.562490,0.568115,0.579365,0.685301
...,...,...,...,...,...,...,...,...,...,...,...
31,F,55.0,59.0,0.028770,0.098641,...,1.331657,1.331657,1.333712,1.356317,1.423448
32,F,60.0,64.0,0.036229,0.114614,...,1.545974,1.545974,1.548609,1.567711,1.632264
33,F,65.0,69.0,0.037659,0.124174,...,1.867189,1.867189,1.869224,1.892634,1.961337
34,F,70.0,74.0,0.045562,0.141115,...,2.420466,2.420466,2.421732,2.448310,2.529941


In [8]:
# 月次外来発生率
df_outpatient[(df_outpatient['sex']=='F') & (df_outpatient['alb_min']==55)]

Unnamed: 0,sex,alb_min,alb_max,A00,C00,...,O00,P00,Q00,R00,S00
31,F,55.0,59.0,0.02877,0.069871,...,0.0,0.0,0.002055,0.022605,0.067131


### 平均在院日数

In [9]:
df_days = pd.read_csv('./public_stats/processed/ps_days.csv')

df_days

Unnamed: 0,sex,alb_min,alb_max,A00,C00,...,O00,P00,Q00,R00,S00
0,M,0.0,0.0,4.5,9.3,...,0.0,11.3,16.1,4.0,2.6
1,M,1.0,4.0,3.8,12.1,...,0.0,38.7,7.5,3.1,1.7
2,M,5.0,9.0,7.7,16.0,...,0.0,8.2,7.6,2.9,4.1
3,M,10.0,14.0,3.9,19.2,...,0.0,5.2,16.7,4.8,7.8
...,...,...,...,...,...,...,...,...,...,...,...
34,F,70.0,74.0,18.4,15.2,...,0.0,0.0,12.0,16.9,28.4
35,F,75.0,79.0,20.1,18.4,...,0.0,0.0,123.9,38.4,34.7
36,F,80.0,84.0,28.1,21.8,...,0.0,0.0,51.9,34.3,42.6
37,F,85.0,130.0,39.1,26.9,...,0.0,0.0,30.0,31.8,47.4


In [10]:
less_age_75 = (df_days.alb_max < 75)
df_days = df_days.loc[less_age_75]

df_days

Unnamed: 0,sex,alb_min,alb_max,A00,C00,...,O00,P00,Q00,R00,S00
0,M,0.0,0.0,4.5,9.3,...,0.0,11.3,16.1,4.0,2.6
1,M,1.0,4.0,3.8,12.1,...,0.0,38.7,7.5,3.1,1.7
2,M,5.0,9.0,7.7,16.0,...,0.0,8.2,7.6,2.9,4.1
3,M,10.0,14.0,3.9,19.2,...,0.0,5.2,16.7,4.8,7.8
...,...,...,...,...,...,...,...,...,...,...,...
31,F,55.0,59.0,10.8,10.9,...,0.0,0.0,9.6,6.1,25.8
32,F,60.0,64.0,37.4,11.8,...,0.0,0.0,10.1,11.0,20.1
33,F,65.0,69.0,31.7,14.3,...,0.0,0.0,25.2,13.9,26.4
34,F,70.0,74.0,18.4,15.2,...,0.0,0.0,12.0,16.9,28.4


## 擬似生成するレセプトのフォーマット定義

## レセプトの擬似生成

### 擬似コードによるアウトライン

### 関数の定義

In [11]:
def get_info_for_i(df_info, i):
    cols = ['iid', 'sex', 'birth_ym', 'start_obs_ym', 'end_obs_ym']
    (iid, sex, birth_ym, start_obs_ym, end_obs_ym) = df_info.iloc[i][cols]
    birth_t = mylib1.ym_to_t(birth_ym)
    start_obs_t = mylib1.ym_to_t(start_obs_ym)
    end_obs_t = mylib1.ym_to_t(end_obs_ym)
    return (iid, sex, birth_t, start_obs_t, end_obs_t)


get_info_for_i(df_info, 1000)

('i001696', 'F', 2001.375, 2017.7083333333333, 2019.9583333333333)

In [12]:
def get_ss_admission_cumrate(df_admission_cumrate, sex, alb):
    cols = df_admission_cumrate.columns[3:]
    is_sex = (df_admission_cumrate.sex == sex)
    more_alb_min = (df_admission_cumrate.alb_min <= alb)
    less_alb_max = (alb <= df_admission_cumrate.alb_max)
    df = df_admission_cumrate.loc[(is_sex & more_alb_min & less_alb_max), cols]
    ss_adm_cumrate = pd.Series(df.columns, index=df.values[0])
    return ss_adm_cumrate


(sex, alb) = ('F', 57)
ss_adm_cumrate = get_ss_admission_cumrate(df_admission_cumrate, sex, alb)

ss_adm_cumrate

0.000685    A00
0.002740    C00
0.002740    D50
0.002740    E00
           ... 
0.005480    P00
0.005480    Q00
0.005480    R00
0.006165    S00
Length: 19, dtype: object

In [13]:
def get_random_num_for_adm(rs):
    np.random.seed(rs)
    u = np.random.rand()  # 一様乱数
    rs = rs + 1           # 乱数シード更新
    return (u, rs)


rs = 100
get_random_num_for_adm(rs)

(0.5434049417909654, 101)

In [14]:
def get_admission_disease(ss_adm_cumrate, u):
    ss = ss_adm_cumrate[u < ss_adm_cumrate.index]
    if len(ss) == 0:
        return (False, None)
    else:
        return (True, ss.iloc[0])


print(get_admission_disease(ss_adm_cumrate, 0.005))
print(get_admission_disease(ss_adm_cumrate, 0.006))
print(get_admission_disease(ss_adm_cumrate, 0.007))

(True, 'M00')
(True, 'S00')
(False, None)


In [15]:
def get_avg_hospdays(df_days, sex, alb, dis):
    is_sex = (df_days.sex == sex)
    more_alb_min = (df_days.alb_min <= alb)
    less_alb_max = (alb <= df_days.alb_max)
    return df_days.loc[(is_sex & more_alb_min & less_alb_max), dis].values[0]


(sex, alb, dis) = ('F', 57, 'R00')
get_avg_hospdays(df_days, sex, alb, dis)

6.1

In [16]:
import math


def get_random_days(mu, rs):
    np.random.seed(rs)
    # 指数分布乱数を整数値に切り上げ
    days = math.ceil(np.random.exponential(mu))
    rs = rs + 1  # 乱数シード更新
    return (days, rs)


(mu, rs) = (6.1, 100)
get_random_days(mu, rs)

(5, 101)

In [17]:
def make_new_rid(rid):
    i = int(rid[1:]) + 1
    return 'r' + str(i).zfill(8)


rid = 'r00000010'
make_new_rid(rid)

'r00000011'

In [18]:
def create_df_common(iid, rid, ym, receipt_type, admission_ym, days_in_month):
    cols = ['iid', 'rid', 'ym', 'receipt_type', 'admission_ym', 'days']
    values = [iid, rid, ym, receipt_type, admission_ym, days_in_month]
    return pd.DataFrame(values, cols).T


(iid, ym, receipt_type, admission_ym, days_in_month) \
    = ('i000001', '2010/02', 'impatient', '2010/01', 25)
create_df_common(iid, rid, ym, receipt_type, admission_ym, days_in_month)

Unnamed: 0,iid,rid,ym,receipt_type,admission_ym,days
0,i000001,r00000010,2010/02,impatient,2010/01,25


In [19]:
def create_df_common_inpatient_after_m_months(
        iid, rid, ym, admission_ym, days, m):
    ym = mylib1.t_to_ym(mylib1.ym_to_t(admission_ym) + m / 12)
    if m == 0:
        days_in_month = min(15, days)
    else:
        days_in_month = min(30, days - 15 - 30 * (m - 1))
    return create_df_common(iid, rid, ym, 'inpatient',
                           admission_ym, days_in_month)


create_df_common_inpatient_after_m_months(
    iid, rid, ym, admission_ym, 40, 1)

Unnamed: 0,iid,rid,ym,receipt_type,admission_ym,days
0,i000001,r00000010,2010/02,inpatient,2010/01,25


In [20]:
def add_df_xxx_on_csv(open_csv_object, df_xxx):
    if len(df_xxx) == 0:
        pass
    else:
        open_csv_object.writerow(df_xxx.values[0])

In [21]:
def create_df_disease(iid, rid, first_ym, dis):
    cols = ['iid', 'rid', 'first_ym', 'icd10_code']
    values = [iid, rid, first_ym, dis]
    return pd.DataFrame(values, cols).T


first_ym = admission_ym
create_df_disease(iid, rid, first_ym, dis)

Unnamed: 0,iid,rid,first_ym,icd10_code
0,i000001,r00000010,2010/01,R00


In [22]:
def create_df_treatment(iid, rid, treatment_code):
    cols = ['iid', 'rid', 'treatment_code']
    values = [iid, rid, treatment_code]
    return pd.DataFrame(values, cols).T


treatment_code = 'A100'
create_df_treatment(iid, rid, treatment_code)

Unnamed: 0,iid,rid,treatment_code
0,i000001,r00000010,A100


In [23]:
def get_outpatient_rate(df_outpatient, sex, alb):
    cols = df_outpatient.columns[3:]
    is_sex = (df_outpatient.sex == sex)
    more_alb_min = (df_outpatient.alb_min <= alb)
    less_alb_max = (alb <= df_outpatient.alb_max)
    df = df_outpatient.loc[(is_sex & more_alb_min & less_alb_max), cols]
    ss_out_rate = pd.Series(df.columns, index=df.values[0])
    return ss_out_rate


(sex, alb) = ('F', 57)
ss_out_rate = get_outpatient_rate(df_outpatient, sex, alb)

ss_out_rate

0.028770    A00
0.069871    C00
0.004110    D50
0.110286    E00
           ... 
0.000000    P00
0.002055    Q00
0.022605    R00
0.067131    S00
Length: 19, dtype: object

In [24]:
def get_random_num_for_out(rs):
    np.random.seed(rs)
    u = np.random.rand(19)  # 一様乱数
    rs = rs + 1             # 乱数シード更新
    return (u, rs)


rs = 100
(us, rs) = get_random_num_for_out(rs)

(us, rs)

(array([0.54340494, 0.27836939, 0.42451759, 0.84477613, 0.00471886,
        0.12156912, 0.67074908, 0.82585276, 0.13670659, 0.57509333,
        0.89132195, 0.20920212, 0.18532822, 0.10837689, 0.21969749,
        0.97862378, 0.81168315, 0.17194101, 0.81622475]), 101)

In [25]:
def get_outpatient_diseases(ss_out_rate, us):
    diss = ss_out_rate[us < ss_out_rate.index.values].values
    if len(diss) == 0:
        return (False, None)
    else:
        return (True, diss)

print(get_outpatient_diseases(ss_out_rate, np.zeros(19)))
print(get_outpatient_diseases(ss_out_rate, us))
print(get_outpatient_diseases(ss_out_rate, np.ones(19)))

(True, array(['A00', 'C00', 'D50', 'E00', 'F00', 'G00', 'H00', 'H60', 'I00',
       'J00', 'K00', 'L00', 'M00', 'N00', 'Q00', 'R00', 'S00'],
      dtype=object))
(True, array(['F00', 'I00', 'M00'], dtype=object))
(False, None)


In [26]:
create_df_common(iid, rid, ym, 'outpatient', '-', 1)

Unnamed: 0,iid,rid,ym,receipt_type,admission_ym,days
0,i000001,r00000010,2010/02,outpatient,-,1


### 関数を組み合わせてコードを組む

In [27]:
import csv

dict_csv_paths = {'common': './pseudo_medical/records/excl_bp/commons.csv',
                  'disease': './pseudo_medical/records/excl_bp/diseases.csv',
                  'treatment': './pseudo_medical/records/excl_bp/treatments.csv'}

commons_csv = open(dict_csv_paths['common'], 'w')
open_common_object = csv.writer(commons_csv)
cols = ['iid', 'rid', 'ym', 'receipt_type', 'admission_ym', 'days']
open_common_object.writerow(cols)

diseases_csv = open(dict_csv_paths['disease'], 'w')
open_disease_object = csv.writer(diseases_csv)
cols = ['iid', 'rid', 'first_ym', 'icd10_code']
open_disease_object.writerow(cols)

treatments_csv = open(dict_csv_paths['treatment'], 'w')
open_treatment_object = csv.writer(treatments_csv)
cols = ['iid', 'rid', 'treatment_code']
open_treatment_object.writerow(cols)

rid = 'r00000000'
rs = 0

for i in np.arange(len(df_info)):  # df_info を上から順に参照
    gc.collect()
    # df_info の i 番目の (iid, sex, birth_t, start_obs_t, end_obs_t) を取得 # 1
    (iid, sex, birth_t, start_obs_t, end_obs_t) = get_info_for_i(df_info, i)
    t = start_obs_t
    while start_obs_t <= t <= end_obs_t:
        alb = int(t - birth_t)
        # 入院累積発生率の pd.Series を取得 # 2
        ss_adm_cumrate = get_ss_admission_cumrate(
            df_admission_cumrate, sex, alb)
        # 入院用に一様乱数 u(rs) (0 <= r <= 1)を生成 & ランダムシード更新 # 3
        (u, rs) = get_random_num_for_adm(rs)
        # (入院発生？, 入院原因傷病 dis) = 入院乱数シミュレーション(u) # 4
        (does_adm_occur, dis) = get_admission_disease(ss_adm_cumrate, u)
        if does_adm_occur:
            mu = get_avg_hospdays(df_days, sex, alb, dis) # 5
            # 在院日数 days を指数分布(mu) で決定 # 6
            (days, rs) = get_random_days(mu, rs)
            # 入院発生年月、入院は全て月央に発生すると仮定
            admission_ym = mylib1.t_to_ym(t)
            # 月をまたぐ継続入院でなくても次のコードブロックを実行
            does_hosp_continue = True
            m = 0  # 初月入院を 0、次月から月をまたぐごとに +1
            while does_hosp_continue:
                ym = mylib1.t_to_ym(t)
                rid = make_new_rid(rid) # 7
                # 共通レコード(入院)の発行
                # 共通レコード(入院)の作成
                df_common = create_df_common_inpatient_after_m_months(
                    iid, rid, ym, admission_ym, days, m) # 8
                add_df_xxx_on_csv(open_common_object, df_common) # 9
                # 傷病レコードの発行
                df_disease = create_df_disease(iid, rid, admission_ym, dis) # 10
                add_df_xxx_on_csv(open_disease_object, df_disease) # 9
                # 診療行為レコード(入院)の発行
                df_treatment = create_df_treatment(iid, rid, 'A100') # 11
                add_df_xxx_on_csv(open_treatment_object, df_treatment) # 9
                # 診療行為レコード(手術)を入院の 10% に発行
                (u, rs) = get_random_num_for_adm(rs)
                if u <= 0.1:
                    df_treatment = create_df_treatment(iid, rid, 'K000')
                    add_df_xxx_on_csv(open_treatment_object, df_treatment) # 9

                t = t + 1 / 12  # t を１ヶ月ずらす
                does_hosp_continue = (days - 15 - 30 * m > 0)
                m = m + 1

        else:  # 入院発生なし
            # 外来発生率 out(sex, alb) を取得 # 12
            ss_out_rare = get_outpatient_rate(df_outpatient, sex, alb)
            # 外来用に一様乱数 u(rs) (0 <= r <= 1)を生成 & ランダムシード更新 # 13
            (us, rs) = get_random_num_for_out(rs)
            # (外来発生？, 複数外来原因傷病 diss) = 外来乱数シミュレーション(u) # 14
            (does_occur_outpatient, diss) \
                = get_outpatient_diseases(ss_out_rate, us)
            if does_occur_outpatient:
                # 外来発生年月、外来は全て月央に発生すると仮定
                ym = mylib1.t_to_ym(t)
                for dis in diss:
                    rid = make_new_rid(rid)
                    # 共通レコード(外来)の発行
                    # 共通レコード(外来) df_common(iid, rid, ym) の作成 # 15
                    df_common = create_df_common(iid, rid, ym,
                                               'outpatient', '-', 1)
                    add_df_xxx_on_csv(open_common_object, df_common) # 9
                    # 傷病レコードの発行
                    df_disease = create_df_disease(iid, rid, ym, dis)
                    add_df_xxx_on_csv(open_disease_object, df_disease) # 9
                    # 診療行為レコード(外来)の発行
                    df_treatment = create_df_treatment(iid, rid, 'A000')
                    add_df_xxx_on_csv(open_treatment_object, df_treatment) # 9
                    df_treatment = create_df_treatment(iid, rid, 'F000')
                    add_df_xxx_on_csv(open_treatment_object, df_treatment) # 9
                    # 診療行為レコード(手術)を外来の 1% に発行
                    (u, rs) = get_random_num_for_adm(rs)
                    if u <= 0.01:
                        df_treatment = create_df_treatment(iid, rid, 'K000')
                        add_df_xxx_on_csv(open_treatment_object, df_treatment) # 9

            t = t + 1 / 12  # t を１ヶ月ずらす

commons_csv.close()
diseases_csv.close()
treatments_csv.close()

In [28]:
df_commons = pd.read_csv('./pseudo_medical/records/excl_bp/commons.csv')

df_commons

Unnamed: 0,iid,rid,ym,receipt_type,admission_ym,days
0,i000000,r00000001,2010/01,outpatient,-,1
1,i000000,r00000002,2010/02,outpatient,-,1
2,i000000,r00000003,2010/02,outpatient,-,1
3,i000000,r00000004,2010/03,outpatient,-,1
...,...,...,...,...,...,...
433309,i008264,r00433310,2010/09,outpatient,-,1
433310,i008264,r00433311,2010/09,outpatient,-,1
433311,i008264,r00433312,2010/10,outpatient,-,1
433312,i008264,r00433313,2010/10,outpatient,-,1


In [29]:
df_diseases = pd.read_csv('./pseudo_medical/records/excl_bp/diseases.csv')

df_diseases

Unnamed: 0,iid,rid,first_ym,icd10_code
0,i000000,r00000001,2010/01,D50
1,i000000,r00000002,2010/02,H60
2,i000000,r00000003,2010/02,S00
3,i000000,r00000004,2010/03,G00
...,...,...,...,...
433309,i008264,r00433310,2010/09,C00
433310,i008264,r00433311,2010/09,N00
433311,i008264,r00433312,2010/10,I00
433312,i008264,r00433313,2010/10,K00


In [30]:
df_treatments = pd.read_csv('./pseudo_medical/records/excl_bp/treatments.csv')

df_treatments

Unnamed: 0,iid,rid,treatment_code
0,i000000,r00000001,A000
1,i000000,r00000001,F000
2,i000000,r00000002,A000
3,i000000,r00000002,F000
...,...,...,...
868941,i008264,r00433312,A000
868942,i008264,r00433312,F000
868943,i008264,r00433313,A000
868944,i008264,r00433313,F000


## まとめ