# 房颤合集

# with leaking feature removal
# with ICD9 code included
# with updated common data preparation

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from datetime import datetime

In [4]:
from sklearn.utils import shuffle

# Load data
## 经处理后电子病历信息数据集,包含生成训练测试集所需特征信息

In [9]:
df_data = pd.read_pickle('../data/processed_mimic/df_admin_pat_surgery_diag_lab.pkl')

In [10]:
df_data.head()

Unnamed: 0,PATIENTID,STAYID,ADMISSION_TIME,DISCHARGE_TIME,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,LANGUAGE,MARITAL_STATUS,RACE,AGE_GROUP,GENDER,ADMISSION_YEAR_GROUP,SURGERY_HISTORY,DIAGNOSIS_HISTORY,LAB_RESULT
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,adm_typ_urgent,adm_loc_transfer_from_hospital,ins_other,lan_other,ms_widowed,race_white,age_group_5,gender_f,2014 - 2016,,,
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,adm_typ_ew_emer_,adm_loc_emergency_room,ins_medicaid,lan_other,ms_widowed,race_white,age_group_5,gender_f,2014 - 2016,icd_9_5491,"icd_9_07070, icd_9_29680, icd_9_30981, icd_9_4...","lab_bilirubin_normal,lab_blood_normal,lab_gluc..."
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,adm_typ_ew_emer_,adm_loc_emergency_room,ins_medicaid,lan_other,ms_widowed,race_white,age_group_5,gender_f,2014 - 2016,icd_9_5491,"icd_9_07044, icd_9_07070, icd_9_07071, icd_9_2...",
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,adm_typ_ew_emer_,adm_loc_emergency_room,ins_medicaid,lan_other,ms_widowed,race_white,age_group_5,gender_f,2014 - 2016,,"icd_9_07070, icd_9_07071, icd_9_2761, icd_9_28...","lab_anion_gap_normal,lab_bicarbonate_abnormal,..."
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,adm_typ_eu_observation,adm_loc_emergency_room,ins_other,lan_other,ms_single,race_white,age_group_2,gender_f,2008 - 2010,,,


In [11]:
df_data.STAYID.nunique()

431231

## 暂时基于以上信息作为模型训练特征

## 训练集准备

In [8]:
features_training = [
    'STAYID',
    'GENDER',
    'AGE_GROUP',
    'ADMISSION_TYPE',
    'DIAGNOSIS_HISTORY',
    'SURGERY_HISTORY',
    'LAB_RESULT',
    'ADMISSION_LOCATION',
    'INSURANCE',
    'LANGUAGE',
    'MARITAL_STATUS',
    'RACE'
    ]      

## 训练集准备

In [14]:
## instance of diagnosis

df_diagnoses = pd.read_csv('../data/mimic-iv-2.2/hosp/diagnoses_icd.csv.gz')

In [15]:
## 诊断码
df_d_icd_diagnoses = pd.read_csv('../data/mimic-iv-2.2/hosp/d_icd_diagnoses.csv.gz')

In [16]:
#定义预测对象
use_case = 'AF4'

#定义预测相关编码

# 基于ICD10 编码系统
df_d_icd_10 = df_d_icd_diagnoses[df_d_icd_diagnoses.icd_code.str.contains('I48')]

# 基于ICD9 编码系统
df_d_icd_9 = df_d_icd_diagnoses[df_d_icd_diagnoses.icd_version==9]
df_d_icd_9 = df_d_icd_9[df_d_icd_9.icd_code.str.startswith('42731')]

#合并两种编码系统
df_d_icd = pd.concat([df_d_icd_10, df_d_icd_9], axis=0, ignore_index=True)

In [18]:
# 对于历史诊断码中包含与诊断码相同的特征, 将其从历史诊断码中移除

def remove_leaking_feature(value):
    if pd.isna(value):
        return value
    # 分割字符串为列表
    elements = value.split(',')
    # 移除包含'C22', 'C23', 'C24'的元素
    filtered_elements = [el for el in elements if not any(code in el for code in ['i48','icd_9_42731'])]
    return ','.join(filtered_elements)

# 应用函数到ICD_HISTORICAL列
df_data['DIAGNOSIS_HISTORY'] = df_data['DIAGNOSIS_HISTORY'].apply(remove_leaking_feature)


In [20]:
#确保特征已经移除
df_data[df_data.DIAGNOSIS_HISTORY.str.contains('i48',na=False)]

Unnamed: 0,PATIENTID,STAYID,ADMISSION_TIME,DISCHARGE_TIME,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,LANGUAGE,MARITAL_STATUS,RACE,AGE_GROUP,GENDER,ADMISSION_YEAR_GROUP,SURGERY_HISTORY,DIAGNOSIS_HISTORY,LAB_RESULT


### 病人诊断

In [21]:
df_diagnoses_disease = df_diagnoses[df_diagnoses.icd_code.isin(df_d_icd.icd_code)]
len(df_diagnoses_disease)

62282

In [22]:
df_diagnoses_disease.subject_id.nunique()

26674

In [23]:
df_diagnoses_disease = df_diagnoses_disease[['subject_id','hadm_id']]

In [24]:
df_diagnoses_disease.columns=['PATIENTID','STAYID']

In [25]:
df_diagnoses_disease.head()

Unnamed: 0,PATIENTID,STAYID
577,10001667,22672901
614,10001843,21728396
624,10001860,21441082
634,10001877,21320596
646,10001877,25679292


In [26]:
#保存疾病诊断相关信息

diagnosis_path = 'df_diagnoses_' + use_case + '.csv'

df_diagnoses_disease.to_csv(diagnosis_path,sep=';',index=False)

In [27]:
#生成训练及测试数据集

def create_train_test(df_processed,use_case,df_diag_positive):
    df_case = df_processed[df_processed.STAYID.isin(df_diag_positive.STAYID)]
    df_case[use_case] = 1
    df_control = df_processed[~df_processed.PATIENTID.isin(df_diag_positive.PATIENTID)]
    df_control[use_case] = 0

    df_labelled = pd.concat([df_case, df_control])

    # Shuffle the patient IDs
    IDs_patient = df_processed.PATIENTID.unique()
    IDs_patient = shuffle(IDs_patient, random_state = 12345)
    
    # Split train and test/eval
    IDs_patient_training = IDs_patient[:int(0.8*len(IDs_patient))]
    IDs_patient_test_eval = IDs_patient[int(0.8*len(IDs_patient)):]

    # Define training and test/eval
    df_training = df_labelled[df_labelled.PATIENTID.isin(IDs_patient_training)]
    df_testing = df_labelled[df_labelled.PATIENTID.isin(IDs_patient_test_eval)]

    # Prepare training data
    df_training_case = df_training[df_training[use_case] == 1]
    df_training_control = df_training[df_training[use_case] == 0]
    df_training_control = shuffle(df_training_control, random_state = 12345)
    df_training_control = df_training_control[:5*len(df_training_case)]
    
    df_training = pd.concat([df_training_case, df_training_control])
    df_training = shuffle(df_training, random_state = 12345)

    training_data_path = '../data/processed/df_training_' + use_case + '.csv'
    evaluation_data_path = '../data/processed/df_eval_' + use_case + '.csv'
    labelled_data_path = '../data/processed/df_labelled_' + use_case + '.csv'
    
    df_training.to_csv(training_data_path, sep = ";", index = False)
    df_testing.to_csv(evaluation_data_path, sep = ";", index = False)

    df_labelled.to_csv(labelled_data_path, sep = ";", index = False)




In [28]:
#生成训练及测试数据集

create_train_test(df_data,use_case,df_diagnoses_disease)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_case[use_case] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_control[use_case] = 0


###  检验训练及测试数据集

In [29]:
training_path = '../data/processed/df_training_' + use_case + '.csv'
df_training = pd.read_csv(training_path, sep = ";")

In [30]:
df_training.head()

Unnamed: 0,PATIENTID,STAYID,ADMISSION_TIME,DISCHARGE_TIME,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,LANGUAGE,MARITAL_STATUS,RACE,AGE_GROUP,GENDER,ADMISSION_YEAR_GROUP,SURGERY_HISTORY,DIAGNOSIS_HISTORY,LAB_RESULT,AF4
0,18714676,25615095,2123-07-02 04:21:00,2123-07-05 18:30:00,adm_typ_observation_admit,adm_loc_transfer_from_hospital,ins_medicare,lan_other,ms_married,race_black_african_american,age_group_5,gender_m,2008 - 2010,,"icd_10_b182, icd_10_d649, icd_10_d684, icd_10_...","lab_inr_pt__normal,lab_inr_pt__abnormal,lab_pt...",0
1,16941448,21932316,2145-01-04 23:14:00,2145-01-05 07:39:00,adm_typ_eu_observation,adm_loc_emergency_room,ins_medicaid,lan_other,ms_single,race_hispanic_or_latino,age_group_4,gender_m,2008 - 2010,,"icd_9_042, icd_9_27651, icd_9_30500, icd_9_305...",,0
2,14523725,27256411,2170-04-05 05:02:00,2170-04-11 16:18:00,adm_typ_ew_emer_,adm_loc_physician_referral,ins_other,lan_other,ms_married,race_white,age_group_5,gender_f,2017 - 2019,,,"lab_hematocrit_normal,lab_hemoglobin_normal,la...",0
3,11715814,22138662,2133-11-29 11:30:00,2133-11-30 18:00:00,adm_typ_surgical_same_day_admission,adm_loc_physician_referral,ins_medicaid,lan_other,ms_single,race_hispanic_or_latino,age_group_2,gender_f,2008 - 2010,,,,0
4,15814891,21120922,2166-03-05 18:24:00,2166-03-12 16:45:00,adm_typ_observation_admit,adm_loc_emergency_room,ins_other,lan_other,ms_married,race_black_african_american,age_group_4,gender_f,2011 - 2013,"icd_10_027034z, icd_10_047l3zz, icd_10_0ybn0zx...","icd_10_b9561, icd_10_e1165, icd_10_e119, icd_1...","lab_inr_pt__abnormal,lab_pt_abnormal,lab_ptt_a...",1


In [31]:
df_training[use_case].value_counts()

0    241240
1     48248
Name: AF4, dtype: int64