# 更新AF标签，加入ICD9编码

## AF - Atrial Fibrillation房颤

## 本notebook使用AF4为标签

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, confusion_matrix, classification_report, roc_curve, auc, accuracy_score
import itertools, logging
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from datetime import datetime

## 明确标签，load数据集

In [2]:
# Name of the target column; it also selects which processed CSV files are read.
LABEL = "AF4"

In [3]:
# Training split for the selected label; the processed CSV is semicolon-delimited.
df_training = pd.read_csv(f'../data/processed/df_training_{LABEL}.csv', sep=';')

In [4]:
# Evaluation split for the selected label; the processed CSV is semicolon-delimited.
df_testing = pd.read_csv(f'../data/processed/df_eval_{LABEL}.csv', sep=';')

## 创建训练、测试数据集

#### 创建训练数据集

In [5]:
# Feature columns that are concatenated into one text observation per stay.
features_training = [
    'GENDER',
    'AGE_GROUP',
    'ADMISSION_TYPE',
    'DIAGNOSIS_HISTORY',
    'SURGERY_HISTORY',
    'LAB_RESULT',
    'ADMISSION_LOCATION',
    'INSURANCE',
    'LANGUAGE',
    'MARITAL_STATUS',
    'RACE',
]

# Cast every feature column to str so the values can be joined.
# Missing values show up as the literal string 'nan' after this cast.
df_training[features_training] = df_training[features_training].astype(str)

# Build one comma-separated observation string per row; the TF-IDF features
# are derived from this text later on.
# 'nan' placeholders are dropped as whole tokens. A plain
# `replace(',nan', '')` would keep a leading 'nan' when the first feature is
# missing and would cut into any token that merely starts with 'nan'.
# Iterating over the NumPy array avoids the per-row Series overhead of
# `iterrows()`.
inputs = [
    ','.join(token for token in ','.join(row).split(',') if token != 'nan')
    for row in df_training[features_training].to_numpy()
]

# Training frame: stay id, observation text and target label.
train_data = pd.DataFrame({
    'STAYID': df_training.STAYID,
    'OBSERVATIONS': inputs,
    'LABEL': df_training[LABEL],
})

In [6]:
train_data.head()

Unnamed: 0,STAYID,OBSERVATIONS,LABEL
0,25615095,"gender_m,age_group_5,adm_typ_observation_admit...",0
1,21932316,"gender_m,age_group_4,adm_typ_eu_observation,ic...",0
2,27256411,"gender_f,age_group_5,adm_typ_ew_emer_,lab_hema...",0
3,22138662,"gender_f,age_group_2,adm_typ_surgical_same_day...",0
4,21120922,"gender_f,age_group_4,adm_typ_observation_admit...",1


#### 创建测试数据集

In [7]:
# Cast every feature column to str so the values can be joined.
# Missing values show up as the literal string 'nan' after this cast.
df_testing[features_training] = df_testing[features_training].astype(str)

# Build one comma-separated observation string per evaluation row.
# 'nan' placeholders are dropped as whole tokens. A plain
# `replace(',nan', '')` would keep a leading 'nan' when the first feature is
# missing and would cut into any token that merely starts with 'nan'.
# Iterating over the NumPy array avoids the per-row Series overhead of
# `iterrows()`.
inputs_test = [
    ','.join(token for token in ','.join(row).split(',') if token != 'nan')
    for row in df_testing[features_training].to_numpy()
]

# Evaluation frame: stay id, observation text and target label.
test_data = pd.DataFrame({
    'STAYID': df_testing.STAYID,
    'OBSERVATIONS': inputs_test,
    'LABEL': df_testing[LABEL],
})

In [8]:
test_data.head()

Unnamed: 0,STAYID,OBSERVATIONS,LABEL
0,21728396,"gender_m,age_group_7,adm_typ_observation_admit...",1
1,24982426,"gender_m,age_group_8,adm_typ_urgent,lab_inr_pt...",1
2,23646008,"gender_f,age_group_5,adm_typ_ew_emer_,lab_inr_...",1
3,24817944,"gender_f,age_group_5,adm_typ_surgical_same_day...",1
4,22187210,"gender_m,age_group_4,adm_typ_direct_emer_,icd_...",1


In [9]:
test_data.head().OBSERVATIONS.tolist()

['gender_m,age_group_7,adm_typ_observation_admit,adm_loc_transfer_from_hospital,ins_other,lan_other,ms_single,race_white',
 'gender_m,age_group_8,adm_typ_urgent,lab_inr_pt__abnormal,lab_pt_abnormal,lab_ptt_abnormal,lab_hematocrit_normal,lab_hemoglobin_normal,lab_mch_normal,lab_mchc_normal,lab_mcv_normal,lab_platelet_count_normal,lab_rdw_normal,lab_red_blood_cells_abnormal,lab_white_blood_cells_abnormal,lab_anion_gap_abnormal,lab_bicarbonate_abnormal,lab_calcium__total_abnormal,lab_chloride_normal,lab_creatinine_abnormal,lab_glucose_abnormal,lab_magnesium_normal,lab_phosphate_normal,lab_potassium_normal,lab_sodium_abnormal,lab_urea_nitrogen_abnormal,lab_platelet_smear_normal,lab_absolute_lymphocyte_count_abnormal,lab_basophils_normal,lab_eosinophils_normal,lab_lymphocytes_abnormal,lab_monocytes_normal,lab_neutrophils_abnormal,lab_absolute_basophil_count_abnormal,lab_absolute_eosinophil_count_abnormal,lab_absolute_monocyte_count_abnormal,lab_absolute_neutrophil_count_abnormal,lab_rdw_sd_

## 训练数据向量化，使用TfidfVectorizer 
(https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer)

In [10]:
# Fit a word-level TF-IDF vocabulary on the training observations, capped at
# 500 terms, and emit float32 weights.
vectorizer = TfidfVectorizer(
    analyzer="word",
    max_features=500,  # number of features to keep
    dtype=np.float32,
    # max_df=0.95,
    # min_df=0.05,
)
train_tfidf = vectorizer.fit_transform(train_data["OBSERVATIONS"])

# Dense copy of the sparse training matrix plus the matching labels.
train_X = train_tfidf.toarray()
train_y = train_data["LABEL"]

# The test set is transformed with the vocabulary learned on the training set.
test_tfidf = vectorizer.transform(test_data["OBSERVATIONS"])
test_X = test_tfidf.toarray()
test_y = test_data["LABEL"]

In [11]:
train_tfidf.shape

(289488, 500)

In [12]:
feature_names = vectorizer.get_feature_names_out()
feature_names[:20]

array(['adm_loc_clinic_referral', 'adm_loc_emergency_room',
       'adm_loc_physician_referral', 'adm_loc_procedure_site',
       'adm_loc_transfer_from_hospital', 'adm_loc_walk_in_self_referral',
       'adm_typ_ambulatory_observation', 'adm_typ_direct_emer_',
       'adm_typ_direct_observation', 'adm_typ_elective',
       'adm_typ_eu_observation', 'adm_typ_ew_emer_',
       'adm_typ_observation_admit', 'adm_typ_surgical_same_day_admission',
       'adm_typ_urgent', 'age_group_2', 'age_group_3', 'age_group_4',
       'age_group_5', 'age_group_6'], dtype=object)

## 模型训练

### 使用LGBM, 特征数量为2000

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

# Feature extraction: turn the observation text into TF-IDF features,
# keeping at most 2000 vocabulary terms.
vectorizer = TfidfVectorizer(
    analyzer="word",
    max_features=2000,
    # max_df=0.95,
    # min_df=0.05,
    dtype=np.float32,
)

X_train = vectorizer.fit_transform(train_data['OBSERVATIONS'])
y_train = train_data["LABEL"]

# The test set reuses the vocabulary fitted on the training set.
X_test = vectorizer.transform(test_data.OBSERVATIONS)
y_test = test_data["LABEL"]

# Model training: LightGBM classifier with its default hyper-parameters.
model = LGBMClassifier()
model.fit(X_train, y_train)

# Model evaluation: run each prediction once and reuse the result.
pred_lr = model.predict(X_test)
pred_pro = model.predict_proba(X_test)[:, 1]  # probability of the positive class
# Earlier variable names are kept as aliases in case other cells read them.
y_pred = pred_lr
pred_lr_prob_ref = pred_pro

conf_mat = confusion_matrix(y_test, pred_lr)

frp_mi, trp_mi, thres_mi = roc_curve(y_test, pred_pro)
auc_val_mi = auc(frp_mi, trp_mi)

# Sensitivity is the recall of class 1; specificity is the recall of class 0.
print("sensitivity: " + str(round(metrics.recall_score(y_test, pred_lr), 2)) + '\n')
print("specificity: " + str(round(metrics.recall_score(y_test, pred_lr, pos_label=0), 2)) + '\n')
print("discrimination: " + str(round(auc_val_mi, 3)) + '\n')

# NOTE(review): every model cell in this notebook writes to the same two file
# names, so only the most recently executed cell's artifacts survive.
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

[LightGBM] [Info] Number of positive: 48248, number of negative: 241240
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.328497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 459386
[LightGBM] [Info] Number of data points in the train set: 289488, number of used features: 2000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.166667 -> initscore=-1.609438
[LightGBM] [Info] Start training from score -1.609438
sensitivity: 0.47

specificity: 0.98

discrimination: 0.906



['vectorizer.pkl']

In [14]:
# Importance scores from the fitted model, paired with the vocabulary terms
# of the fitted vectorizer.
feature_importance = model.feature_importances_
feature_names = vectorizer.get_feature_names_out()

# Rank the terms from most to least important and display the top 20.
explaining_df = pd.DataFrame(
    {"feature_names": feature_names, "scores": feature_importance}
).sort_values(by="scores", ascending=False)
explaining_df.head(20)

Unnamed: 0,feature_names,scores
906,icd_9_42732,72
1625,icd_9_v5861,58
470,icd_10_z7901,57
19,age_group_3,49
18,age_group_2,46
21,age_group_5,39
471,icd_10_z7902,39
20,age_group_4,38
13,adm_typ_eu_observation,37
27,gender_m,31


### 使用GradientBoosting, 特征数量为2000
**其实没必要再跑一次向量化（vectorization），但每个模型单元格都完整重跑一遍更不容易出错**

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

# Feature extraction: turn the observation text into TF-IDF features,
# keeping at most 2000 vocabulary terms.
vectorizer = TfidfVectorizer(
    analyzer="word",
    max_features=2000,
    # max_df=0.95,
    # min_df=0.05,
    dtype=np.float32,
)

X_train = vectorizer.fit_transform(train_data['OBSERVATIONS'])
y_train = train_data["LABEL"]

# The test set reuses the vocabulary fitted on the training set.
X_test = vectorizer.transform(test_data.OBSERVATIONS)
y_test = test_data["LABEL"]

# Model training: gradient boosting with depth-1 trees (decision stumps).
# The model is fitted exactly once; a second fit on the same data with the
# same fixed random_state only repeats the training work.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
model.fit(X_train, y_train)

# Model evaluation: run each prediction once and reuse the result.
pred_lr = model.predict(X_test)
pred_pro = model.predict_proba(X_test)[:, 1]  # probability of the positive class
# Earlier variable names are kept as aliases in case other cells read them.
y_pred = pred_lr
pred_lr_prob_ref = pred_pro

conf_mat = confusion_matrix(y_test, pred_lr)

frp_mi, trp_mi, thres_mi = roc_curve(y_test, pred_pro)
auc_val_mi = auc(frp_mi, trp_mi)

# Sensitivity is the recall of class 1; specificity is the recall of class 0.
print("sensitivity: " + str(round(metrics.recall_score(y_test, pred_lr), 2)) + '\n')
print("specificity: " + str(round(metrics.recall_score(y_test, pred_lr, pos_label=0), 2)) + '\n')
print("discrimination: " + str(round(auc_val_mi, 3)) + '\n')

# NOTE(review): every model cell in this notebook writes to the same two file
# names, so only the most recently executed cell's artifacts survive.
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

sensitivity: 0.47

specificity: 0.97

discrimination: 0.897



['vectorizer.pkl']

### 使用XGBClassifier, 特征数量为2000

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

# Feature extraction: turn the observation text into TF-IDF features,
# keeping at most 2000 vocabulary terms.
vectorizer = TfidfVectorizer(
    analyzer="word",
    max_features=2000,
    # max_df=0.95,
    # min_df=0.05,
    dtype=np.float32,
)

X_train = vectorizer.fit_transform(train_data['OBSERVATIONS'])
y_train = train_data["LABEL"]

# The test set reuses the vocabulary fitted on the training set.
X_test = vectorizer.transform(test_data.OBSERVATIONS)
y_test = test_data["LABEL"]

# Model training: XGBoost classifier with its default hyper-parameters.
model = XGBClassifier()
model.fit(X_train, y_train)

# Model evaluation: run each prediction once and reuse the result.
pred_lr = model.predict(X_test)
pred_pro = model.predict_proba(X_test)[:, 1]  # probability of the positive class
# Earlier variable names are kept as aliases in case other cells read them.
y_pred = pred_lr
pred_lr_prob_ref = pred_pro

conf_mat = confusion_matrix(y_test, pred_lr)

frp_mi, trp_mi, thres_mi = roc_curve(y_test, pred_pro)
auc_val_mi = auc(frp_mi, trp_mi)

# Sensitivity is the recall of class 1; specificity is the recall of class 0.
print("sensitivity: " + str(round(metrics.recall_score(y_test, pred_lr), 2)) + '\n')
print("specificity: " + str(round(metrics.recall_score(y_test, pred_lr, pos_label=0), 2)) + '\n')
print("discrimination: " + str(round(auc_val_mi, 3)) + '\n')

# NOTE(review): every model cell in this notebook writes to the same two file
# names, so only the most recently executed cell's artifacts survive.
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

sensitivity: 0.47

specificity: 0.97

discrimination: 0.905



['vectorizer.pkl']