In [1]:
import pandas as pd
import numpy as np
import re

In [3]:
# Load the raw records and keep only the columns used later in the notebook:
# chief complaint, family history, present illness, vital signs, admission diagnosis.
df = pd.read_csv('../data/zwdzbl.csv')[['主诉', '家族史', '现病史', '生命体征', '入院初诊']]

In [4]:
# Filler words that may prefix an item and carry no information; they are
# removed before the item is split into key and value.
unmeaning_word_list = ['规则']
# Items are separated by an ASCII or a full-width comma.
item_separator = re.compile(',|，')
# Key and value are separated by an ASCII or a full-width colon.
kv_separator = re.compile(':|：')

def get_vital_signs_dict(vital_signs_str):
    """Parse a vital-signs string of comma-separated "key:value" items into a dict.

    Each item is trimmed, stripped of one leading filler word from
    ``unmeaning_word_list`` (if present), and split on the FIRST colon only,
    so a value that itself contains a colon is kept intact. Empty items are
    skipped. Keys and values are stored without surrounding whitespace.

    Args:
        vital_signs_str: the raw string to parse.

    Returns:
        dict mapping each key to its (still textual) value.

    Raises:
        ValueError: if a non-empty item contains no key/value separator.
        TypeError: if ``vital_signs_str`` is not a string (raised by ``re``).
    """
    attr_dict = {}
    for item in item_separator.split(vital_signs_str):
        item = item.strip()
        for word in unmeaning_word_list:
            if item.startswith(word):
                # Only the first matching filler word is removed.
                item = item[len(word):].strip()
                break
        if not item:
            continue
        # maxsplit=1: everything after the first separator belongs to the value.
        parts = kv_separator.split(item, maxsplit=1)
        if len(parts) != 2:
            raise ValueError(f"no key/value separator in item {item!r}")
        key, value = parts
        attr_dict[key.strip()] = value.strip()
    return attr_dict

In [5]:
# Parse the vital-signs string of every row. `results` gets one dict per row,
# in row order; a row that cannot be parsed contributes an empty dict and its
# index label is recorded in `error_list`.
results = []
error_list = []
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
for idx, vital_signs_str in df['生命体征'].items():
    try:
        item_dict = get_vital_signs_dict(vital_signs_str)
    except Exception:
        # Best-effort parsing: keep going, but do not swallow
        # KeyboardInterrupt/SystemExit as a bare `except:` would.
        item_dict = {}
        error_list.append(idx)
    results.append(item_dict)

In [6]:
def get_body_temperature(item):
    """Return the '体温' entry of ``item`` as a float, with the '℃' unit removed."""
    raw_value = item['体温']
    without_unit = raw_value.replace('℃', '')
    return float(without_unit)

def get_pulse(item):
    """Return the '脉搏' entry of ``item`` as an int, with the '次/分' unit removed."""
    raw_value = item['脉搏']
    without_unit = raw_value.replace('次/分', '')
    return int(without_unit)

def get_breathe_frequency(item):
    """Return the '呼吸' entry of ``item`` as an int, with the '次/分' unit removed."""
    raw_value = item['呼吸']
    without_unit = raw_value.replace('次/分', '')
    return int(without_unit)

def get_blood_pressure(item):
    """Return the two readings of the '血压' entry as ``[lower, higher]``.

    The 'mmHg' unit is removed, the remainder is split on '/', each part is
    converted to int, and the two values are returned in ascending order.

    Raises:
        KeyError: if ``item`` has no '血压' entry.
        ValueError: if a part is not an integer, or if the entry does not
            contain exactly two readings. (An explicit check is used instead
            of ``assert`` so the validation still runs under ``python -O``.)
    """
    readings = sorted(int(part) for part in item['血压'].replace('mmHg', "").split('/'))
    if len(readings) != 2:
        raise ValueError(f"expected 2 blood pressure readings, got {len(readings)}")
    return readings

def fault_tolerant(fun, item):
    """Call ``fun(item)`` and return a NaN placeholder if it raises.

    Args:
        fun: extractor to call with ``item``.
        item: argument passed through to ``fun``.

    Returns:
        The result of ``fun(item)``; on failure ``[np.nan, np.nan]`` when
        ``fun`` is ``get_blood_pressure`` (which yields two values), otherwise
        ``np.nan``.
    """
    try:
        return fun(item)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate.
        if fun is get_blood_pressure:
            return [np.nan, np.nan]
        return np.nan

# Labels of the columns that will hold the extracted vital signs: body
# temperature, pulse, respiration, and the higher / lower blood pressure reading.
vital_signs_list = ['体温', '脉搏', '呼吸', '高压', '低压']
# Add those columns to df by concatenating it with a frame that carries only
# the column labels and no rows.
# NOTE(review): the dtype of the new columns is whatever pd.concat picks here;
# the cell that fills them in may depend on it — confirm before changing this.
df = pd.concat([df, pd.DataFrame(columns=vital_signs_list)], sort=False)

In [7]:
# Copy the parsed vital signs into df, one row at a time.
# NOTE: for large data sets the columns should be extracted in bulk and added
# to df directly instead of being written cell by cell.
# `results` holds one dict per row of df in row order, so pair it with the
# index labels rather than assuming the labels are 0..n-1.
for idx, item in zip(df.index, results):
    if not item:
        # Row could not be parsed; leave its vital-sign cells empty.
        continue
    # Parse the blood pressure once; it yields [lower, higher].
    low_pressure, high_pressure = fault_tolerant(get_blood_pressure, item)
    # Single .loc[row, column] assignment writes into df itself; the chained
    # form df.loc[idx][col] = ... may write to a temporary copy instead.
    df.loc[idx, '体温'] = fault_tolerant(get_body_temperature, item)
    df.loc[idx, '脉搏'] = fault_tolerant(get_pulse, item)
    df.loc[idx, '呼吸'] = fault_tolerant(get_breathe_frequency, item)
    df.loc[idx, '低压'] = low_pressure
    df.loc[idx, '高压'] = high_pressure

In [10]:
# Put the extracted vital-sign columns first, leave out the raw '生命体征'
# column, and keep only the rows that have no missing value.
df = df[['体温', '脉搏', '呼吸', '高压', '低压', '主诉', '家族史', '现病史', '入院初诊']].dropna()

In [14]:
# Save the rows that survived dropna() to an Excel workbook.
df.to_excel(excel_writer='../data/not_null.xlsx')