In [1]:
import numpy as np
import time
import codecs
import torch



def merge_glucose_data(path, path1):
    """Merge two glucose CSV files into one time-sorted series per ID.

    Each file must start with a header line (skipped) followed by rows whose
    first three comma-separated columns form the ID, whose fourth column is a
    ``%Y-%m-%d %H:%M:%S`` timestamp and whose fifth column is the value.
    The three ID columns are stripped and joined with ``-`` (for example
    ``3-145834-211552``), so rows from both files that belong to the same ID
    end up in the same series.

    Args:
        path: Path of the first glucose CSV file.
        path1: Path of the second glucose CSV file.

    Returns:
        dict mapping ID -> ``[timestamps, values]``. Timestamps are
        ``time.mktime`` seconds (local time) divided by 900, i.e. expressed in
        15-minute units, sorted ascending; ``values`` is reordered to match.
        An empty value column is stored as ``0``.

    Side effects:
        Writes the result to ``datasets/dictionary_glucose_sorted.pt`` with
        ``torch.save``.
    """
    # {'id': [[t0, t1, t2, ...], [v0, v1, v2, ...]]}
    dictionary_glucose = {}

    for csv_path in (path, path1):
        # Built-in open() is what codecs.open() delegates to when no encoding
        # is given; the context manager guarantees the handle is closed.
        with open(csv_path, 'r') as handle:
            rows = handle.read().split('\n')[1:]  # drop the header line

        for row in rows:
            # Skip blank lines instead of blindly discarding the last element,
            # so a file without a trailing newline keeps its final data row.
            if not row.strip():
                continue

            content = row.split(',')
            # Strip every ID component for BOTH files so the keys agree.
            stay_id = '-'.join(field.strip() for field in content[:3])
            # 15-minute granularity.
            t = time.mktime(time.strptime(content[3], "%Y-%m-%d %H:%M:%S")) / 900
            # A missing measurement is recorded as 0.
            v = 0 if content[4].strip() == '' else float(content[4])

            times, values = dictionary_glucose.setdefault(stay_id, [[], []])
            times.append(t)
            values.append(v)

    # Reorder every series by timestamp.
    dictionary_glucose_sorted = {}
    for id_name, (times, values) in dictionary_glucose.items():
        order = np.argsort(times)
        dictionary_glucose_sorted[id_name] = [[times[k] for k in order], [values[k] for k in order]]

    torch.save(dictionary_glucose_sorted, 'datasets/dictionary_glucose_sorted.pt')
    return dictionary_glucose_sorted

In [2]:
import math


def readin_insulin(path):
    """Read the insulin CSV into three parallel, time-sorted lists per ID.

    The first line of the file is a header and is skipped. For every data row
    the first three columns (stripped, joined with ``-``) form the ID, columns
    4 and 5 are start/end timestamps in ``%Y-%m-%d %H:%M:%S`` format, and
    columns 6 and 8 are numeric (empty means 0).
    NOTE(review): columns 6 and 8 are presumably amount and rate - confirm
    against the CSV header.

    * Column 8 equal to 0: the row becomes a single point at the start time
      carrying the column-6 value in the second list and 0 in the third.
    * Otherwise: the column-6 value is spread evenly over
      ``ceil(end - start)`` consecutive 15-minute slots beginning at the start
      time; those slots carry 0 in the second list and the per-slot share in
      the third list. At least one slot is always used, so a row whose end
      time is not after its start time is kept instead of vanishing.

    Args:
        path: Path of the insulin CSV file.

    Returns:
        dict mapping ID -> ``[timestamps, point_values, spread_values]`` with
        timestamps in 15-minute units (``time.mktime`` seconds / 900), sorted
        ascending, and the two value lists reordered to match.

    Side effects:
        Writes the result to ``datasets/dictionary_insulin_sorted.pt`` with
        ``torch.save``.
    """
    # Built-in open() is what codecs.open() delegates to when no encoding is
    # given; the context manager guarantees the handle is closed.
    with open(path, 'r') as handle:
        rows = handle.read().split('\n')[1:]  # drop the header line

    # {'id': [[t0, t1, ...], [point0, point1, ...], [spread0, spread1, ...]]}
    dictionary_insulin = {}
    for row in rows:
        # Skip blank lines rather than assuming the file ends with a newline.
        if not row.strip():
            continue

        content = row.split(',')
        stay_id = '-'.join(field.strip() for field in content[:3])
        st = time.mktime(time.strptime(content[4], "%Y-%m-%d %H:%M:%S")) / 900
        et = time.mktime(time.strptime(content[5], "%Y-%m-%d %H:%M:%S")) / 900
        v1 = float(content[6]) if content[6] != '' else 0
        v2 = float(content[8]) if content[8] != '' else 0

        times, point_values, spread_values = dictionary_insulin.setdefault(stay_id, [[], [], []])
        if v2 == 0:
            times.append(st)
            point_values.append(v1)
            spread_values.append(0)
        else:
            # Round the duration up to whole slots. Clamp to one slot: with a
            # zero (or negative) duration the original produced no entries at
            # all and could leave an ID with empty lists behind.
            period = max(1, math.ceil(et - st))
            times.extend(st + k for k in range(period))
            point_values.extend([0] * period)
            spread_values.extend([v1 / period] * period)

    # Reorder every series by timestamp.
    dictionary_insulin_sorted = {}
    for id_name, (times, point_values, spread_values) in dictionary_insulin.items():
        order = np.argsort(times)
        dictionary_insulin_sorted[id_name] = [
            [times[k] for k in order],
            [point_values[k] for k in order],
            [spread_values[k] for k in order],
        ]

    torch.save(dictionary_insulin_sorted, "datasets/dictionary_insulin_sorted.pt")
    return dictionary_insulin_sorted

In [3]:
def merge_glucose_insulin(dictionary_glucose, dictionary_insulin): #{'id':[[t0,t1,t2...],[v0,v1,v2...]]}
    """Align glucose and insulin series of each shared ID on one slot grid.

    Only IDs present in both dictionaries are processed. For such an ID the
    grid starts at the first glucose timestamp and has
    ``ceil(max_time - min_time) + 1`` slots, where ``max_time`` is the later
    of the last insulin and last glucose timestamps. A timestamp ``t`` falls
    into slot ``floor(t - min_time)``.

    Args:
        dictionary_glucose: ID -> ``[timestamps, values]``; timestamps are
            expected in ascending order.
        dictionary_insulin: ID -> ``[timestamps, values_1, values_2]``;
            timestamps are expected in ascending order.

    Returns:
        dict mapping ID -> ``[r, t1, t2]``: per-slot glucose, per-slot insulin
        taken from ``values_1`` (entries where it is non-zero) and per-slot
        insulin taken from ``values_2`` (all other entries). Slots hit more
        than once hold the mean of their entries; untouched slots stay 0.

    Side effects:
        Writes the result to ``datasets/samples.pt`` with ``torch.save``.
    """
    samples = {}
    for id_name, insulin in dictionary_insulin.items():
        if id_name not in dictionary_glucose:
            continue

        glucose_times, glucose_values = dictionary_glucose[id_name][0], dictionary_glucose[id_name][1]
        insulin_times, insulin_first, insulin_second = insulin[0], insulin[1], insulin[2]

        min_time = glucose_times[0]
        max_time = max(insulin_times[-1], glucose_times[-1])

        # Rounded-up span plus one extra slot, so floor(max_time - min_time)
        # is always a valid index.
        period = math.ceil(max_time - min_time) + 1

        r, t1, t2 = [0] * period, [0] * period, [0] * period
        count_r, count_t1, count_t2 = [0] * period, [0] * period, [0] * period

        # Accumulate glucose readings per slot.
        for timestep, reading in zip(glucose_times, glucose_values):
            slot = math.floor(timestep - min_time)
            r[slot] += reading
            count_r[slot] += 1

        # Accumulate insulin per slot; entries before the first glucose
        # reading or later than one slot before max_time are ignored.
        for pos, timestep in enumerate(insulin_times):
            if timestep-min_time < 0 or timestep-max_time > -1:
                continue
            slot = math.floor(timestep - min_time)
            if insulin_first[pos] != 0:
                count_t1[slot] += 1
                t1[slot] += insulin_first[pos]
            else:
                count_t2[slot] += 1
                t2[slot] += insulin_second[pos]

        # Turn sums into means wherever a slot received several entries.
        for totals, counts in ((r, count_r), (t1, count_t1), (t2, count_t2)):
            for idx, cnt in enumerate(counts):
                if cnt > 1:
                    totals[idx] /= cnt

        samples[id_name] = [r, t1, t2]

    torch.save(samples, "datasets/samples.pt")
    return samples

In [4]:
def up_sample(r, t1, t2, up = 2):
    """Average three parallel series over consecutive blocks of ``up`` items.

    Before averaging, zeros in ``r`` are treated as missing and replaced by
    the most recent non-zero value (leading zeros stay 0). ``t1`` and ``t2``
    are averaged as they are. Items left over after the last full block are
    dropped.

    Args:
        r: Sequence whose zeros are forward-filled before averaging.
        t1: Sequence averaged block-wise.
        t2: Sequence averaged block-wise.
        up: Block size; values below 1 are treated as 1.

    Returns:
        Tuple ``(r_new, t1_new, t2_new)`` of lists, each holding
        ``int(len(r) / up)`` block means computed with ``np.mean``.
    """
    up = 1 if up < 1 else up
    length = int(len(r) / up)

    # Forward-fill missing (zero) entries of r.
    filled = []
    last_seen = 0
    for value in r:
        if value != 0:
            last_seen = value
        filled.append(last_seen)

    # Mean of every full block.
    bounds = [(up * k, up * k + up) for k in range(length)]
    r_new = [np.mean(filled[lo:hi]) for lo, hi in bounds]
    t1_new = [np.mean(t1[lo:hi]) for lo, hi in bounds]
    t2_new = [np.mean(t2[lo:hi]) for lo, hi in bounds]
    return r_new, t1_new, t2_new

In [5]:
# Build the time-sorted glucose table from the two glucose CSV files
# (merge_glucose_data also saves it to datasets/dictionary_glucose_sorted.pt).
path = "datasets/fingerstick_glucose_icu.csv"
path1 = "datasets/glucose_icu.csv"
dictionary_glucose_sorted = merge_glucose_data(path, path1)
# NOTE: `path` is rebound here and from now on refers to the insulin CSV.
path = 'datasets/insulin_inputeventsmv_icu.csv'
dictionary_insulin_sorted = readin_insulin(path)
# Align glucose and insulin per ID on a common slot grid
# (merge_glucose_insulin also saves the result to datasets/samples.pt).
samples = merge_glucose_insulin(dictionary_glucose_sorted, dictionary_insulin_sorted)

In [6]:
def standardization(data):
    """Shift ``data`` to zero mean and scale it to unit standard deviation.

    Args:
        data: Array-like object providing ``mean()`` and ``std()`` and
            supporting element-wise arithmetic (e.g. a torch tensor).

    Returns:
        ``(data - mean) / std``. When the standard deviation is not strictly
        positive (constant input, or undefined), only the centring is applied
        so the result does not turn into NaN/inf through a division by zero.
    """
    mu = data.mean()
    sigma = data.std()
    # `not sigma > 0` is true for 0 and for NaN alike.
    if not sigma > 0:
        return data - mu
    return (data - mu) / sigma

def data_norm(data): # [count, 3, length]
    """Standardize the leading channel(s) of a ``[count, channels, length]`` tensor.

    Channels are visited in order and the loop stops as soon as the channel
    index equals ``channels - 2``; with three channels this means only channel
    0 is standardized (over all samples and positions together) while the
    remaining channels are returned unchanged.

    The work is done on a copy, so the tensor passed in is left untouched
    (``permute`` returns a view, and assigning into that view would otherwise
    overwrite the caller's data).

    Args:
        data: Tensor of shape ``[count, channels, length]``.

    Returns:
        New tensor of the same shape with the selected channel(s) standardized.
    """
    samples = data.clone().permute(1, 0, 2)  # [channels, count, length]
    for i, sample in enumerate(samples):
        # TODO: the last two channels are deliberately skipped here.
        if i == len(samples) - 2:
            break
        samples[i] = standardization(sample)
    return samples.permute(1, 0, 2)  # back to [count, channels, length]

In [7]:
# Reload the per-ID [r, t1, t2] series saved by the previous step.
samples = torch.load('datasets/samples.pt')
samples_new = []
mask = []
print('samples count: ' + str(len(samples)))
window = [50]

for w in window:
    for id_name in samples:
        r, t1, t2 = samples[id_name]

        # Block size for up_sample: len(r) / 50 truncated, capped at 4.
        # (It is derived from the constant 50, not from w.)
        factor = min(int(len(r) / 50), 4)
        r, t1, t2 = up_sample(r, t1, t2, up = factor)

        # Slide a window of length w over the series and keep the FIRST
        # window that has enough non-zero insulin entries, then move on to
        # the next ID.
        for i in range(len(r) - w + 1):
            t1_win = t1[i:i + w]
            t2_win = t2[i:i + w]
            # Zero counts (treated as missing) in the last 20, first 20 and
            # middle 10 positions of the window, t1 and t2 combined.
            zeros_tail = t1_win[-20:].count(0) + t2_win[-20:].count(0)
            zeros_head = t1_win[:20].count(0) + t2_win[:20].count(0)
            zeros_mid = t1_win[20:30].count(0) + t2_win[20:30].count(0)
            if zeros_tail < 36 and zeros_head < 36 and zeros_mid < 19:
                samples_new.append([r[i:i + w], t1_win, t2_win])
                break
    print('new samples for: ' + str(w))
    print(len(samples_new))

samples_new = torch.FloatTensor(samples_new)
# The mask marks strictly positive entries; it is taken BEFORE normalization.
mask = samples_new > 0
samples_new = data_norm(samples_new)
torch.save((samples_new, mask), 'datasets/glucose_dataset.pt')

samples count: 11927
new samples for: 50
2307


In [8]:
# Reload the saved (samples, mask) pair and show both shapes, mask first.
s1, s2 = torch.load('datasets/glucose_dataset.pt')
for tensor in (s2, s1):
    print(tensor.shape)

torch.Size([2307, 3, 50])
torch.Size([2307, 3, 50])
