In [2]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.interpolate import InterpolatedUnivariateSpline
import matplotlib.pyplot as plt
from scipy import interpolate
from scipy.ndimage.filters import gaussian_filter
import cPickle as pickle

In [5]:
# Read the cleaned (in Excel) CSV data files and the pickled CGM dictionary.

# Gold standard data.
# order of the columns: Patient ID, Time, Glucose Measurement
# Every ROW that contains a NaN in any column is dropped.
# (.values is used instead of DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same ndarray.)
gold_data = pd.read_csv('GoldSample_Data_Cleaned.csv').values
gold_data = gold_data[~np.isnan(gold_data).any(axis=1)]

# GWB data, cleaned the same way.
# order of the columns: Patient ID, Date, Time, Glucose Measurement, Watch # (there are 2)
gwb_data = pd.read_csv('GWB_Data_Cleaned.csv').values
gwb_data = gwb_data[~np.isnan(gwb_data).any(axis=1)]

# CGMS data comes in a dictionary, not a CSV, due to the fact that multiple monitors are present.
# We did some hacky things with the raw data to fish out one of the sensors, but it should be fine.
# cgm_data is a dictionary already.
# The file is opened in binary mode: pickle streams are bytes, and text mode can
# corrupt them (newline translation) or fail outright on Python 3.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('pt_cgm_longest.p', 'rb') as fp:
    cgm_data = pickle.load(fp)

In [6]:
# Break up the data into patients.
# Rows of one patient that share the same value in the first remaining column
# (the grouping key) are averaged into a single row.
# Each dictionary is created from a single data source.

def _average_by_first_column(rows, column_names):
    """Average all rows that share the same value in the first column.

    rows: 2-D array-like with one column per entry of column_names.
    column_names: labels for the temporary DataFrame; the first one is the grouping key.

    Returns a 2-D ndarray with one row per distinct key (key column first),
    every other column replaced by its per-key mean.
    """
    df = pd.DataFrame(rows, columns=column_names)
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    return df.groupby(column_names[0], as_index=False).mean().values


pt_gold_data = {}
pt_cgm_data = {}
pt_gwb_data = {}

for patient_id in np.unique(gold_data[:,0]).astype(int):
    patient_indices = (gold_data[:,0]==patient_id)
    # column 0 is the patient ID; keep only the remaining columns
    pt_gold_data[patient_id] = _average_by_first_column(
        gold_data[patient_indices, 1:], ["First Col", "Second Col"])

for patient_id in np.unique(gwb_data[:,0]).astype(int):
    patient_indices = (gwb_data[:,0]==patient_id)
    pt_gwb_data[patient_id] = _average_by_first_column(
        gwb_data[patient_indices, 1:], ["First Col", "Second Col", "Third Col"])

# cgm_data is already a dictionary keyed by patient; it only lacks the
# averaging of rows that share the same first-column value.
for patient_id in cgm_data.keys():
    pt_cgm_data[patient_id] = _average_by_first_column(
        cgm_data[patient_id], ["First Col", "Second Col"])
    

In [13]:
# Interpolate gold standard data and align with cgm data
# pt_inter_cgm maps each patient ID to an array of that patient's CGM samples that
# lie inside the time window shared with the gold standard, with the interpolated
# and smoothed gold standard value appended as a third column.

verbose = False

pt_inter_cgm = {} # order of the columns: Time, CGM, BGM

for pt_ID in pt_gold_data:

    # Per-patient arrays. They are deliberately not named cgm_data / gold_data:
    # rebinding those globals would overwrite the raw inputs loaded earlier and
    # make the earlier cells impossible to re-run.
    pt_cgm = pt_cgm_data[pt_ID]
    pt_gold = pt_gold_data[pt_ID]

    # Interpolating spline through the gold standard points.
    # s=0 means no smoothing; k=1 is degree 1, i.e. piecewise linear (not cubic).
    spline = interpolate.splrep(pt_gold[:,0], pt_gold[:,1], s=0, k=1)

    # Evaluate the spline at every CGM time point (as a column vector),
    # then smooth the interpolation with a Gaussian filter.
    inter_gs = np.reshape(interpolate.splev(pt_cgm[:,0], spline, der=0), (len(pt_cgm), 1))
    smoothed_inter_gs = gaussian_filter(inter_gs, sigma=3)
    # NOTE(review): smoothing runs before the trimming below, so values evaluated
    # outside the gold standard time range can influence points near the window
    # edges -- confirm this is intended.
    aligned = np.hstack([pt_cgm, smoothed_inter_gs])

    # Interpolations are only meaningful inside the window where both sources overlap.
    time_lower_limit = max(np.min(pt_gold[:,0]), np.min(pt_cgm[:,0]))
    time_upper_limit = min(np.max(pt_gold[:,0]), np.max(pt_cgm[:,0]))

    # Keep the time points within the overlap window (bounds inclusive).
    in_window = (aligned[:,0] >= time_lower_limit) & (aligned[:,0] <= time_upper_limit)
    pt_inter_cgm[pt_ID] = aligned[in_window, :]

    # Visualize for debugging purposes
    if verbose:
        plt.scatter(pt_gold[:,0], pt_gold[:,1], color='gold')
        plt.plot(pt_inter_cgm[pt_ID][:,0], pt_inter_cgm[pt_ID][:,2], color='gold') # show interpolated gold standard
        plt.plot(pt_inter_cgm[pt_ID][:,0], pt_inter_cgm[pt_ID][:,1], color='blue') # show CGM
        plt.show()



In [14]:
# Persist the per-patient dictionaries to disk as pickle files.
#   pt_cgm_bgm.p  <- pt_inter_cgm (CGM with the interpolated true points)
#   pt_true_bgm.p <- pt_gold_data (the actual blood glucose measurements)
for out_path, payload in (('pt_cgm_bgm.p', pt_inter_cgm),
                          ('pt_true_bgm.p', pt_gold_data)):
    with open(out_path, 'wb') as fp:
        pickle.dump(payload, fp)

# A third export to 'pt_gwb_bgm.p' was sketched here but left disabled;
# as written it would have dumped pt_inter_cgm again.
# with open('pt_gwb_bgm.p', 'wb') as fp:
#     pickle.dump(pt_inter_cgm, fp)

In [15]:
# Create test and training data sets and save them with pickle

def make_test_train(data_set, percent_train, train_path='train_data.p', test_path='test_data.p'):
    """Split a dictionary into train and test parts and pickle each part.

    Entries are taken in the dictionary's iteration order: an entry goes to the
    training set while its position is below percent_train * len(data_set),
    and to the test set otherwise.

    data_set: dictionary to split (one entry per key).
    percent_train: fraction of the entries to put in the training set, e.g. 0.7.
    train_path: file the training dictionary is pickled to.
    test_path: file the test dictionary is pickled to.

    Returns None; the result is the two files written to disk.
    """
    # Size the split from the argument itself, not from a notebook-level global,
    # so the function works on whatever dictionary it is given.
    n_train = percent_train * len(data_set)
    train = {}
    test = {}
    # .items() is available on both Python 2 and Python 3 dictionaries.
    for i, (key, value) in enumerate(data_set.items()):
        if i < n_train:
            train[key] = value
        else:
            test[key] = value
    with open(train_path, 'wb') as fp:
        pickle.dump(train, fp)
    with open(test_path, 'wb') as fp:
        pickle.dump(test, fp)

In [16]:
# Reload the aligned CGM / interpolated gold standard dictionary from the pickle
# file written earlier in this notebook.
with open('pt_cgm_bgm.p', 'rb') as fp:
    data = pickle.load(fp)

# Split by dictionary entry: 0.7 is the training fraction. The call writes
# 'train_data.p' and 'test_data.p' and returns nothing.
make_test_train(data, 0.7) # split the data into testing and training data