In [1]:
import numpy as np
import pandas as pd
import chardet
from matplotlib import pyplot as plt
import os
from random import randrange
import random





In [22]:
class DataCleaning():

    def __init__(self, filename='Data/DeviceCGM.txt'):
        self.filename = filename

    def import_and_store(self):
        ''' Input is a txt file with pipe delimiter and specified columns for the specific diabetes dataset
        Returns a list of smaller CSVs (12) shards are created from the original'''
        ## file downloaded from ...
        path = self.filename

        ## detect file encoding
        #with open(path,'rb') as f:
        #  rawdata = b''.join([f.readline() for _ in range(20)])
        #  print(chardet.detect(rawdata)['encoding'])
        ##--> File encoding - UTF-16

        ## Import file to dataframe
        cgmData = pd.read_csv(self.filename, delimiter="|", encoding='UTF-16')
        
        ## Create shards for easier data management
        fileNames = []
        for id, df_i in  enumerate(np.array_split(cgmData, 12)):
            df_i.to_csv('Data/CGM_{id}.csv'.format(id=id))
            fileNames.append('Data/CGM_{id}.csv'.format(id=id))

        ## fileNames stores smaller shards of data
        return fileNames

    def openCSV(self):
        '''opens up a CSV shard(s) with given filename index and returns a processed dataframe'''
        fileNames=self.import_and_store()
        df = pd.read_csv(fileNames[0])
        for i in range(1,len(fileNames)-1):
            temp_df = pd.read_csv(fileNames[i])
            df = pd.concat([df,temp_df])
        
        ## Adding features

        df['DeviceDtTm']=pd.to_datetime(df['DeviceDtTm'])
        df['ValueMMOL']=round(df['Value']/18,1)  ## converting to Canadian standard of measurement mmol/L
        df['DDate']=pd.to_datetime(df['DeviceDtTm']).dt.date
        df['hourOfDay'] = df['DeviceDtTm'].dt.hour
        df = df[df['RecordType']=='CGM'] ## remove other record types
        ## ensuring sequence
        df['series']= df['DeviceDtTm'] >= df['DeviceDtTm'].shift() + pd.Timedelta(minutes=6)

        return df

    def resequenceData(self, filename='Data/cleaned.csv'):
        dfs = self.openCSV()
        dfs.reset_index(inplace=True)
        a = len(dfs)
        seed = np.random.randint(10000,80000)
        curr_ptid = dfs['PtID'].loc[0]
        curr_sequence = seed
        dfs['series_id']=0
        dfs['series_id']=curr_sequence
        for index in range(1,a):
            if dfs['series'].loc[index] == False and dfs['PtID'].loc[index]==curr_ptid:
                dfs['series_id'].loc[index]=curr_sequence
            else:
                curr_sequence+=1
                curr_ptid=dfs['PtID'].loc[index]
                dfs['series_id'].loc[index]=curr_sequence

        dfs.to_csv(filename)

    def seriesToTimeSeries(self, X, step_length=6,forecast_dist=6):
        y=[]
        reshapedX = []
        for i in range(len(X)-forecast_dist-step_length):
            y.append(X[i+step_length+forecast_dist])
            reshapedX.append(X[i:i+step_length])
        return reshapedX,y

    def SampleValidSequences(self, numTrainSequences=15, numTestSequences=5, filename='Data/cleaned.csv'):

        samplingDF = pd.read_csv(filename)
        new_df = samplingDF.groupby('series_id').count()
        
        valid_sequences = new_df[new_df['index']>=75].index.to_numpy()
        train_index = valid_sequences[random.sample(range(0,len(valid_sequences)),numTrainSequences)]
        test_index = valid_sequences[random.sample(range(0,len(valid_sequences)),numTestSequences)]

        
        an_X = samplingDF[samplingDF['series_id']==train_index[0]].ValueMMOL.tolist()
        an_X, y = self.seriesToTimeSeries(an_X)
        X_train=an_X
        y_train=y

        for i in train_index[1:]:
            an_X = samplingDF[samplingDF['series_id']==i].ValueMMOL.tolist()
            an_X, y = self.seriesToTimeSeries(an_X)
            
            X_train = X_train+an_X
            y_train = y_train+y

        
        an_X = samplingDF[samplingDF['series_id']==test_index[0]].ValueMMOL.tolist()
        an_X,y = self.seriesToTimeSeries(an_X)
        X_test=an_X
        y_test = y

        for i in test_index[1:]:
            an_X = samplingDF[samplingDF['series_id']==i].ValueMMOL.tolist()
            an_X, y = self.seriesToTimeSeries(an_X)
            
            X_test = X_test+an_X
            y_test = y_test+y

        # X_train_file = open('Data/X_train.txt', 'w')
        # for element in X_train:
        #     X_train_file.write(element)
        # X_train_file.close()

        return X_train, X_test, y_train, y_test

In [23]:
obj = DataCleaning()


In [None]:
obj.openCSV()

In [4]:
obj.resequenceData()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [24]:
obj.SampleValidSequences()

([[12.9, 12.9, 12.8, 12.7, 12.6, 12.1],
  [12.9, 12.8, 12.7, 12.6, 12.1, 11.8],
  [12.8, 12.7, 12.6, 12.1, 11.8, 11.7],
  [12.7, 12.6, 12.1, 11.8, 11.7, 11.3],
  [12.6, 12.1, 11.8, 11.7, 11.3, 11.7],
  [12.1, 11.8, 11.7, 11.3, 11.7, 12.7],
  [11.8, 11.7, 11.3, 11.7, 12.7, 12.8],
  [11.7, 11.3, 11.7, 12.7, 12.8, 12.8],
  [11.3, 11.7, 12.7, 12.8, 12.8, 12.9],
  [11.7, 12.7, 12.8, 12.8, 12.9, 12.7],
  [12.7, 12.8, 12.8, 12.9, 12.7, 12.1],
  [12.8, 12.8, 12.9, 12.7, 12.1, 11.1],
  [12.8, 12.9, 12.7, 12.1, 11.1, 10.4],
  [12.9, 12.7, 12.1, 11.1, 10.4, 10.2],
  [12.7, 12.1, 11.1, 10.4, 10.2, 10.6],
  [12.1, 11.1, 10.4, 10.2, 10.6, 11.3],
  [11.1, 10.4, 10.2, 10.6, 11.3, 12.6],
  [10.4, 10.2, 10.6, 11.3, 12.6, 12.6],
  [10.2, 10.6, 11.3, 12.6, 12.6, 12.4],
  [10.6, 11.3, 12.6, 12.6, 12.4, 12.6],
  [11.3, 12.6, 12.6, 12.4, 12.6, 12.9],
  [12.6, 12.6, 12.4, 12.6, 12.9, 12.8],
  [12.6, 12.4, 12.6, 12.9, 12.8, 12.4],
  [12.4, 12.6, 12.9, 12.8, 12.4, 12.8],
  [12.6, 12.9, 12.8, 12.4, 12.8, 12.9],


In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, GRU, SimpleRNN
import data_prep


In [74]:

class RunModel:

    def __init__(self,X_train, X_test, y_train, y_test):
        self.X_train = tf.convert_to_tensor(X_train)
        self.y_train = tf.convert_to_tensor(y_train)
        self.X_test = tf.convert_to_tensor(X_test)
        self.y_test = tf.convert_to_tensor(y_test)

# RMSE, MAPE
    def rnn_model(self):
        model_rnn = Sequential()
        model_rnn.add(SimpleRNN(50, activation='relu', input_shape=(6,1)))
        model_rnn.add(Dense(1))
        print('\nRunning RNN model...')
        model_rnn.compile(optimizer='adam', loss='mse', metrics='mape')
        model_rnn.fit(self.X_train, self.y_train, epochs=5, validation_split=0.2, batch_size=100)
        
        train_loss, train_mape = model_rnn.evaluate(self.X_train, self.y_train)
        print(f'RNN Model: \nTraining set has a loss (MSE) of {train_loss} with Mean Absolute Percentage Error (MAPE) of {train_mape}')

        test_loss, test_mape = model_rnn.evaluate(self.X_test, self.y_test)
        print(f'Test set has a loss (MSE) of {test_loss} with Mean Absolute Percentage Error (MAPE) of {test_mape}\n')


    def lstm_model(self):
        model_lstm = Sequential()
        model_lstm.add(Bidirectional(LSTM(50, activation='relu'), input_shape=(6,1)))
        model_lstm.add(Dense(1))
        print('\nRunning the LSTM model...')
        model_lstm.compile(optimizer='adam', loss='mse', metrics='mape')
        model_lstm.fit(self.X_train, self.y_train, epochs=5, validation_split=0.2, batch_size=100)
        
        train_loss, train_mape = model_lstm.evaluate(self.X_train, self.y_train)
        print(f'LSTM Model: \nTraining set has a loss (MSE) of {train_loss} with Mean Absolute Percentage Error (MAPE) of {train_mape}')

        test_loss, test_mape = model_lstm.evaluate(self.X_test, self.y_test)
        print(f'Test set has a loss (MSE) of {test_loss} with Mean Absolute Percentage Error (MAPE) of {test_mape}\n')


    def gru_model(self):
        model_gru = Sequential()
        model_gru.add(GRU(50, activation='relu', input_shape=(6,1)))
        model_gru.add(Dense(1))
        print('\nRunning RNN model...')
        model_gru.compile(optimizer='adam', loss='mse', metrics='mape')
        model_gru.fit(self.X_train, self.y_train, epochs=5, validation_split=0.2, batch_size=100)
        
        train_loss, train_mape = model_gru.evaluate(self.X_train, self.y_train)
        print(f'GRU Model: \nTraining set has a loss (MSE) of {train_loss} with Mean Absolute Percentage Error (MAPE) of {train_mape}')

        test_loss, test_mape = model_gru.evaluate(self.X_test, self.y_test)
        print(f'Test set has a loss (MSE) of {test_loss} with Mean Absolute Percentage Error (MAPE) of {test_mape}\n')


    def run_all_models(self):
        self.rnn_model()
        self.lstm_model()
        self.gru_model()


if __name__ == '__main__':
    clean_data = data_prep.DataCleaning()
    X_train, X_test, y_train, y_test = clean_data.SampleValidSequences()

    model_obj = RunModel(X_train, X_test, y_train, y_test)
    model_obj.run_all_models()

        


Running RNN model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
RNN Model: 
Training set has a loss (MSE) of 4.528862476348877 with Mean Absolute Percentage Error (MAPE) of 16.478273391723633
Test set has a loss (MSE) of 4.168188095092773 with Mean Absolute Percentage Error (MAPE) of 14.481978416442871


Running the LSTM model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Model: 
Training set has a loss (MSE) of 4.53615140914917 with Mean Absolute Percentage Error (MAPE) of 17.669544219970703
Test set has a loss (MSE) of 4.203997611999512 with Mean Absolute Percentage Error (MAPE) of 15.933649063110352


Running RNN model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
GRU Model: 
Training set has a loss (MSE) of 4.36659049987793 with Mean Absolute Percentage Error (MAPE) of 16.933208465576172
Test set has a loss (MSE) of 3.9616358280181885 with Mean Absolute Percentage Error (MAPE) of 15.032307624816895

