In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm
from imblearn.over_sampling import RandomOverSampler

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, roc_curve

from matplotlib.dates import DateFormatter
dateformat = DateFormatter(fmt = '%H:%M:%S:%f')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def import_data(patient_num):
    """Load one training patient's raw readings, flag labelled sgb rows, and clean them.

    Steps, in order:
      1. Read the labelled event times (the "centered" file plus the
         double-check rows whose ``Review`` value is ``'Yes'``).
      2. Read the raw readings and outer-merge the labels onto them by ``Time``.
      3. Drop every reading that falls inside a ``'Meal'`` event
         (start and end timestamps inclusive).
      4. Divide ``Channel_1`` .. ``Channel_6`` by 10000.
      5. Apply the per-patient repairs for patients 008, 009 and 010.

    Parameters
    ----------
    patient_num : str
        Patient identifier as used in the file names, e.g. ``'008'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned readings. ``sgb`` is a boolean column that is ``True`` on
        rows whose ``Time`` matches a labelled event and ``False`` elsewhere.
    """
    channels = [f'Channel_{i}' for i in range(1, 7)]

    # First, get the labels
    labels = pd.read_csv(f'../data/centered/original/centered_{patient_num}.csv')

    others = pd.read_csv(f'../data/centered/reviewed/dc_{patient_num}.csv')
    others = others[others['Review'].str.strip() == 'Yes']

    sgb_times = pd.concat([labels[['Time']], others[['Time']]])
    sgb_times['Time'] = pd.to_datetime(sgb_times['Time'])
    sgb_times['sgb'] = True

    # Then the raw data
    readings = pd.read_csv(f'../data/Completed file set Oct13/{patient_num} RAW 00.csv')
    readings['Time'] = pd.to_datetime(readings['Time'])
    readings['CapturedTime'] = readings['Time'].dt.time

    # Combine them to label the event times. The merge key is 'Time', the only
    # column the two frames share.
    readings = pd.merge(
        left=readings,
        right=sgb_times[['Time', 'sgb']],
        how='outer',
    )
    # Only the label column is filled, and with a real boolean. The previous
    # `.fillna('False')` wrote the *string* 'False' into every missing cell of
    # every column, leaving `sgb` as a mixed object column and turning any
    # missing measurement into text.
    readings['sgb'] = readings['sgb'].eq(True)

    # Then remove the meal times
    events = pd.read_csv(f'../data/Completed file set Oct13/{patient_num} EVT 00.csv')
    events = events[events['EventName'] == 'Meal'].copy()
    events['Time'] = pd.to_datetime(events['Time'])
    events['Duration'] = pd.to_timedelta(events['Duration'])
    events['End'] = events['Time'] + events['Duration']

    keep = pd.Series(True, index=readings.index)
    for start, end in zip(events['Time'], events['End']):
        # A reading survives only if it lies strictly outside every meal.
        keep &= (readings['Time'] < start) | (readings['Time'] > end)
    # .copy() so the column assignment below does not write into a slice.
    readings = readings[keep].copy()

    # Scale the six channels by 1/10000 (intended to bring them into 0..1)
    readings[channels] = readings[channels] / 10000

    # There is a section for patient 008 at the end which is all maxed out
    if patient_num == '008':
        readings = readings.loc[:837833]

    # There is a missing segment of time for 009, so we need to repair the index
    if patient_num == '009':
        readings.index = [x + 4 if x >= 276 else x for x in readings.index]

    # There is a section for patient 010 at the end which is all maxed out
    if patient_num == '010':
        readings = readings.loc[:731110]

    return readings

In [3]:
def prepare_training(patient_num):
    """Append one patient's training windows to the module-level training lists.

    Loads the patient's readings with ``import_data`` and cuts 20-row,
    6-channel windows out of them:

    * Negative windows (label 0) start at index labels divisible by 10 whose
      distance to both the next and the previous labelled sgb row is not
      below 2 seconds. A window is kept only when it is complete and its
      summed absolute row-to-row change exceeds 1. The number kept is printed.
    * Positive windows (label 1) start at every labelled sgb row; the list of
      start labels is repeated 10 times, so each complete positive window is
      appended 10 times.

    Nothing is returned. Results are appended to the global lists ``X_train``,
    ``y_train`` and ``train_indices``, which must exist before the call.
    """
    window_len = 20
    start_shift = 0
    channel_cols = ['Channel_' + str(j) for j in range(1, 7)]

    readings = import_data(patient_num)
    is_sgb = readings['sgb'] == True

    # Time of the nearest labelled row after / before each reading.
    readings.loc[is_sgb, 'next_sgb'] = readings.loc[is_sgb, 'Time']
    readings['next_sgb'] = readings['next_sgb'].bfill()
    readings.loc[is_sgb, 'last_sgb'] = readings.loc[is_sgb, 'Time']
    readings['last_sgb'] = readings['last_sgb'].ffill()

    # Convert both to a distance in seconds from the reading itself.
    readings['next_sgb'] = (readings['next_sgb'] - readings['Time']).dt.total_seconds()
    readings['last_sgb'] = (readings['Time'] - readings['last_sgb']).dt.total_seconds()

    away_from_events = ~(readings['next_sgb'] < 2) & ~(readings['last_sgb'] < 2)
    potential_training_index = readings[away_from_events].index

    nonsgb = [label for label in potential_training_index if label % 10 == 0]
    sgb = readings[is_sgb].index.tolist()

    def window_at(start):
        # Label-based slice: comes back shorter than window_len wherever rows
        # were removed or the frame ends.
        return readings.loc[start:start + window_len - 1][channel_cols].values

    keeps = 0
    for idx in tqdm(nonsgb):
        start = idx - start_shift
        window = window_at(start)
        if window.shape != (window_len, 6):
            continue
        if not np.abs(np.diff(window, axis=0)).sum() > 1:
            continue
        X_train.append(window)
        y_train.append(0)
        train_indices.append(start)
        keeps += 1

    print(keeps)

    for idx in tqdm(sgb * 10):
        start = idx - start_shift
        window = window_at(start)
        if window.shape != (window_len, 6):
            continue
        X_train.append(window)
        y_train.append(1)
        train_indices.append(start)

In [4]:
# Module-level accumulators that prepare_training() appends to.
X_train, y_train, train_indices = [], [], []

# (patient id, sampling strategy) pairs. The sampling strategy value is carried
# along but not passed to prepare_training() in this cell.
patient_sampling = [
    ('001', 1/10), ('002', 1/50), ('003', 1/10), ('004', 1/10),
    ('005', 1/10), ('006', 1/10), ('007', 1/10), ('008', 1/10),
    ('009', 1/10), ('010', 1/50), ('011', 1/10), ('012', 1/10),
    ('013', 1/10),
]

for patient_num, sampling_strategy in patient_sampling:
    prepare_training(patient_num)

  0%|          | 0/77053 [00:00<?, ?it/s]

10489


0it [00:00, ?it/s]

  0%|          | 0/68301 [00:00<?, ?it/s]

8048


  0%|          | 0/4030 [00:00<?, ?it/s]

  0%|          | 0/79790 [00:00<?, ?it/s]

2887


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/72927 [00:00<?, ?it/s]

1550


  0%|          | 0/290 [00:00<?, ?it/s]

  0%|          | 0/74816 [00:00<?, ?it/s]

5200


  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/71162 [00:00<?, ?it/s]

2738


  0%|          | 0/180 [00:00<?, ?it/s]

  0%|          | 0/76775 [00:00<?, ?it/s]

2139


  0%|          | 0/110 [00:00<?, ?it/s]

  0%|          | 0/78569 [00:00<?, ?it/s]

9529


  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/76244 [00:00<?, ?it/s]

6390


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/66589 [00:00<?, ?it/s]

7734


  0%|          | 0/1290 [00:00<?, ?it/s]

  0%|          | 0/78031 [00:00<?, ?it/s]

6417


  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/78105 [00:00<?, ?it/s]

6531


  0%|          | 0/190 [00:00<?, ?it/s]

  0%|          | 0/73879 [00:00<?, ?it/s]

9328


  0%|          | 0/1400 [00:00<?, ?it/s]

In [5]:
# Binary classifier over (20, 6, 1) windows: three Conv2D(16, 3x6, 'same', ReLU)
# stages, each followed by a (2, 1) max-pool, then two Dense(16, ReLU) layers
# and a single sigmoid output unit.
conv_options = dict(activation='relu', padding='same')

model = models.Sequential([
    layers.Conv2D(
        16,
        (3, 6),
        input_shape=(20, 6, 1),
        data_format='channels_last',
        **conv_options
    ),
    # Author's note on the pooling layer below: it "wasn't in there before".
    layers.MaxPooling2D(pool_size=(2, 1)),
    layers.Conv2D(16, (3, 6), **conv_options),
    layers.MaxPooling2D(pool_size=(2, 1)),
    layers.Conv2D(16, (3, 6), **conv_options),
    layers.MaxPooling2D(pool_size=(2, 1)),
    layers.Flatten(),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [6]:
# Stack the collected windows into one array and add a trailing axis of size 1
# so the shape matches the model's (rows, 6, 1) input.
train_inputs = np.array(X_train).reshape(
    len(X_train),
    X_train[0].shape[0],
    6,
    1,
)
train_targets = np.array(y_train)

history = model.fit(train_inputs, train_targets, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [8]:
# Persist the trained model to the models directory (path is relative to the notebook).
model.save('../models/2024_11_28_model')

In [8]:
def import_training_data(patient_num):
    """Read a patient's marked labels and raw readings from the new_data folders.

    The readings get a parsed ``Time`` column and a ``CapturedTime`` column
    (time of day), and every row falling inside a ``'Meal'`` event (start and
    end inclusive) is removed. Unlike ``import_data``, no label merge, channel
    scaling or patient-specific trimming is applied here.

    Parameters
    ----------
    patient_num : str
        Patient identifier as used in the file names.

    Returns
    -------
    tuple
        ``(sgb, readings)``: the ``labels`` column of the label file as a
        list, and the filtered readings DataFrame.
    """
    # Labels first
    label_table = pd.read_csv(f'../data/new_data/marked/labels/labels_{patient_num}.csv')
    sgb = label_table['labels'].tolist()

    # Raw readings
    raw = pd.read_csv(f'../data/new_data/eval_data/{patient_num} RAW 00.csv')
    raw['Time'] = pd.to_datetime(raw['Time'])
    raw['CapturedTime'] = raw['Time'].dt.time

    # Meal intervals, each [start, start + duration]
    meals = pd.read_csv(f'../data/new_data/eval_data/{patient_num} EVT 00.csv')
    meals = meals[meals['EventName'] == 'Meal']
    meal_starts = pd.to_datetime(meals['Time'])
    meal_ends = meal_starts + pd.to_timedelta(meals['Duration'])

    for start, end in zip(meal_starts, meal_ends):
        outside_meal = (raw['Time'] < start) | (raw['Time'] > end)
        raw = raw[outside_meal]

    return sgb, raw

In [10]:
def import_data(patient_num):
    """Load one evaluation patient's raw readings, flag labelled sgb rows, and clean them.

    NOTE: this redefines the ``import_data`` declared earlier in the notebook.
    After this cell runs, every call (including ``prepare_training``) reads
    from the ``new_data`` folders.

    Steps, in order:
      1. Read the labelled event times from the "centered" file. Reviewed
         double-check labels are not merged in for this data set.
      2. Read the raw readings and outer-merge the labels onto them by ``Time``.
      3. Drop every reading that falls inside a ``'Meal'`` event
         (start and end timestamps inclusive).
      4. Divide ``Channel_1`` .. ``Channel_6`` by 10000.

    Parameters
    ----------
    patient_num : str
        Patient identifier as used in the file names, e.g. ``'89'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned readings. ``sgb`` is a boolean column that is ``True`` on
        rows whose ``Time`` matches a labelled event and ``False`` elsewhere.
    """
    channels = [f'Channel_{i}' for i in range(1, 7)]

    # First, get the labels. .copy() gives an independent frame, so adding
    # columns below no longer writes into a slice of `labels`.
    labels = pd.read_csv(f'../data/new_data/centered/centered_{patient_num}.csv')
    sgb_times = labels[['Time']].copy()
    sgb_times['Time'] = pd.to_datetime(sgb_times['Time'])
    sgb_times['sgb'] = True

    # Then the raw data
    readings = pd.read_csv(f'../data/new_data/eval_data/{patient_num} RAW 00.csv')
    readings['Time'] = pd.to_datetime(readings['Time'])
    readings['CapturedTime'] = readings['Time'].dt.time

    # Combine them to label the event times. The merge key is 'Time', the only
    # column the two frames share.
    readings = pd.merge(
        left=readings,
        right=sgb_times[['Time', 'sgb']],
        how='outer',
    )
    # Only the label column is filled, and with a real boolean. The previous
    # `.fillna('False')` wrote the *string* 'False' into every missing cell of
    # every column.
    readings['sgb'] = readings['sgb'].eq(True)

    # Then remove the meal times
    events = pd.read_csv(f'../data/new_data/eval_data/{patient_num} EVT 00.csv')
    events = events[events['EventName'] == 'Meal'].copy()
    events['Time'] = pd.to_datetime(events['Time'])
    events['Duration'] = pd.to_timedelta(events['Duration'])
    events['End'] = events['Time'] + events['Duration']

    keep = pd.Series(True, index=readings.index)
    for start, end in zip(events['Time'], events['End']):
        # A reading survives only if it lies strictly outside every meal.
        keep &= (readings['Time'] < start) | (readings['Time'] > end)
    # .copy() so the column assignment below does not write into a slice.
    readings = readings[keep].copy()

    # Scale the six channels by 1/10000 (intended to bring them into 0..1)
    readings[channels] = readings[channels] / 10000
    return readings

In [78]:
# Patient evaluated by the cells below; also used to build the output paths.
patient_num = '89'
# Uses whichever import_data definition was executed most recently in the kernel.
readings = import_data(patient_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [79]:
# Slide a 20-row window over every index label of `readings`. A window is kept
# when it is complete (20 x 6) and its summed absolute row-to-row change
# exceeds 1; its start label is recorded alongside it.
offset = 0
duration = 20
channel_cols = ['Channel_' + str(j) for j in range(1, 7)]

X_test = []
test_indices = []

for idx in tqdm(readings.index.tolist()):
    start = idx - offset
    window = readings.loc[start:start + duration - 1][channel_cols].values

    if window.shape != (duration, 6):
        continue
    if not np.abs(np.diff(window, axis=0)).sum() > 1:
        continue

    X_test.append(window)
    test_indices.append(start)

  0%|          | 0/769667 [00:00<?, ?it/s]

In [80]:
# Shape the evaluation windows to the model's own input dimensions. The window
# length used to be read from X_train[0], which made this cell fail whenever the
# training cells had not been run in the current kernel (e.g. with a loaded model).
X_test_array = np.array(X_test).reshape((len(X_test),) + tuple(model.input_shape[1:]))
predictions = model.predict(X_test_array)

In [90]:
def plot_event(idx, duration = 50, shift = 15, filepath = None):
    """Plot the six channels of the global ``readings`` around index label ``idx``.

    The plotted span covers index labels ``idx - shift`` through
    ``idx - shift + duration - 1``. Channel values are multiplied by 10000
    before plotting and the y-axis is fixed to (-100, 10300).

    Parameters
    ----------
    idx : int
        Index label in ``readings`` that the plot is positioned around.
    duration : int, default 50
        Number of index labels spanned by the plot.
    shift : int, default 15
        How many labels before ``idx`` the plot starts.
    filepath : str or None, default None
        When truthy, the figure is saved there (150 dpi, white background)
        and then closed. Otherwise it is left open for display.
    """
    first = idx - shift
    last = first + duration - 1
    channel_names = ['Channel_' + str(i) for i in range(1, 7)]

    traces = readings.loc[first:last][channel_names].values * 10000
    times = readings.loc[first:last, 'Time']

    fig, ax = plt.subplots(figsize = (12,6))

    for column, name in enumerate(channel_names):
        ax.plot(times, traces[:, column], label = name)

    ax.set_ylim(-100, 10300)
    ax.xaxis.set_major_formatter(dateformat)
    ax.legend(bbox_to_anchor = (1, 0.5), loc = 'center left')

    if not filepath:
        return

    fig.tight_layout()
    fig.savefig(filepath,
                dpi = 150,
                transparent = False,
                facecolor = 'white')
    plt.close(fig)

In [82]:
# Start labels of the windows whose predicted probability exceeds 0.5.
prediction_indices = [test_indices[x] for x in np.where(predictions.flatten() > 0.5)[0]]

# Labelled event rows. The list (in index order) is kept for later cells; the
# set gives O(1) membership tests inside the double loop below, where the list
# was previously scanned once per lookup.
true_indices = readings[readings['sgb'] == True].index.tolist()
true_index_set = set(true_indices)

true_positives = []
false_positives = []
for idx in tqdm(prediction_indices):
    fp = True
    # Look up to 14 labels either side of the prediction. Every distance is
    # checked, so one prediction can match several labelled events.
    for i in range(15):
        # At a given distance the earlier label wins: idx + i is only checked
        # when idx - i is not a labelled event. This matches the previous
        # `continue`-based flow exactly.
        if idx - i in true_index_set:
            true_positives.append(idx - i)
            fp = False
        elif idx + i in true_index_set:
            true_positives.append(idx + i)
            fp = False
    if fp:
        false_positives.append(idx)

# Distinct labelled events that were matched by at least one prediction.
true_positives = set(true_positives)

  0%|          | 0/1381 [00:00<?, ?it/s]

In [83]:
# Number of distinct labelled events matched by at least one prediction.
len(true_positives)

467

In [84]:
# Number of rows in `readings` flagged as labelled sgb events.
len(true_indices)

604

In [85]:
# Number of test windows whose predicted probability exceeded 0.5.
len(prediction_indices)

1381

In [86]:
# Number of positive predictions with no labelled event within the matching range.
len(false_positives)

103

In [87]:
# Collapse runs of near-adjacent false positives: a row is dropped when its
# index is 1 to 5 labels after the previous false positive, so only the first
# prediction of each run remains. `false_positives` changes from a list to a
# DataFrame here, so this cell cannot be re-run on its own.
fp_table = pd.DataFrame({'fp_index': false_positives})
fp_table = fp_table.assign(diff=fp_table['fp_index'].diff())

continues_previous_run = fp_table['diff'].isin(list(range(1, 6)))
false_positives = fp_table[~continues_previous_run]

In [88]:
# Number of false positives left after dropping rows 1-5 labels after the previous one.
len(false_positives)

51

In [91]:
# Save one image per remaining false positive, numbered fp_000, fp_001, ...
# in the order of the `false_positives` rows.
for i, fp in enumerate(false_positives['fp_index'].to_list()):
    plot_event(fp, filepath=f'../data/new_data/images/images_{patient_num}/fp/fp_{str(i).zfill(3)}.png')

In [92]:
# Build the false-positive manifest: one row per saved image, holding the
# patient id, image name and the reading's timestamp.
# The two reset_index calls produce a fresh frame with a 0..n-1 'index' column,
# so the column assignments below no longer write into a slice of
# `false_positives` (chained assignment).
fp_df = false_positives[['fp_index']].reset_index(drop = True).reset_index()
fp_df['patient_num'] = patient_num
# Same numbering as the image files written by the plotting loop.
fp_df['image_number'] = 'fp_' + fp_df['index'].astype('string').str.zfill(3)
fp_df['timestamp'] = readings.loc[fp_df['fp_index'].tolist(), 'Time'].tolist()
fp_df = fp_df[['patient_num', 'image_number', 'timestamp']]
fp_df.to_csv(f'../data/new_data/images/images_{patient_num}/fp.csv', index = False)

In [93]:
# Labelled events that no prediction matched, in index order (true_positives is a set here).
false_negatives = [x for x in true_indices if x not in true_positives]

In [94]:
# Save one image per false negative, numbered fn_000, fn_001, ...
# NOTE: the loop variable is named `fp` but holds a false-negative index here.
for i, fp in enumerate(false_negatives):
    plot_event(fp, filepath=f'../data/new_data/images/images_{patient_num}/fn/fn_{str(i).zfill(3)}.png')

In [95]:
# Build the false-negative manifest: one row per saved image, holding the
# patient id, image name and the reading's timestamp.
# NOTE: the working column is called 'fp_index' although it holds
# false-negative indices; it is dropped before the file is written.
fn_df = (
    pd.DataFrame({'fp_index': false_negatives})
    .assign(patient_num = patient_num)
    .reset_index(drop = True)
    .reset_index()
)

fn_image_names = 'fn_' + fn_df['index'].astype('string').str.zfill(3)
fn_timestamps = readings.loc[fn_df['fp_index'].tolist(), 'Time'].tolist()

fn_df['image_number'] = fn_image_names
fn_df['timestamp'] = fn_timestamps
fn_df = fn_df[['patient_num', 'image_number', 'timestamp']]
fn_df.to_csv(f'../data/new_data/images/images_{patient_num}/fn.csv', index = False)

In [59]:
# Write the timestamps of the false positives and false negatives to one CSV
# each, with 'Time' as the only column.
fp_times = readings.loc[false_positives['fp_index'], ['Time']]
fp_times.to_csv(f'../data/new_data/images/images_{patient_num}/fp_{patient_num}.csv', index = False)

fn_times = readings.loc[false_negatives, ['Time']]
fn_times.to_csv(f'../data/new_data/images/images_{patient_num}/fn_{patient_num}.csv', index = False)

In [58]:
# Display the full reading rows for every false negative.
readings.loc[false_negatives]

Unnamed: 0,Time,ElapsedTime,Channel_1,Channel_2,Channel_3,Channel_4,Channel_5,Channel_6,Channel_7,Channel_8,CapturedTime,sgb
345,2017-11-06 15:25:29.500,00:00:34.50,0.418052,0.422936,0.169712,0.149688,0.133572,0.115013,6.89,5.14,15:25:29.500000,True
2541,2017-11-06 15:29:09.100,00:04:14.10,0.244434,0.284725,0.135281,0.110862,0.094257,0.116967,6.77,1.99,15:29:09.100000,True
97364,2017-11-06 18:07:11.400,02:42:16.40,0.912536,0.875175,0.935001,0.387284,0.351389,0.999956,4.47,5.75,18:07:11.400000,True
99075,2017-11-06 18:10:02.500,02:45:07.50,0.179235,0.228073,0.17313,0.121851,0.12942,0.100606,6.18,5.57,18:10:02.500000,True
142284,2017-11-06 19:22:03.400,03:57:08.40,0.182898,0.253224,0.120141,0.056408,0.04835,0.045419,3.33,4.07,19:22:03.400000,True
188798,2017-11-06 20:39:34.800,05:14:39.80,0.46103,0.43246,0.328191,0.252736,0.157258,0.098653,6.0,4.4,20:39:34.800000,True
216228,2017-11-06 21:25:17.800,06:00:22.80,0.307923,0.441495,0.069594,0.048105,0.045663,0.052257,6.29,4.02,21:25:17.800000,True
225601,2017-11-06 21:40:55.100,06:16:00.10,0.347237,0.45175,0.298155,0.201945,0.260306,0.224654,6.08,3.49,21:40:55.100000,True
227396,2017-11-06 21:43:54.500,06:18:59.50,0.326237,0.325016,0.288144,0.427087,0.336737,0.140165,6.32,4.06,21:43:54.500000,True
227849,2017-11-06 21:44:39.800,06:19:44.80,0.193154,0.328679,0.476414,0.364331,0.235399,0.147002,6.34,3.74,21:44:39.800000,True
