In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py

# Loading the dataset in a matrix X

In [None]:
"""
X is stored in a h5 file. 
There are 4400 different samples. 
Each sample is made of 8 concatenated signals.
Each signal is 90 sec window, sampled with a 100Hz frequency (ie 9000 values.)
(so size of one sample is 9000*8= 72000 values)


- 0: sample index
- 1: subject index
- 2 to 9001: Abdominal belt
- 9002 to 18001: Airflow
- 18002 to 27001: PPG (Photoplethysmogram)
- 27002 to 36001: Thoracic belt
- 36002 to 45001: Snoring indicator
- 45002 to 54001: SPO2
- 54002 to 63001: C4-A1
- 63002 to 72001:O2-A1
"""

X = h5py.File('X_train.h5', 'r')  
print(X.keys())

X = X['data'][:] # convert h5 file to a numpy array
X = X[:,2:] #remove patient id and index (useless)
X.shape

# Visualize the sleep apnea events

In [None]:
# y holds one row per sample (4400 rows). Each row is a binary mask covering
# the 90 s window: 1 marks sleep apnea, 0 marks normal sleep.

# Read the CSV and drop its first column (a row index, not a label).
y = pd.read_csv('y_train.csv').iloc[:, 1:]
y.shape

In [None]:
import seaborn as sns               #visualising sleeping apneas spread

# Heatmap of the label matrix: one row per sample, one column per mask value.
plt.figure(figsize=(20, 20))
sns.heatmap(data=y, cbar=False)

# Reshaping X for the 1D convolutional network: shape (4400, 8, 9000) to preserve the temporal structure

In [None]:
# Transpose so that each column of X holds one sample.
# Caution: this rebinds X, so running the cell a second time flips it back.
X = np.transpose(X)
X.shape

In [None]:
# At this point X is (72000, n_samples): one column per sample, made of
# 8 signals of 9000 values laid end to end. Transposing back and reshaping in
# one shot yields (n_samples, 8, 9000) without a per-sample Python loop and
# without hard-coding the number of samples. float64 matches the dtype the
# previous np.zeros-based buffer produced.
X_reshape = X.T.reshape(-1, 8, 9000).astype(np.float64, copy=False)

X_reshape.shape

# Normalizing each row of each sample. The 8 signals differ in range.

In [None]:
"""BEFORE NORMALIZATION """
plt.figure(figsize=(20,5))
plt.plot(X_reshape[1,1,:])

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


def normalisation(X):
    """Return a copy of the 2-D matrix ``X`` with every ROW scaled on its own.

    ``StandardScaler`` operates column-wise, so the matrix is transposed
    before fitting and transposed back afterwards. The module-level
    ``scaler`` is re-fitted on every call.
    """
    scaled_columns = scaler.fit_transform(X.T)
    return scaled_columns.T

# Normalise every sample independently; inside a sample each of the signals
# (rows) is scaled on its own. Sizes are taken from X_reshape rather than
# hard-coded so the cell works for any number of samples.
X_norm = np.zeros(X_reshape.shape)

for i in range(X_reshape.shape[0]):
    X_norm[i] = normalisation(X_reshape[i])

In [None]:
""" AFTER NORMALIZATION """
plt.figure(figsize=(20,5))
plt.plot(X_norm[1,1,:])


# Train and test.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples for evaluation; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_norm,
    y,
    test_size=0.2,
    random_state=0,
)

# Print the shapes in the order: X_train, y_train, X_test, y_test.
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

# Building the network

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Keras Conv1D is channels-last: it expects inputs shaped (steps, channels) and
# slides its kernel along `steps`. Samples here are stored as (8 signals,
# 9000 time points), so feeding them directly made the network convolve across
# the 8 signals and treat the 9000 time points as channels. The Permute layer
# swaps the two axes inside the model, so the data fed to fit/predict keeps its
# (8, 9000) layout while the convolutions now run along time.
model = keras.Sequential([

    # (signals, time) -> (time, signals) = (9000, 8)
    layers.Permute((2, 1), input_shape=(8, 9000)),

#BASE
    layers.Conv1D(filters=64, kernel_size=8, strides=1, padding='same'),
    layers.Activation('relu'),
    # Down-sample in time so the dense head stays tractable: 9000 -> 900 steps.
    layers.MaxPooling1D(pool_size=10),

    layers.Conv1D(filters=128, kernel_size=5, strides=1, padding='same'),
    layers.Activation('relu'),
    # 900 -> 90 steps, i.e. one step per value of the 90-long target mask.
    layers.MaxPooling1D(pool_size=10),


#HEAD
    layers.Flatten(),
    # One sigmoid unit per value of the target mask.
    layers.Dense(90, activation='sigmoid')
])

model.summary()

In [None]:
# Loss: binary cross-entropy. Each of the 90 outputs is treated as its own
# binary classification; Keras averages the loss over the outputs and over the
# samples of a batch.

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy'],
)


In [None]:
# Train the network. The held-out split is passed as validation data, so its
# loss and accuracy are recorded after every epoch.

record = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    batch_size=100,
    validation_data=(X_test, y_test),
)

In [None]:
# Plot the learning curves recorded by model.fit.

history_df = pd.DataFrame(record.history)

history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot()

# The best loss is the minimum, but the best accuracy is the MAXIMUM (the
# previous code reported the worst accuracy). The two messages are now also
# separated by a newline instead of being fused together.
print("Best Validation Loss: {}\nBest Validation accuracy: {}".format(
    history_df['val_loss'].min(),
    history_df['val_binary_accuracy'].max(),
))

# Predictions

In [None]:
# Slicing with 18:19 (rather than indexing with 18) keeps the leading batch axis.
X_norm[18:19].shape

In [None]:
# Predict on a batch containing only sample 18 and plot the predicted mask.
pred_example = model.predict(X_norm[18:19])
print(pred_example.shape)
plt.plot(np.transpose(pred_example))

# Same preprocessing on X_true

In [None]:
# Load the challenge set to predict on. Reading inside a context manager
# closes the HDF5 handle once the data is in memory (previously left open).
with h5py.File('X_test.h5', 'r') as test_file:
    X_true = test_file['data'][:]

X_true = X_true[:, 2:]  # drop the two leading columns, as done for X
X_true.shape

In [None]:
# Keep X_true in the same transposed (values, samples) layout as before.
X_true = X_true.T

# Each column is one sample made of 8 signals of 9000 values laid end to end.
# A single reshape replaces the per-sample loop and no longer assumes that
# there are exactly 4400 samples; float64 matches the former np.zeros buffer.
X_true_reshape = X_true.T.reshape(-1, 8, 9000).astype(np.float64, copy=False)

X_true_reshape.shape

In [None]:
# Same per-sample, per-signal normalisation as for the training data. Sizes
# come from X_true_reshape instead of a hard-coded 4400.
X_true_norm = np.zeros(X_true_reshape.shape)

for i in range(X_true_reshape.shape[0]):
    X_true_norm[i] = normalisation(X_true_reshape[i])

# Prediction for X_true

In [None]:
# Run the trained network on the normalised challenge set.
y_pred = model.predict(x=X_true_norm)
y_pred.shape

In [None]:
# Heatmap of the raw predictions, as a visual sanity check.
plt.figure(figsize=(20, 20))
sns.heatmap(data=y_pred, cbar=False)

# Output for the challenge & decision rule

In [None]:
# Load the benchmark file as an example of the expected submission layout;
# it is used below as the template that receives our predictions.
y_submission = pd.read_csv(filepath_or_buffer='y_benchmark.csv')
y_submission



In [None]:
# Decision rule: a prediction strictly above 0.5 becomes 1, anything else 0.
# A single vectorised assignment replaces the nested loops, which performed
# one scalar .iloc write per cell. Column 0 of the template is left untouched,
# hence the +1 offset on the column range.
n_rows, n_cols = y_pred.shape
y_submission.iloc[:n_rows, 1:n_cols + 1] = (y_pred > 0.5).astype(int)


In [None]:
# Heatmap of the thresholded predictions (first column of the template skipped).
plt.figure(figsize=(20, 20))
sns.heatmap(data=y_submission.iloc[:, 1:])

In [None]:
# Write the submission to disk; index=False keeps the pandas row index out of
# the CSV.
y_submission.to_csv(path_or_buf='essai de prediction2.csv', index=False)

# ##########   PERSONAL NOTES, DO NOT READ   ####################


In [None]:
# Short display names of the 8 signals; used as y-axis labels when plotting.
SIGNALS_NAME = [
    "AbdoBelt", "AirFlow", "PPG", "ThorBelt",
    "Snoring", "SPO2", "C4A1", "O2A1",
]


def extract_events_from_binary_mask(binary_mask, fs=1):
    """Convert a 1-D binary mask into a list of ``(start, end)`` events.

    Parameters
    ----------
    binary_mask : array-like of 0/1 (or bool)
        One value per mask step. Any 1-D array-like is accepted (numpy array,
        list, ...).
    fs : number, default 1
        Mask steps per time unit; indices are divided by ``fs``.

    Returns
    -------
    list of (start, end) tuples
        ``start`` is the index of the first 1 of a run and ``end`` the index
        just past its last 1, both divided by ``fs``. Empty if the mask
        contains no 1.
    """
    # Pad with a zero on both sides so that runs touching either edge of the
    # mask still produce a rising (+1) and a falling (-1) edge.
    padded = np.concatenate(([0], np.asarray(binary_mask), [0]))
    diff_data = np.diff(padded)
    starts = np.where(diff_data == 1)[0] / fs
    ends = np.where(diff_data == -1)[0] / fs

    assert len(starts) == len(ends)
    return list(zip(starts, ends))


def visualize_signal_and_event(X, mask, signals_name=SIGNALS_NAME, signal_freq=100):
    """Plot every signal of one sample and shade the events found in ``mask``.

    Parameters
    ----------
    X : 2-D array, one row per signal
        Signal values; row ``i`` is drawn on the i-th subplot.
    mask : 1-D binary array
        Event mask passed to ``extract_events_from_binary_mask`` with its
        default ``fs``; each resulting (start, end) span is shaded in red on
        every subplot.
    signals_name : sequence of str
        Y-axis labels, one per signal. Signals beyond the end of this
        sequence are left unlabelled.
    signal_freq : number, default 100
        Sampling frequency of the signals, used to build the x axis.
    """
    n_signal = X.shape[0]
    # squeeze=False always returns a 2-D array of Axes; without it a
    # single-signal input yields a bare Axes object and axs[i] fails.
    fig, axs = plt.subplots(n_signal, sharex=True, squeeze=False)
    axs = axs[:, 0]
    events = extract_events_from_binary_mask(mask)
    for i in range(n_signal):
        time_axis = np.arange(0, X[i].shape[0]) / signal_freq
        axs[i].plot(time_axis, X[i])
        if i < len(signals_name):
            axs[i].set_ylabel(signals_name[i])
        for start, end in events:
            axs[i].axvspan(start, end, color='red', alpha=0.3)
    plt.xlim(0, X[0].shape[0] / signal_freq)
    plt.show()


def visualise_index(idx, data_h5, masks, N_signals=8):
    """Plot the signals of sample ``idx`` together with its event mask.

    The sample is read from ``data_h5['data']`` (first two columns skipped)
    and split into ``N_signals`` rows of equal length; the mask is row ``idx``
    of ``masks`` without its first column.
    """
    flat_sample = data_h5['data'][idx, 2:]
    signals = flat_sample.reshape(N_signals, -1)
    sample_mask = np.array(masks[idx, 1:])
    visualize_signal_and_event(signals, sample_mask)


if __name__ == "__main__":
    import h5py
    # NOTE(review): these are absolute, machine-specific paths; the cells above
    # read 'X_train.h5' / 'y_train.csv' relative to the working directory.
    PATH_TO_TRAINING_DATA = "C:\\Users\\pierr\\REPERTOIRE_PROJETS_DATA\\DREEM_PROJECT\\X_train.h5"
    PATH_TO_TRAINING_TARGET = "C:\\Users\\pierr\\REPERTOIRE_PROJETS_DATA\\DREEM_PROJECT\\y_train.csv"
    mask = np.array(pd.read_csv(PATH_TO_TRAINING_TARGET))
    # Open explicitly read-only and close the file once the plot is done
    # (previously the handle was opened without a mode and never closed).
    with h5py.File(PATH_TO_TRAINING_DATA, 'r') as h5_file:
        visualise_index(18, h5_file, mask)