In [34]:
import pandas as pd
import os


import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [46]:
import itertools

In [42]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

from keras.callbacks import EarlyStopping


In [3]:
# Previously sorted train.csv 
#     using df.sort_values(by=['crew', 'experiment', 'time'])
df = pd.read_csv('data/sorted_train.csv')

In [26]:
df.iloc[0]

crew                  1
experiment           CA
time          0.0117188
seat                  1
eeg_fp1        -5.28545
eeg_f7          26.7758
eeg_f8         -9.52731
eeg_t4         -12.7932
eeg_t6          16.7178
eeg_t5          33.7375
eeg_t3          23.7123
eeg_fp2        -6.69587
eeg_o1          29.2321
eeg_p3          24.8429
eeg_pz          3.92134
eeg_f3           18.447
eeg_fz          1.07547
eeg_f4          3.09029
eeg_c4           37.369
eeg_p4          17.4376
eeg_poz         19.2019
eeg_c3          20.5968
eeg_cz         -3.95115
eeg_o2          14.5076
ecg               -4520
r               817.706
gsr              388.83
event                 A
Name: 0, dtype: object

In [23]:
df.shape

(4867421, 28)

In [57]:
def what_records(dfx):
    return [(int(x), y) for (x,y) in json.loads(dfx['time'].to_json(orient='columns')).items()]

In [62]:
        df[['crew', 'seat', 'time', 'r', 'experiment', 'event']].iloc[6600:6615]


Unnamed: 0,crew,seat,time,r,experiment,event
6600,1,1,109.988281,817.437988,CA,C
6601,1,0,109.988281,664.265991,CA,C
6602,1,0,109.992188,664.265991,CA,C
6603,1,1,109.992188,817.442017,CA,C
6604,1,1,109.996094,817.442017,CA,C
6605,1,0,109.996094,664.265991,CA,C
6606,1,0,11.0,664.33197,CA,C
6607,1,1,11.0,817.898987,CA,C
6608,1,0,11.003906,664.33197,CA,C
6609,1,1,11.003906,817.898987,CA,C


In [63]:
choices = ([1,2], [0,1], ['CA', 'DA', 'SS'])
for crew, seat, experiment in itertools.product(*choices):
    query = (df.crew == crew)&(df.seat == seat)& (df.experiment == experiment)
    print(df[query].shape)

(92131, 28)
(92077, 28)
(39563, 28)
(92168, 28)
(92130, 28)
(39583, 28)
(92133, 28)
(92194, 28)
(92131, 28)
(92099, 28)
(92099, 28)
(92212, 28)


In [None]:
trainseqs = get_windows(df.iloc[:1000], 256)

In [55]:
len(trainseqs[0]), len(trainseqs[1])
trainseqs[0].shape
trainseqs[0][0][0]
trainseqs[1][0]

array([666.185974, 'A'], dtype=object)

In [57]:
df[['crew', 'event']].iloc[:5].values.shape

(5, 2)

In [48]:
# Okay so since there are 9 crews in this data , and indeed I assume the test data 
#     crews are the same people. Anyway, not too important for now.
#     But I can split the data into 6 for train and 3 for testing.
#
# - So as a preliminary simple model, I want to just use the `r` , the respiration data.
# - And I suppose it doesn't really matter all too much if both crew member data
#      are intermingled, but I think I will split that away for now.

simple_scaler = lambda x, a: x*a 

def make_data(df, crews={'training': [1],
                        'test': [2]},
              sequence_window=256, row_cap_per_person=None,
             feature_cols={'r': simple_scaler}):
    # current sorted as ['crew', 'experiment', 'time']
    [0, 1] # each seat
    ['CA', 'DA', 'SS'] # experiment
    
    sort_cols = ['crew', 'seat', 'experiment', 'time']
    target_col = 'event'
    what_cols = sort_cols + feature_cols.keys() + target_col
    # Training
    traindf = df[df.crew.isin(crews['training'])][what_cols].sort_values(
        by=sort_cols)
    
    train_seqs = get_windows(traindf, sequence_window)
    
    # Testing
    testdf = df[df.crew.isin(crews['test'])][what_cols].sort_values(
        by=sort_cols)
    test_seqs = get_windows(testdf, sequence_window)
    
    
def get_windows(df, window_size):
    #
    windows = []
    Y = []
    choices = (df.crew.unique().tolist(), [0, 1], ['CA', 'DA', 'SS'])
    for crew, seat, experiment in itertools.product(*choices):
        query = (df.crew == crew)&(df.seat == seat)& (df.experiment == experiment)
        thisdf = df[query][['r', 'event']]
        return to_sequences(thisdf.values, window_size) 
        # y = thisdf.iloc[-1].event


def to_sequences(obs, seq_size, incols=[0], outcols=[1]):
    x = []
    y = []

    for i in range(len(obs)-seq_size-1):
        #print(i)
        window = obs[i:(i+seq_size)]
        after_window = obs[i+seq_size]
        # window = [[x] for x in window]

        x.append(window)
        y.append(after_window)
        
    return np.array(x), np.array(y)

def bake_model(x_train, y_train, x_test, y_test):
    model = Sequential()
    model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2,
                   input_shape=(None, 1)))
    # model.add(Dense(32))

    # 4 because 'A', 'B', 'C', 'D'.
    model.add(Dense(4))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # initial_state ... for the LSTM , hmm


    monitor = EarlyStopping(monitor='val_loss',
                            min_delta=1e-3, patience=5, verbose=1, mode='auto')
    print('Train...')

    # https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#arguments_8
    # - hmm so fit() can take a generator sometimes.
    # - use_multiprocessing=True 
    model.fit(x_train, y_train, validation_data=(x_test, y_test),
              callbacks=[monitor], verbose=2, epochs=1000)

    return model