In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import math

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split


# Load Data

The first step is to ingest all available data and merge it into a single flat DataFrame containing every measurement. The per-file indexes are discarded and rewritten so that the readings from all files can be combined into one DataFrame:

In [None]:
# Read the label table and order it by its 'id' column
labels = pd.read_csv('labels.csv', index_col=0).sort_values('id')

# Names of everything in the data directory, in sorted order
filenames = sorted(os.listdir('data'))

# Read each CSV file into its own frame and tag every row with the
# integer encoded in the file name (e.g. '12.csv' -> batch 12)
batch_frames = []
for filename in filenames:
  if not filename.endswith('.csv'):
    continue
  batch = pd.read_csv(os.path.join('data', filename), index_col=0)
  batch['batch'] = int(filename.replace('.csv', ''))
  batch_frames.append(batch)

# Stack all batches into one frame; the per-file indexes are replaced by a fresh one
df = pd.concat(batch_frames, ignore_index=True)

# The individual frames are no longer needed once they are concatenated
del batch_frames

# Attach the labels by matching 'batch' against the label 'id',
# unless a 'label' column is already present
if 'label' not in df.columns:
  df = df.merge(labels, left_on='batch', right_on='id')


In [None]:
def time_to_float(inputstr):
  """Convert an 'H:M:S' timestamp string into a number of seconds.

  Args:
    inputstr: String with exactly three ':'-separated numeric fields
      (hours, minutes, seconds). Each field is parsed with float(), so
      fractional seconds such as '00:01:02.5' are accepted.

  Returns:
    The total number of seconds as a float.

  Raises:
    ValueError: If the string does not have exactly three fields or a
      field cannot be parsed as a float.
  """
  hours, minutes, seconds = map(float, inputstr.split(':'))

  # Hours are expected to be 0, in which case this equals minutes * 60 + seconds.
  # Including them anyway keeps the result increasing across the hour boundary
  # instead of silently wrapping around if a recording ever runs past 60 minutes.
  return hours * 3600 + minutes * 60 + seconds

# Bring the columns to the dtypes used further down. Each conversion is
# guarded by a dtype check, so it is skipped when the column already has
# the target dtype.
if df['sensorid'].dtype != 'int':
  df['sensorid'] = df['sensorid'].astype('int')

if df['label'].dtype != 'category':
  df['label'] = df['label'].astype('category')

if df['zeit'].dtype != 'float64':
  # time_to_float turns each ':'-separated time string into seconds
  df['zeit'] = df['zeit'].map(time_to_float)

# print(df[:10])
# print(labels[:10])


# Test Data Preprocessing

In [None]:
# Number of consecutive readings that make up one model input window
SEQUENCE_LENGTH = 128

sequences = []
sequence_labels = []

# Group the readings by batch once and iterate that same GroupBy object
# (previously the grouping was computed twice and the first result was unused).
grouped = df.groupby('batch')

for batch, readings in grouped:
  # Order the readings of this batch by time before cutting windows
  readings = readings.sort_values('zeit')
  # Non-overlapping windows of SEQUENCE_LENGTH rows; a trailing remainder
  # shorter than SEQUENCE_LENGTH is dropped, and batches with fewer than
  # SEQUENCE_LENGTH readings contribute no window at all.
  for i in range(0, len(readings) - SEQUENCE_LENGTH + 1, SEQUENCE_LENGTH):
    sequence = readings.iloc[i:i + SEQUENCE_LENGTH]
    sequences.append(sequence[['zeit', 'sensorid', 'messwert']].values)
    # The label of the first row is used as the label of the whole window
    sequence_labels.append(sequence['label'].values[0])

# Stack the windows into arrays: one entry per window
sequences = np.array(sequences)
sequence_labels = np.array(sequence_labels)

# 80/20 split into train+val / test, then 80/20 again into train / val
# (64% / 16% / 20% overall), both with a fixed random_state.
# NOTE(review): the split is done per window, so windows cut from the same
# batch can end up in both the training and the test set - confirm that this
# is intended, or split by batch instead.
X_train, X_test, y_train, y_test = train_test_split(sequences, sequence_labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# Modelling

In [None]:
BATCH_SIZE = 128
# Weights-only checkpoints must end in '.weights.h5' under Keras 3 (ModelCheckpoint
# raises a ValueError otherwise); Keras 2 accepts this suffix as well.
CHECKPOINT_PATH = '.checkpoints/cp-{epoch:04d}.weights.h5'
# CHECKPOINT_DIR = os.path.dirname(CHECKPOINT_PATH)
# Number of training batches (steps) per epoch
N_BATCHES = math.ceil(len(X_train) / BATCH_SIZE)
# Same value as the random_state used for the train/val/test split
SEED = 42

# Seed Python, NumPy and TensorFlow so that weight initialisation, shuffling
# and dropout are repeatable from run to run.
tf.keras.utils.set_random_seed(SEED)

# Single LSTM layer over windows of SEQUENCE_LENGTH steps with 3 features each,
# followed by dropout and a 3-way softmax output.
model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, 3)))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
# sparse_categorical_crossentropy expects integer class ids as targets
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training
# Write a weights checkpoint every 8 * N_BATCHES training batches, i.e. once per 8 epochs' worth of steps
cp_cb = ModelCheckpoint(filepath=CHECKPOINT_PATH, save_weights_only=True, save_freq=8*N_BATCHES, verbose=1)
# Stop once val_loss has not improved for 8 epochs and roll back to the best weights seen
stp_cb = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

model.fit(X_train, y_train, epochs=512, batch_size=BATCH_SIZE, callbacks=[cp_cb, stp_cb], validation_data=(X_val, y_val))

# Save Model
model.save('classifier.keras')


# Evaluation

In [None]:
# TBD
# To evaluate a previously saved model instead of the one in memory:
# model = tf.keras.models.load_model('classifier.keras');

# Spot check: raw model outputs for the first five test windows
sample_predictions = model.predict(X_test[:5])
print(sample_predictions)

# Loss and accuracy over the complete held-out test set
loss, acc = model.evaluate(X_test, y_test, verbose=2)

accuracy_percent = 100 * acc
print("Model accuracy: {:5.2f}%".format(accuracy_percent))


In [None]:
# Additional save step for models worth keeping: writes the current model to a
# second file, separate from the 'classifier.keras' file written after training.
# NOTE(review): the file name says "256" while the LSTM layer built in the
# modelling cell has 128 units - confirm the name matches the model being saved.
model.save('classifier-softmax-256.keras')
