In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import math

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split


2023-11-16 13:59:14.274621: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-16 13:59:14.293481: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-16 13:59:14.293501: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-16 13:59:14.294006: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-16 13:59:14.297063: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-16 13:59:14.297646: I tensorflow/core/platform/cpu_feature_guard.cc:1

# Load Data

The first step is to ingest all available data and merge it into a single flat data structure containing all measurements. The per-file indexes are discarded and rewritten so that all readings can be added to one DataFrame:

In [2]:
# Read the label table and order it by the 'id' column.
labels = pd.read_csv('labels.csv', index_col=0).sort_values('id')

# Directory listing of 'data', in sorted order.
filenames = sorted(os.listdir('data'))

# One DataFrame per measurement file, each tagged with the batch number
# encoded in its file name.
dataframes = []
for filename in filenames:
  if not filename.endswith('.csv'):
    continue
  batch = pd.read_csv(os.path.join('data', filename), index_col=0)
  batch['batch'] = int(filename.replace('.csv', ''))
  dataframes.append(batch)

# Stack all files into a single frame with a freshly numbered index.
df = pd.concat(dataframes, ignore_index=True)

# Attach the label belonging to each batch, unless the readings already have one.
if 'label' not in df.columns:
  df = df.merge(labels, left_on=["batch"], right_on=["id"])


In [3]:
def time_to_float(inputstr):
  """Convert a "H:M:S" timestamp string into a number of seconds.

  Args:
    inputstr: Text with exactly three ':'-separated numeric fields
      (hours, minutes, seconds); each field may be fractional.

  Returns:
    float: hours * 3600 + minutes * 60 + seconds.

  Raises:
    ValueError: If the string does not split into three numeric fields.
  """
  hours, minutes, seconds = map(float, inputstr.split(':'))

  # The hours term is included so that a timestamp of one hour or more does
  # not wrap around to a small value; for hours == 0 the result is unchanged.
  return hours * 3600 + minutes * 60 + seconds

# Bring three columns to their working dtypes. A conversion is skipped when
# the column already reports the target dtype, so the cell can be re-run.
conversions = (
  ('sensorid', 'int', lambda col: col.astype('int')),
  ('label', 'category', lambda col: col.astype('category')),
  # 'zeit' is converted element-wise with time_to_float
  ('zeit', 'float64', lambda col: col.apply(time_to_float)),
)
for name, target_dtype, convert in conversions:
  if df[name].dtype == target_dtype:
    continue
  df[name] = convert(df[name])

# print(df[:10])
# print(labels[:10])


# Test Data Preprocessing

In [4]:
# Number of consecutive readings that make up one window.
sequence_length = 100

sequences_df = []
sequence_labels_df = []

grouped = df.groupby('batch')

# Cut every batch into non-overlapping windows ordered by 'zeit'; a trailing
# remainder shorter than sequence_length is dropped.
for batch, readings in grouped:
  readings = readings.sort_values('zeit')
  for window_no in range(len(readings) // sequence_length):
    start = window_no * sequence_length
    sequence = readings.iloc[start:start + sequence_length]
    sequences_df.append(sequence[['zeit', 'sensorid', 'messwert']].values)
    # the label of a window is read from its first row
    sequence_labels_df.append(sequence['label'].values[0])

sequences = np.array(sequences_df)
sequence_labels = np.array(sequence_labels_df)

# 80/20 split with a fixed random_state.
# NOTE(review): windows from the same batch can end up on both sides of the
# split — confirm whether a per-batch split is wanted before evaluating.
X_train, X_test, y_train, y_test = train_test_split(
  sequences, sequence_labels, test_size=0.2, random_state=42
)


# Modelling

In [6]:
# Mini-batch size handed to fit().
BATCH_SIZE = 32
# Batches per epoch, rounded up so a partial final batch is counted.
N_BATCHES = math.ceil(len(X_train) / BATCH_SIZE)

# Checkpoint file-name template and the directory part of it.
CHECKPOINT_PATH = '.checkpoints/cp-{epoch:04d}.ckpt'
CHECKPOINT_DIR = os.path.dirname(CHECKPOINT_PATH)

# Weights-only checkpoint callback; save_freq is set to five epochs' worth
# of batches.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
  filepath=CHECKPOINT_PATH,
  save_weights_only=True,
  save_freq=5 * N_BATCHES,
  verbose=1,
)

def create_model(n_timesteps=None, n_features=3, n_classes=3, lstm_units=100):
  """Build and compile the LSTM sequence classifier.

  Called without arguments it builds the same network as before: one LSTM
  layer with 100 units on input of shape (sequence_length, 3), followed by a
  3-way softmax layer.

  Args:
    n_timesteps: Length of each input sequence. When None, the notebook-level
      `sequence_length` is used, looked up at call time.
    n_features: Number of values per time step.
    n_classes: Number of output classes (width of the softmax layer).
    lstm_units: Number of units in the LSTM layer.

  Returns:
    The compiled Sequential model (adam optimizer,
    sparse_categorical_crossentropy loss, accuracy metric).
  """
  if n_timesteps is None:
    # Resolved here rather than as a default value so that a changed
    # sequence_length is picked up without re-defining this function.
    n_timesteps = sequence_length

  clf = Sequential()
  clf.add(LSTM(lstm_units, input_shape=(n_timesteps, n_features)))
  clf.add(Dense(n_classes, activation='softmax'))
  clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return clf

# Reuse a previously saved model when 'classifier.keras' exists; otherwise
# build a fresh one.
if os.path.exists('classifier.keras'):
  clf = tf.keras.models.load_model('classifier.keras')
else:
  # Build Model
  clf = create_model()

# Training
# cp_callback has to be handed to fit(); without it the periodic weight
# checkpoints configured above are never written.
# NOTE(review): a model loaded from disk is trained for another 100 epochs
# here — confirm that continuing training is intended.
clf.fit(X_train, y_train, epochs=100, batch_size=BATCH_SIZE, callbacks=[cp_callback])
# Save Model
clf.save('classifier.keras')


Epoch 1/10


2023-11-16 14:00:02.117017: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 37860000 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# additional save (different names)
# clf.save('classifier_softmax.keras')
# clf.save('classifier_relu.keras')


# Evaluation

In [10]:
# TBD


[0.6006531119346619,
 0.5860811471939087,
 0.5851655602455139,
 0.5875413417816162,
 0.585140585899353,
 0.5804042816162109,
 0.5864750146865845,
 0.5895113945007324,
 0.5865926742553711,
 0.5770363211631775]