In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay


# Load Data

The first step is to ingest all available data and merge it into a single flat data structure containing all measurements. The per-file indexes are discarded and rewritten so that the readings from every file can be combined into one DataFrame:

In [None]:
# Read the label table (first CSV column becomes the index) and order it by 'id'.
labels = pd.read_csv('labels.csv', index_col=0).sort_values('id')

# Directory listing of the measurement files, in sorted filename order.
filenames = sorted(os.listdir('data'))

# One DataFrame per CSV file; anything that is not a .csv is skipped.
dataframes = []
for filename in filenames:
  if not filename.endswith('.csv'):
    continue
  batch = pd.read_csv(os.path.join('data', filename), index_col=0)
  # The file name without '.csv' must be an integer; it is stored as the batch id.
  batch['batch'] = int(filename.replace('.csv', ''))
  dataframes.append(batch)

# Stack all batches into one frame with a fresh 0..n-1 index.
df = pd.concat(dataframes, ignore_index=True)

# Attach the label column by matching batch -> id, unless the data already has one.
# merge() uses its default inner join here, so rows whose batch has no id in
# `labels` are dropped.
if 'label' not in df.columns:
  df = df.merge(labels, left_on=["batch"], right_on=["id"])


In [None]:
def time_to_float(inputstr):
  """Convert an ``hours:minutes:seconds`` string into a number of seconds.

  Args:
    inputstr: String with exactly three ':'-separated fields. Each field is
      parsed with ``float``, so fractional values (e.g. ``'00:01:02.5'``) work.

  Returns:
    float: ``hours * 3600 + minutes * 60 + seconds``.

  Raises:
    ValueError: If the string does not split into exactly three fields or a
      field cannot be parsed as a float.
  """
  hours, minutes, seconds = map(float, inputstr.split(':'))

  # Hours are expected to be 0 in this data set, but they are included so that
  # a timestamp of one hour or more is not silently wrapped to a smaller value.
  # With hours == 0 the result is identical to minutes * 60 + seconds.
  return hours * 3600 + minutes * 60 + seconds

# Bring the columns to the dtypes used downstream. Every conversion is guarded
# by a dtype check, so re-running the cell leaves already converted columns alone.
if df['sensorid'].dtype != 'int':
  df['sensorid'] = df['sensorid'].astype('int')

if df['label'].dtype != 'category':
  df['label'] = df['label'].astype('category')

# 'zeit' is converted element-wise by time_to_float unless it is already float64.
if df['zeit'].dtype != 'float64':
  df['zeit'] = df['zeit'].apply(time_to_float)


# Test Data Preprocessing

In [None]:
# Number of consecutive readings that form one model input sequence.
sequence_length = 100

sequences_df = []        # one (sequence_length, 3) value array per window
sequence_labels_df = []  # label of the first reading of each window

grouped = df.groupby('batch')

for batch, readings in grouped:
  # Order the readings of this batch by time before slicing.
  readings = readings.sort_values('zeit')
  # Only full, non-overlapping windows are used; leftover readings at the end
  # of a batch (fewer than sequence_length) are dropped.
  for window_index in range(len(readings) // sequence_length):
    start = window_index * sequence_length
    window = readings.iloc[start:start + sequence_length]
    sequences_df.append(window[['zeit', 'sensorid', 'messwert']].values)
    # NOTE(review): the raw category value is used as the target. Confirm the
    # labels are integer class indices, which the sparse_categorical_crossentropy
    # loss used for training expects.
    sequence_labels_df.append(window['label'].iloc[0])

sequences = np.array(sequences_df)
sequence_labels = np.array(sequence_labels_df)

# 80/20 split of the windows with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  sequences,
  sequence_labels,
  test_size=0.2,
  random_state=42,
)


# Modelling

In [None]:
# Build Model
# A single LSTM layer with 100 units reads windows of sequence_length steps
# with 3 values per step, followed by a 3-way softmax output layer.
clf = Sequential([
  LSTM(100, input_shape=(sequence_length, 3)),
  Dense(3, activation='softmax'),
])
clf.compile(
  loss='sparse_categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

# Training
clf.fit(X_train, y_train, batch_size=32, epochs=100)


# Evaluation

In [None]:
# The last layer of clf is a softmax, so predict() returns one row per test
# sequence holding one probability per class, not class labels.
y_prob = clf.predict(X_test)
print(y_prob[:5])  # show only the first few rows instead of the whole array

# Reduce each probability row to the index of its most probable class so the
# predictions can be compared with y_test by the sklearn metrics.
# Assumes y_test holds integer class indices (0 .. n_classes - 1), as the
# sparse_categorical_crossentropy loss used for training expects; TODO confirm.
y_pred = np.argmax(y_prob, axis=1)

acc = accuracy_score(y_test, y_pred)
print(acc)

# A Keras model has no sklearn-style `classes_` attribute, so the class indices
# are derived from the width of the model output. Passing them explicitly keeps
# the matrix square even if a class is missing from the test split.
class_ids = np.arange(y_prob.shape[1])
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_ids)
disp.plot()
plt.show()
