In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import plotly.express as px


# Load Data

The first step is to ingest all available data and merge it into a single flat data structure containing all measurements. The per-file indexes are discarded and a new index is generated so that all readings can be added to one DataFrame:

In [2]:
# parse the labels.csv (its first column becomes the index)
labels = pd.read_csv('labels.csv', index_col=0)

# grab filenames from the data directory; os.listdir returns entries in
# arbitrary order, so sort them to make the row order of df reproducible
filenames = sorted(os.listdir('data'))

dataframes = []

# parse every csv file and tag its rows with the batch id taken from the
# file name (e.g. '12.csv' -> 12)
for filename in filenames:
  if filename.endswith('.csv'):
    batch = pd.read_csv(os.path.join('data', filename), index_col=0)
    # splitext strips only the trailing extension, unlike str.replace,
    # which would remove every '.csv' occurrence in the name
    batch['batch'] = int(os.path.splitext(filename)[0])
    dataframes.append(batch)

# concatenate all batches into df; the per-file indexes are replaced by a
# fresh RangeIndex
df = pd.concat(dataframes, ignore_index=True)

# print(df[:10])


In [5]:
# attach the label of each batch to its measurements; the merge is skipped
# when df already carries a 'label' column
if 'label' not in df.columns:
  df = df.merge(labels, left_on='batch', right_on='id')

def time_to_float(inputstr):
  """Convert a 'H:M:S' time string into a number of seconds.

  Args:
    inputstr: string with exactly three ':'-separated numeric fields
      (hours, minutes, seconds); each field may be fractional.

  Returns:
    The total duration in seconds as a float.

  Raises:
    ValueError: if the string does not have exactly three fields or a
      field cannot be parsed as a float.
  """
  hours, minutes, seconds = map(float, inputstr.split(':'))

  # hours are expected to be 0 in this data set, but include them anyway so
  # a longer recording is not silently folded back below one hour
  return hours * 3600 + minutes * 60 + seconds

# store the labels as a categorical column; the guard has to inspect the
# 'label' column itself (it previously checked 'zeit' by mistake)
if not isinstance(df['label'].dtype, pd.CategoricalDtype):
  df['label'] = df['label'].astype('category')
# convert the time strings to float seconds; skipped when the column is
# already float64 so that running this cell again does not fail on floats
if df['zeit'].dtype != 'float64':
  df['zeit'] = df['zeit'].apply(time_to_float)

# print(df[:10])
# print(df['sensorid'].value_counts())


sensorid
117    177135
145    170732
118    170604
119    170429
120    170261
121    170094
95     137808
96     137636
94     137585
97     137472
98     137423
99     137330
100    137243
101    137152
102    136974
130     99517
133     99496
127     99262
132     99237
128     99204
129     99143
131     99115
126     99058
134     98869
114     90205
115     89865
107     89830
112     89795
108     89761
111     89665
110     89578
113     89527
109     88326
Name: count, dtype: int64


# Data Visualisation

In [4]:
sensors = df['sensorid'].unique()

# create the output directory; exist_ok makes this a no-op when it already
# exists and avoids a separate check-then-create step
os.makedirs('plots', exist_ok=True)

# write one scatter plot (messwert over zeit, coloured by label) per sensor
# to plots/<sensorid>.png
for sensor in sensors:
  fig = px.scatter(df[df['sensorid'] == sensor], x='zeit', y='messwert', color='label')
  # fig.show(observed=False, interactive=False)
  fig.write_image(os.path.join('plots', f'{sensor}.png'))

# print(df['label'].value_counts())


  grouped = df.groupby(required_grouper, sort=False)  # skip one_group groupers
































































