# Data Preprocessing, Averaging over all csv-files and Saving the averaged data into new csv-file

In [2]:
import os
import pandas as pd
import json 
from collections import Counter

# extract the code into a function called preprocess_data so that we can use it later for new data
def preprocess_data(averaged_df, directory, file, skip_rows=3):
    """Collapse one recording CSV into a single averaged row and append it.

    The CSV (semicolon separated) is read, outdated columns are removed, the
    first ``skip_rows`` rows are discarded, per-row summary statistics are
    derived from the JSON columns ``satellites`` (key ``cn0``),
    ``bluetoothDevices`` (key ``rssi``) and ``wifiDevices`` (key ``level``),
    and finally all numeric columns are averaged over the remaining rows.

    Args:
        averaged_df: DataFrame holding the rows produced so far (may be empty).
        directory: Folder containing the CSV; a trailing separator is optional.
        file: File name of the CSV inside ``directory``.
        skip_rows: Number of leading rows to discard before averaging.

    Returns:
        A new DataFrame: ``averaged_df`` plus one row whose first column is
        ``label`` (taken from the first remaining row of the file), followed by
        the column-wise means of every numeric column.

    Raises:
        ValueError: If no rows are left after discarding the leading rows.
    """
    # os.path.join works with and without a trailing slash on `directory`
    df = pd.read_csv(os.path.join(directory, file), sep=';')

    # Unnecessary and outdated columns; files may contain only a subset of them
    obsolete_columns = [
        'timeStampNetwork', 'timeStampGPS', 'locationDescription', 'people',
        'latitudeGPS', 'longitudeGPS', 'latitudeNetwork', 'longitudeNetwork',
        'minCn0GPS', 'maxCn0GPS', 'meanCn0GPS',
        'minCn0Bluetooth', 'maxCn0Bluetooth',
        'minCn0Wifi', 'maxCn0Wifi', 'meanCn0Wifi',
        'bAccuracyNetwork', 'speedAccuracyNetwork', 'cellType',
        'networkLocationType', 'hAccuracyNetwork', 'vAccuracyNetwork',
        'nrBlDevices', 'hAccuracyGPS', 'minCn0Bl', 'meanCn0Bl', 'maxCn0Bl',
        'bAccuracyGPS', 'speedAccuracyGPS', 'vAccuracyGPS', 'nrWifiDevices',
    ]
    df = df.drop(columns=obsolete_columns, errors='ignore')

    # Remove the first rows and restart the index at 0
    df = df.iloc[skip_rows:].reset_index(drop=True)
    if df.empty:
        raise ValueError(
            f"{file}: no rows left after skipping the first {skip_rows} rows"
        )

    df = _add_signal_stats(df, 'satellites', 'cn0', 'satellite_cn0')
    df = _add_signal_stats(df, 'bluetoothDevices', 'rssi', 'bluetooth_rssi')
    # No standard deviation for Wi-Fi: the feature layout (23 columns) must stay
    # the same as the one the stored models were fitted on.
    df = _add_signal_stats(df, 'wifiDevices', 'level', 'wifi_rssi', include_std=False)

    # The raw JSON columns are no longer needed once the statistics exist
    df = df.drop(columns=['satellites', 'bluetoothDevices', 'wifiDevices'])

    # One row per file: keep the first label, average every numeric column.
    # numeric_only=True skips leftover text columns instead of raising on them.
    label = df['label'].iloc[0]
    averaged_row = df.drop(columns=['label']).mean(numeric_only=True).to_frame().T
    averaged_row.insert(0, 'label', label)  # label goes first

    return pd.concat([averaged_df, averaged_row], ignore_index=True)


def _summarize_signal(values, include_std=True):
    """Return min/max/mean/median/mode/(std)/range of one list of readings.

    An empty list yields 0 for every statistic. If several values are equally
    frequent, the smallest one is reported as the mode.
    """
    if len(values) == 0:
        stats = dict.fromkeys(
            ('min', 'max', 'mean', 'median', 'mode', 'std', 'range'), 0
        )
    else:
        series = pd.Series(values)
        lowest, highest = series.min(), series.max()
        modes = series.mode()
        stats = {
            'min': lowest,
            'max': highest,
            'mean': series.mean(),
            'median': series.median(),
            'mode': modes[0] if not modes.empty else 0,
            'std': series.std(),
            'range': highest - lowest,
        }
    if not include_std:
        del stats['std']
    return stats


def _add_signal_stats(df, source_column, value_key, prefix, include_std=True):
    """Parse a JSON list column and add ``<prefix>_<statistic>`` columns to df."""
    parsed = df[source_column].apply(json.loads)
    summaries = [
        _summarize_signal([entry[value_key] for entry in entries], include_std)
        for entries in parsed
    ]
    stats = pd.DataFrame(summaries, index=df.index)
    for statistic in stats.columns:
        df[f'{prefix}_{statistic}'] = stats[statistic]
    return df

def safe_delete(df, columns):
    """Return ``df`` without the given columns, skipping names it does not have.

    ``df`` itself is returned when none of the names is present; otherwise a
    new DataFrame without the matching columns is returned.
    """
    # dict.fromkeys removes repeated names while keeping their order
    present = [name for name in dict.fromkeys(columns) if name in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

files_directory = '../Daten/firsttry/'
averaged_path = '../Daten/averaged_data.csv'

# Discard an averaged CSV left over from an earlier run, if there is one
if os.path.exists(averaged_path):
    os.remove(averaged_path)
    print(f"Removed {averaged_path}")

# Every CSV in the recordings folder is one measurement
csv_files = [name for name in os.listdir(files_directory) if name.endswith('.csv')]
print(f'Found {len(csv_files)} CSV files')

# The file name prefix tells whether a recording was taken indoors or outdoors
indoor_files = [name for name in csv_files if name.startswith('Indoor')]
outdoor_files = [name for name in csv_files if name.startswith('Outdoor')]
print(f'Found {len(indoor_files)} indoor files and {len(outdoor_files)} outdoor files')

# Reduce each recording to one averaged row and collect the rows in one frame
averaged_data = pd.DataFrame()
for file in csv_files:
    averaged_data = preprocess_data(averaged_data, files_directory, file)

averaged_data = averaged_data.sort_values(by=['label'])
print(averaged_data.columns)

Found 189 CSV files
Found 105 indoor files and 84 outdoor files
Index(['label', 'cellStrength', 'nrSatellitesInView', 'nrSatellitesInFix',
       'satellite_cn0_min', 'satellite_cn0_max', 'satellite_cn0_mean',
       'satellite_cn0_median', 'satellite_cn0_mode', 'satellite_cn0_std',
       'satellite_cn0_range', 'bluetooth_rssi_min', 'bluetooth_rssi_max',
       'bluetooth_rssi_mean', 'bluetooth_rssi_median', 'bluetooth_rssi_mode',
       'bluetooth_rssi_std', 'bluetooth_rssi_range', 'wifi_rssi_min',
       'wifi_rssi_max', 'wifi_rssi_mean', 'wifi_rssi_median', 'wifi_rssi_mode',
       'wifi_rssi_range'],
      dtype='object')


# Train the Random Forest Classifier

## Randomize and Split the Data for Random Forest Classifier

In [3]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and renumber them from 0
averaged_data = averaged_data.sample(frac=1).reset_index(drop=True)

# Target is the label column, features are everything else
Y = averaged_data['label']
X = averaged_data.drop(columns=['label'])

# No hold-out split here: all rows are used for training
# (a split via train_test_split(X, Y, test_size=...) would go here)
X_train, Y_train = X, Y

print(f'Training data shape: {X_train.shape}')

Training data shape: (189, 23)


## Train a Random Forest Classifier

In [4]:
import numpy as np
from sklearn.model_selection import cross_val_score
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with scikit-learn's default hyper-parameters (no arguments are
# passed, so no extra regularization is configured). This instance is only
# used when the fit line below is enabled; otherwise it is replaced by the
# loaded model.
clf = RandomForestClassifier()

# Train the classifier
#clf.fit(X_train, Y_train)

# Load the stored model (to retrain, comment out the next line and uncomment the fit line above)
clf = joblib.load('../Daten/random_forest_classifier.joblib')

# 10-fold cross-validation on the full training set.
# NOTE(review): per the scikit-learn docs cross_val_score fits fresh clones of
# the estimator per fold, so these scores evaluate the model configuration,
# not the already fitted state of the loaded model.
scores = cross_val_score(clf, X_train, Y_train, cv=10)
print(f"Cross-validation scores: {scores}")

# Hold-out evaluation is disabled because no test split is created above
#y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
#accuracy = accuracy_score(Y_test, y_pred)
#print(f'Accuracy: {accuracy}')

Cross-validation scores: [0.89473684 0.94736842 0.94736842 1.         0.94736842 0.94736842
 0.94736842 0.94736842 0.94736842 0.94444444]


## Get the Feature Importances

In [5]:
# Pair every feature name with its importance in the forest, most important first
feature_importances = (
    pd.DataFrame({'feature': X.columns, 'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,feature,importance
6,satellite_cn0_median,0.16919
4,satellite_cn0_max,0.159368
2,nrSatellitesInFix,0.130355
7,satellite_cn0_mode,0.098567
5,satellite_cn0_mean,0.098037
9,satellite_cn0_range,0.094234
8,satellite_cn0_std,0.071768
3,satellite_cn0_min,0.04152
13,bluetooth_rssi_median,0.016981
1,nrSatellitesInView,0.016389


## Save the model and try out the model with new data

In [6]:
import joblib

# Persist the classifier held in `clf` with joblib.
# This is the same path the training cell loads its model from, so running
# this cell overwrites that file.
model_path = '../Daten/random_forest_classifier.joblib'
joblib.dump(clf, model_path)
print(f'Saved model to {model_path}')

Saved model to ../Daten/random_forest_classifier.joblib


# Test the rf model with new data in validation folder in this notebook

In [7]:
import joblib
import pandas as pd
import os

validation_files_directory = '../Daten/validation/'
validation_averaged_path = '../Daten/validation_averaged_data.csv'
model_path = '../Daten/random_forest_classifier.joblib'

# Load the stored random forest
model = joblib.load(model_path)

validation_files = [name for name in os.listdir(validation_files_directory) if name.endswith('.csv')]

# Per file: remember its description/people/file name (taken from the first
# row) and reduce the recording to one averaged feature row
validation_averaged_data = pd.DataFrame()
description_rows = []
for file in validation_files:
    df = pd.read_csv(validation_files_directory + file, sep=';')
    description_rows.append({
        'locationDescription': df['locationDescription'][0],
        'people': df['people'][0],
        'file': file,
    })
    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)
validation_description = pd.DataFrame(description_rows)

# Predicted class and the probability of the most likely class per recording
validation_features = validation_averaged_data.drop(columns=['label'])
validation_predictions = model.predict(validation_features)
validation_probabilities = [max(class_probs) for class_probs in model.predict_proba(validation_features)]

# One table with truth, prediction, confidence and the recording's metadata
validation_predictions_rfc = pd.DataFrame({
    'label': list(validation_averaged_data['label']),
    'prediction': list(validation_predictions),
    'probability': validation_probabilities,
    'description': list(validation_description['locationDescription']),
    'people': list(validation_description['people']),
    'file': list(validation_description['file']),
})

data = validation_predictions_rfc

# Average confidence and accuracy, first per location description, then per
# people category, with a separator line between the two reports
for position, column in enumerate(('description', 'people')):
    if position:
        print("__--------_______-----___")

    conditions = data[column].unique()

    for condition in conditions:
        mean_confidence = data.loc[data[column] == condition, 'probability'].mean()
        print(f"Average confidence for {condition}: {mean_confidence:.2f}")

    for condition in conditions:
        condition_mask = data[column] == condition
        accuracy = accuracy_score(data.loc[condition_mask, 'label'], data.loc[condition_mask, 'prediction'])
        print(f"Accuracy for {condition}: {accuracy}")

Average confidence for Überdacht: 0.89
Average confidence for Kein Fenster: 1.00
Average confidence for Nähe Fenster: 0.86
Average confidence for Häuserschlucht: 0.94
Average confidence for Raummitte: 1.00
Average confidence for Frei: 0.96
Accuracy for Überdacht: 1.0
Accuracy for Kein Fenster: 1.0
Accuracy for Nähe Fenster: 0.7272727272727273
Accuracy for Häuserschlucht: 1.0
Accuracy for Raummitte: 1.0
Accuracy for Frei: 1.0
__--------_______-----___
Average confidence for viel: 0.89
Average confidence for keine: 0.93
Average confidence for weniger als 5: 0.95
Accuracy for viel: 0.8
Accuracy for keine: 0.9333333333333333
Accuracy for weniger als 5: 1.0


## Setup LSTM Model

In [9]:
from keras.saving.save import load_model
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

dropout_rate = 0.1


def create_model():
    """Build and compile the stacked LSTM classifier.

    Three LSTM layers with 128 units each, a 32-unit ReLU layer and a 2-way
    softmax output, with dropout (``dropout_rate``) after every hidden layer.
    The sequence length is read from the global ``X_train`` at call time, so
    ``X_train`` must already be the reshaped 3-D array when the model is built.
    """
    model = Sequential()
    model.add(LSTM(128, input_shape=(X_train.shape[1], 1), return_sequences=True))
    model.add(Dropout(dropout_rate))

    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(dropout_rate))

    model.add(LSTM(128))
    model.add(Dropout(dropout_rate))

    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout_rate))

    model.add(Dense(2, activation='softmax'))

    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# Wrap the Keras model so it offers the scikit-learn estimator interface
model = KerasClassifier(model=create_model, epochs=15, batch_size=32, verbose=1)

# Don't split the data into training and testing data but use all for training,
# since we have a separate validation dataset
Y_train = Y.map({'Indoor': 0, 'Outdoor': 1}).astype(float)

# Each sample becomes a sequence of single-value steps: (samples, features, 1)
X_train = np.array(X)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

print(f'Training data shape: {X_train.shape}')

# Train the model. To reuse the stored model instead, comment out the fit call
# and uncomment the two load_model lines below.
model.fit(X_train, Y_train, batch_size=32)

# Load the final model instead of retraining it
#model_path = '../Daten/lstm_classifier.h5'
#model = load_model(model_path)

# cross_val_score needs a scikit-learn style estimator, i.e. the KerasClassifier
# wrapper from above (presumably not the plain Keras model returned by load_model)
scores = cross_val_score(model, X_train, Y_train, cv=10)
print(f"Cross-validation scores: {scores}")

# Save the model.
# Saving is opt-in so that an existing model file is not overwritten by
# accident; the message now reports what actually happened.
model_path = '../Daten/lstm_classifier.h5'
save_lstm_model = False  # set to True to overwrite the file at model_path
if not hasattr(model, 'model_'):  # the fitted Keras model lives in model_ (note the underscore)
    print("Model has not been fitted or model is not accessible")
elif save_lstm_model:
    model.model_.save(model_path)
    print(f'Saved model to {model_path}')
else:
    print(f'Model was trained but not saved; {model_path} is left untouched')

Training data shape: (189, 23, 1)
Epoch 1/15


2024-08-16 10:01:41.972538: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/

## Convert the LSTM Model to TFLite

In [None]:
import tensorflow as tf
import pandas as pd

# Load the Keras model from model_path (set in an earlier cell)
model = tf.keras.models.load_model(model_path)

# Initialize the TFLite converter
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# Allow both the built-in TFLite ops and the Select TF ops, and disable the
# _experimental_lower_tensor_list_ops flag.
# NOTE(review): presumably required because the LSTM layers cannot be
# converted with the built-in op set alone; confirm against the TFLite docs.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
converter._experimental_lower_tensor_list_ops = False

# Convert the model
tflite_model = converter.convert()

# Write the converted model to disk
with open("../Daten/lstm_classifier.tflite", "wb") as f:
    f.write(tflite_model)

print(f'Saved model to ../Daten/lstm_classifier.tflite')

## Validate the LSTM Model with h5 file

In [None]:
from tensorflow.python.keras.models import load_model
import os
import pandas as pd
import json 
from collections import Counter

# Use the public Keras loader (the same API the TFLite conversion cell uses)
# rather than the private tensorflow.python.keras package.
from tensorflow.keras.models import load_model

# Per validation file: remember its description/people/file name (taken from
# the first row) and reduce the recording to one averaged feature row
validation_files = [name for name in os.listdir(validation_files_directory) if name.endswith('.csv')]
validation_averaged_data = pd.DataFrame()
description_rows = []
for file in validation_files:
    df = pd.read_csv(validation_files_directory + file, sep=';')
    description_rows.append({
        'locationDescription': df['locationDescription'][0],
        'people': df['people'][0],
        'file': file,
    })
    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)
validation_description = pd.DataFrame(description_rows)

# Map the label column to 0 and 1, then make every column float
validation_averaged_data['label'] = validation_averaged_data['label'].map({'Indoor': 0, 'Outdoor': 1})
validation_averaged_data = validation_averaged_data.astype(float)

Y_validation = validation_averaged_data['label']

# Same layout as during training: (samples, features, 1)
X_validation = np.array(validation_averaged_data.drop(columns=['label']))
X_validation = np.reshape(X_validation, (X_validation.shape[0], X_validation.shape[1], 1))

# Load the model
model_path = '../Daten/lstm_classifier.h5'
model = load_model(model_path)

# The network ends in a softmax layer, so predict() already returns the class
# probabilities; current Keras models have no predict_proba(). One forward
# pass yields both the predicted class and its probability.
class_probabilities = model.predict(X_validation)
validation_predictions = np.argmax(class_probabilities, axis=1)
validation_probabilities = np.max(class_probabilities, axis=1)

# Convert label and prediction back to Indoor and Outdoor
validation_averaged_data['label'] = validation_averaged_data['label'].map({0: 'Indoor', 1: 'Outdoor'})
validation_predictions = ['Indoor' if prediction == 0 else 'Outdoor' for prediction in validation_predictions]

# One table with truth, prediction, confidence and the recording's metadata
validation_predictions_lstm = pd.DataFrame({
    'label': list(validation_averaged_data['label']),
    'prediction': validation_predictions,
    'probability': list(validation_probabilities),
    'description': list(validation_description['locationDescription']),
    'people': list(validation_description['people']),
    'file': list(validation_description['file']),
})

data = validation_predictions_lstm

# Average confidence and accuracy, first per location description, then per
# people category, with a separator line between the two reports
for position, column in enumerate(('description', 'people')):
    if position:
        print("__--------_______-----___")

    conditions = data[column].unique()

    for condition in conditions:
        mean_confidence = data.loc[data[column] == condition, 'probability'].mean()
        print(f"Average confidence for {condition}: {mean_confidence:.2f}")

    for condition in conditions:
        condition_mask = data[column] == condition
        accuracy = accuracy_score(data.loc[condition_mask, 'label'], data.loc[condition_mask, 'prediction'])
        print(f"Accuracy for {condition}: {accuracy}")


## Validate the LSTM Model with tflite file

In [None]:
# NOTE: this whole cell is disabled. The code below is wrapped in a
# triple-quoted string, so it is evaluated as a plain string literal and
# nothing in it is executed. Remove the surrounding quotes to run it.
'''# run the same validation process as above but with the tflite model
# Load the TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="../Daten/lstm_classifier.tflite")
interpreter.allocate_tensors()

# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Load the validation data into dataframes
validation_averaged_data = pd.DataFrame()
validation_description = pd.DataFrame()
validation_files = [file for file in os.listdir(validation_files_directory) if file.endswith('.csv')]
for file in validation_files:
    df = pd.read_csv(validation_files_directory + file, sep=';')
    locationDescription = df['locationDescription'][0]
    locationPeople = df['people'][0]
    
    validation_description = pd.concat([validation_description, pd.DataFrame({'locationDescription': [locationDescription], 'people': [locationPeople], 'file': [file]})], ignore_index=True)

    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)
    
#map the label column to 0 and 1
validation_averaged_data['label'] = validation_averaged_data['label'].map({'Indoor': 0, 'Outdoor': 1})

#convert all columns to float except the label column
validation_averaged_data = validation_averaged_data.astype(float)

Y_validation = validation_averaged_data['label']
X_validation = validation_averaged_data.drop(columns=['label'])

X_validation = np.array(X_validation)
X_validation = np.reshape(X_validation, (X_validation.shape[0], X_validation.shape[1], 1))

Y_validation = Y_validation.astype(float)

# Run the model with TensorFlow Lite
validation_predictions = []
for i in range(len(X_validation)):
    # Add an extra dimension to the input tensor
    input_data = np.expand_dims(X_validation[i], axis=0).astype(np.float32)

    # Set the tensor to point to the input data to be inferred
    interpreter.set_tensor(input_details[0]['index'], input_data)

    # Run the inference
    interpreter.invoke()

    # Retrieve the output of the inference
    output_data = interpreter.get_tensor(output_details[0]['index'])
    validation_predictions.append(output_data)
    
#print label and the corresponding prediction
for label, description, people, prediction, file in zip(validation_averaged_data['label'], validation_description['locationDescription'], validation_description['people'], validation_predictions, validation_description['file']):
    #if abs(prediction[0] - prediction[1]) < 0.7:
    print(f'Label: {label}, Prediction: {prediction}, Description: {description}, People: {people},  File: {file}')'''

Confusion Matrix and Probability Distribution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Random forest: ground truth, hard predictions and [Indoor, Outdoor]
# probabilities, entered by hand
true_labels_rfc = [
    'Outdoor', 'Indoor', 'Indoor', 'Outdoor', 'Indoor', 'Indoor', 'Indoor', 'Outdoor', 'Outdoor', 'Outdoor', 'Outdoor', 'Indoor', 'Indoor', 'Outdoor', 'Outdoor', 'Indoor', 'Indoor', 'Indoor', 'Outdoor', 'Indoor', 'Outdoor', 'Outdoor', 'Indoor', 'Indoor', 'Outdoor', 'Indoor', 'Indoor', 'Outdoor', 'Indoor', 'Indoor', 'Indoor'
]
predictions_rfc_alphabetical = [
    'Outdoor','Indoor','Indoor','Outdoor','Indoor','Indoor','Indoor','Outdoor','Outdoor','Outdoor','Outdoor','Outdoor','Indoor','Outdoor','Outdoor','Outdoor','Indoor','Indoor','Outdoor','Outdoor','Outdoor','Outdoor','Indoor','Indoor','Outdoor','Indoor','Indoor','Outdoor','Indoor','Indoor','Indoor'
]
predictions_rfc_probs = [
    [0.03, 0.97], [1.0, 0.0], [1.0, 0.0], [0.18, 0.82], [1.0, 0.0], [0.85, 0.15], [1.0, 0.0], [0.13, 0.87], [0.0, 1.0], [0.02, 0.98], [0.0, 1.0], [0.41, 0.59], [1.0, 0.0], [0.06, 0.94], [0.0, 1.0], [0.23, 0.77], [1.0, 0.0],[1.0, 0.0], [0.07, 0.93], [0.28, 0.72], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.96, 0.04], [0.21, 0.79], [0.99, 0.01], [0.87, 0.13], [0.29, 0.71], [0.74, 0.26], [1.0, 0.0], [1.0,0.0]
]
print("Predictions rfc: ", predictions_rfc_probs)

# LSTM: ground truth (0.0 = Indoor, 1.0 = Outdoor) and class probabilities
true_labels_lstm = [
     1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
]
predictions_lstm_probs = [
    [0.03169885, 0.9683012], [9.9961096e-01, 3.8903771e-04], [9.9962103e-01, 3.7897032e-04], [0.0628316, 0.9371685], [9.9950373e-01, 4.9623835e-04], [0.96669286, 0.03330712], [9.9960977e-01, 3.9024084e-04], [0.03111412, 0.96888584], [0.02428848, 0.9757116], [0.02169448, 0.9783055] ,[0.02275957, 0.9772404], [0.538785, 0.46121508], [9.9961084e-01, 3.8915250e-04], [0.07990491, 0.9200951], [0.02168687, 0.9783132], [0.5124758, 0.48752418], [9.9961191e-01, 3.8806704e-04], [9.9961460e-01, 3.8542488e-04], [0.07851861, 0.9214814], [0.17278534, 0.8272147 ], [0.02279828, 0.9772017], [0.022396, 0.97760403], [9.9960881e-01, 3.9122254e-04], [0.96038276, 0.03961726], [0.05924854, 0.94075143], [9.9958938e-01, 4.1067746e-04], [0.9418696, 0.05813045], [0.10422354, 0.89577645], [0.7670185, 0.23298149], [9.9961120e-01, 3.8879746e-04], [0.992029, 0.007971]
]
print("Predictions lstm: ", predictions_lstm_probs)

# Hard LSTM prediction = index of the larger probability
predictions_lstm = [np.argmax(pred) for pred in predictions_lstm_probs]

# Both LSTM lists must describe the same recordings
assert len(true_labels_lstm) == len(predictions_lstm), "Length mismatch between true labels and predictions"

# Confusion matrices, rows/columns ordered Indoor then Outdoor
tick_labels = ['Indoor', 'Outdoor']
conf_matrix_rfc = confusion_matrix(true_labels_rfc, predictions_rfc_alphabetical, labels=tick_labels)
conf_matrix_lstm = confusion_matrix(true_labels_lstm, predictions_lstm, labels=[0.0, 1.0])

# Figure 1: the two confusion matrices side by side (RFC left, LSTM right)
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
fontsize = 12

for axis, matrix, model_name in zip(ax, (conf_matrix_rfc, conf_matrix_lstm), ('RFC', 'LSTM')):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels, ax=axis, annot_kws={"size": fontsize})
    axis.set_title(f'{model_name} Confusion Matrix', fontsize=fontsize)
    axis.set_xlabel('Predicted', fontsize=fontsize)
    axis.set_ylabel('Actual', fontsize=fontsize)
    axis.tick_params(axis='both', which='major', labelsize=fontsize)

plt.tight_layout()
plt.savefig("confusion_matrices.png")
plt.show()

# Figure 2: histograms of the predicted Outdoor probability (RFC left, LSTM right)
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

for axis, probs, model_name in zip(ax, (predictions_rfc_probs, predictions_lstm_probs), ('RFC', 'LSTM')):
    sns.histplot([prob[1] for prob in probs], bins=10, kde=False, ax=axis, legend=False)
    axis.set_title(f'{model_name} Probability Distribution', fontsize=fontsize)
    axis.set_xlabel('Probability of Outdoor', fontsize=fontsize)
    axis.set_ylabel('Frequency', fontsize=fontsize)
    axis.set_ylim(0, 15)  # same y-range (0 to 15) on both plots

plt.tight_layout()
plt.savefig("probability_distributions.png")
plt.show()


Model Comparison

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compare both models using the hand-entered lists from the confusion matrix
# cell: true_labels_rfc / predictions_rfc_alphabetical / predictions_rfc_probs
# for the random forest and true_labels_lstm / predictions_lstm /
# predictions_lstm_probs for the LSTM.

# Random forest. All four metrics use the RFC labels and predictions; the
# precision was previously computed from the LSTM lists, and the undefined
# name predictions_rfc was used for the other three.
accuracy_rfc = accuracy_score(true_labels_rfc, predictions_rfc_alphabetical)
precision_rfc = precision_score(true_labels_rfc, predictions_rfc_alphabetical, average='macro')
recall_rfc = recall_score(true_labels_rfc, predictions_rfc_alphabetical, average='macro')
f1_rfc = f1_score(true_labels_rfc, predictions_rfc_alphabetical, average='macro')
# Confidence = probability of the predicted class, averaged over all samples.
# Averaging over both class columns would always give 0.5.
confidence_rfc = np.mean([max(prob) for prob in predictions_rfc_probs])

# LSTM
accuracy_lstm = accuracy_score(true_labels_lstm, predictions_lstm)
precision_lstm = precision_score(true_labels_lstm, predictions_lstm, average='macro')
recall_lstm = recall_score(true_labels_lstm, predictions_lstm, average='macro')
f1_lstm = f1_score(true_labels_lstm, predictions_lstm, average='macro')
confidence_lstm = np.mean([max(prob) for prob in predictions_lstm_probs])

print(f"RFC Metrics: Accuracy={accuracy_rfc}, Precision={precision_rfc}, Recall={recall_rfc}, F1-Score={f1_rfc}")
# The value is interpolated into the f-string; passing {confidence_rfc} as a
# second argument printed a set literal instead of the number.
print(f"Confidence: {confidence_rfc}")
print(f"LSTM Metrics: Accuracy={accuracy_lstm}, Precision={precision_lstm}, Recall={recall_lstm}, F1-Score={f1_lstm}")
print(f"Confidence: {confidence_lstm}")


Analyze transition scenarios

In [None]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

transition_scenarios_folder = '../Daten/environment_transition/'
transition_files = ['LC_Indoor_to_Outdoor.csv', 'LC_Outdoor_to_Indoor.csv', 'BIB_Outdoor_to_Indoor.csv', 'Sparkasse_Outdoor_to_Indoor.csv']

# Read the four transition recordings, in the order of transition_files,
# and also bind each one to its own name
data_frames = [pd.read_csv(transition_scenarios_folder + name) for name in transition_files]
(
    Indoor_to_Outdoor_LC,
    Outdoor_to_Indoor_LC,
    Outdoor_to_Indoor_Bib,
    Outdoor_to_Indoor_Sparkasse,
) = data_frames

# One subplot per scenario in a 2x2 grid
fontsize = 16
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
axs = axs.flatten()

for ax, data, scenario_name in zip(axs, data_frames, transition_files):
    scenario_title = scenario_name[:-4]  # file name without '.csv'
    # Plots the RFC indoor probability; use 'predictionLSTM_Indoor' to see the other model
    ax.plot(data['predictionRFC_Indoor'], label=f'Indoor Probability of {scenario_title}')
    ax.set_title(f'Probability Over Time for {scenario_title}', fontsize=fontsize)
    ax.set_xlabel('Time (seconds)', fontsize=fontsize)
    ax.set_ylabel('Probability', fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# For each of the four scenarios, list the row indices at which a model's
# indoor probability lies above the threshold; the transition point can be
# read off these indices.

threshold = 0.5
transition_indices_LSTM = []
transition_indices_RFC = []

# Random forest
print("Transition predicted by RFC")
for data, scenario_name in zip(data_frames, transition_files):
    above_threshold = data['predictionRFC_Indoor'] > threshold
    transition_indices_RFC = data.index[above_threshold]
    print(f'Transition predicted for {scenario_name[:-4]} at indices: {transition_indices_RFC}')

print()

# LSTM
print("Transition predicted by LSTM")
for data, scenario_name in zip(data_frames, transition_files):
    above_threshold = data['predictionLSTM_Indoor'] > threshold
    transition_indices_LSTM = data.index[above_threshold]
    print(f'Transition predicted for {scenario_name[:-4]} at indices: {transition_indices_LSTM}')


# For the sake of simplicity (since there are only 4 scenarios), we will calculate the delay manually
