# Data preprocessing: averaging over all CSV files and saving the averaged data into a new CSV file

In [257]:
import os
import pandas as pd
import json 
from collections import Counter

# The preprocessing is wrapped in a function called preprocess_data so that it can be reused later for new data
# Unnecessary and outdated columns; each one is dropped when the file has it.
_DROPPED_COLUMNS = (
    'timeStampNetwork', 'timeStampGPS', 'locationDescription', 'people',
    'latitudeGPS', 'longitudeGPS', 'latitudeNetwork', 'longitudeNetwork',
    'minCn0GPS', 'maxCn0GPS', 'meanCn0GPS',
    'minCn0Bluetooth', 'maxCn0Bluetooth',
    'minCn0Wifi', 'maxCn0Wifi', 'meanCn0Wifi',
    'bAccuracyNetwork', 'speedAccuracyNetwork', 'cellType', 'networkLocationType',
    'hAccuracyNetwork', 'vAccuracyNetwork',
    'nrBlDevices', 'hAccuracyGPS',
    'minCn0Bl', 'meanCn0Bl', 'maxCn0Bl',
    'bAccuracyGPS', 'speedAccuracyGPS', 'vAccuracyGPS', 'nrWifiDevices',
)

# Statistics computed per row, in the order their columns are appended.
_SIGNAL_STAT_NAMES = ('min', 'max', 'mean', 'median', 'mode', 'std', 'range')


def _summarise_signal(values, stat_names):
    """Return the requested statistics for one list of readings (all 0 if the list is empty)."""
    if not values:
        return dict.fromkeys(stat_names, 0)

    series = pd.Series(values)
    modes = series.mode()
    summary = {
        'min': series.min(),
        'max': series.max(),
        'mean': series.mean(),
        'median': series.median(),
        # mode() can return several values; the first (smallest) one is used
        'mode': modes.iloc[0] if not modes.empty else 0,
        # NaN for a single reading; the column mean taken later skips NaN
        'std': series.std(),
        'range': series.max() - series.min(),
    }
    return {name: summary[name] for name in stat_names}


def _add_signal_stats(df, json_column, value_key, prefix, stat_names=_SIGNAL_STAT_NAMES):
    """Replace a JSON list column by ``<prefix>_<stat>`` columns computed from ``value_key``."""
    readings = [
        [entry[value_key] for entry in json.loads(raw)]
        for raw in df[json_column]
    ]
    summaries = pd.DataFrame(
        [_summarise_signal(values, stat_names) for values in readings],
        index=df.index,
        columns=list(stat_names),
    )
    for name in stat_names:
        df[f'{prefix}_{name}'] = summaries[name]
    return df.drop(columns=[json_column])


def preprocess_data(averaged_df: pd.DataFrame, directory: str, file: str,
                    removed_rows: int = 3) -> pd.DataFrame:
    """Reduce one recording file to a single averaged row and append it to ``averaged_df``.

    The ';'-separated CSV is read, unneeded columns are dropped, the first
    ``removed_rows`` rows are discarded, and the JSON columns ``satellites``,
    ``bluetoothDevices`` and ``wifiDevices`` are replaced by per-row statistics
    of their ``cn0`` / ``rssi`` / ``level`` values. All numeric columns are then
    averaged into one row; ``label`` is taken from the first remaining row and
    placed in the first column.

    Args:
        averaged_df: Frame holding the rows produced so far (may be empty).
        directory: Folder containing the file; a trailing separator is optional.
        file: Name of the CSV file inside ``directory``.
        removed_rows: Number of leading rows to discard before averaging.

    Returns:
        A new frame: ``averaged_df`` with the averaged row appended and the
        index renumbered.

    Raises:
        IndexError: If no rows remain after discarding the leading rows.
    """
    df = pd.read_csv(os.path.join(directory, file), sep=';')

    # errors='ignore' skips columns that this particular file does not contain
    df = df.drop(columns=list(_DROPPED_COLUMNS), errors='ignore')

    # Discard the leading rows and restart the index at 0
    df = df.iloc[removed_rows:].reset_index(drop=True)

    df = _add_signal_stats(df, 'satellites', 'cn0', 'satellite_cn0')
    df = _add_signal_stats(df, 'bluetoothDevices', 'rssi', 'bluetooth_rssi')
    # NOTE: no std column for Wi-Fi, which keeps the existing feature layout unchanged
    wifi_stat_names = tuple(name for name in _SIGNAL_STAT_NAMES if name != 'std')
    df = _add_signal_stats(df, 'wifiDevices', 'level', 'wifi_rssi', wifi_stat_names)

    label = df['label'].iloc[0]

    # numeric_only=True keeps a leftover text column from breaking the average
    row = df.drop(columns=['label']).mean(numeric_only=True).to_frame().T
    row.insert(0, 'label', label)

    return pd.concat([averaged_df, row], ignore_index=True)

def safe_delete(df, columns):
    """Return ``df`` without the given columns, skipping names it does not contain.

    The input frame itself is returned when none of the names is present.
    """
    # dict.fromkeys removes repeated names while keeping their order
    present = [name for name in dict.fromkeys(columns) if name in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

files_directory = '../Daten/firsttry/'
averaged_path = '../Daten/averaged_data.csv'

# Discard the averaged file left over from a previous run, if there is one
# NOTE(review): nothing in this cell writes averaged_path again - confirm whether a to_csv call is missing
if os.path.exists(averaged_path):
    os.remove(averaged_path)
    print(f"Removed {averaged_path}")

# All recordings are the .csv entries of the data folder
csv_files = list(filter(lambda name: name.endswith('.csv'), os.listdir(files_directory)))
print(f'Found {len(csv_files)} CSV files')

# Count the recordings per file-name prefix
indoor_files = list(filter(lambda name: name.startswith('Indoor'), csv_files))
outdoor_files = list(filter(lambda name: name.startswith('Outdoor'), csv_files))
print(f'Found {len(indoor_files)} indoor files and {len(outdoor_files)} outdoor files')

# Every recording contributes one averaged row
averaged_data = pd.DataFrame()
for file in csv_files:
    averaged_data = preprocess_data(averaged_data, files_directory, file)

averaged_data = averaged_data.sort_values(by=['label'])

Found 189 CSV files
Found 105 indoor files and 84 outdoor files


# Train the Random Forest Classifier

## Randomize and Split the Data for Random Forest Classifier

In [258]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and renumber the index
averaged_data = averaged_data.sample(frac=1).reset_index(drop=True)

# The label is the target; every other column is a feature
Y = averaged_data['label']
X = averaged_data.drop(columns=['label'])

# No hold-out split here: the complete data set is used for training
X_train, Y_train = X, Y

print(f'Training data shape: {X_train.shape}')

Training data shape: (189, 23)


## Train a Random Forest Classifier

In [259]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with the library's default hyper-parameters
clf = RandomForestClassifier()

# Fit on the complete training set
clf.fit(X=X_train, y=Y_train)

# Cross-validate on the same data with cv=10
scores = cross_val_score(estimator=clf, X=X_train, y=Y_train, cv=10)
print(f"Cross-validation scores: {scores}")

Cross-validation scores: [0.94736842 0.89473684 0.89473684 0.89473684 0.94736842 0.84210526
 0.89473684 0.94736842 0.94736842 1.        ]


## Get the Feature Importances

In [260]:
# Pair every feature name with its importance and list the most important first
feature_importances = (
    pd.DataFrame({'feature': X.columns, 'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,feature,importance
6,satellite_cn0_median,0.182521
4,satellite_cn0_max,0.146972
5,satellite_cn0_mean,0.131615
9,satellite_cn0_range,0.124775
2,nrSatellitesInFix,0.10105
7,satellite_cn0_mode,0.082592
8,satellite_cn0_std,0.055492
3,satellite_cn0_min,0.030267
12,bluetooth_rssi_mean,0.022277
0,cellStrength,0.020286


## Save the model and try it out with new data

In [261]:
import joblib

# Persist the fitted random forest to disk
model_path = '../Daten/random_forest_classifier.joblib'
joblib.dump(value=clf, filename=model_path)
print(f'Saved model to {model_path}')

Saved model to ../Daten/random_forest_classifier.joblib


# Test the model in this notebook with new data from the validation folder

In [262]:
validation_files_directory = '../Daten/validation/'
validation_averaged_path = '../Daten/validation_averaged_data.csv'
model_path = '../Daten/random_forest_classifier.joblib'

# Restore the persisted classifier
model = joblib.load(model_path)

validation_files = [name for name in os.listdir(validation_files_directory) if name.endswith('.csv')]

validation_averaged_data = pd.DataFrame()
description_rows = []
for file in validation_files:
    # preprocess_data drops the descriptive columns, so they are read from the raw file here
    df = pd.read_csv(validation_files_directory + file, sep=';')
    description_rows.append({
        'locationDescription': df['locationDescription'][0],
        'people': df['people'][0],
        'file': file,
    })

    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)

# One description row per validation file, in the same order as the averaged rows
validation_description = pd.DataFrame(description_rows)

# Predicted class and class probabilities for every validation file
validation_features = validation_averaged_data.drop(columns=['label'])
validation_predictions = model.predict(validation_features)
validation_probabilities = model.predict_proba(validation_features)

# Report only the files whose prediction differs from the label
for label, description, people, prediction, probability, file in zip(
    validation_averaged_data['label'],
    validation_description['locationDescription'],
    validation_description['people'],
    validation_predictions,
    validation_probabilities,
    validation_description['file'],
):
    if label == prediction:
        continue
    print("--------------------")
    print(f'Label: {label}, Prediction: {prediction}, Description: {description}, People: {people},  Probability: {probability}, File: {file}')
   

--------------------
Label: Indoor, Prediction: Outdoor, Description: Nähe Fenster, People: viel,  Probability: [0.31 0.69], File: Indoor_2024-03-29 13_41_40.csv


## Setup LSTM Model

In [263]:
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def create_model():
    """Build and compile the three-layer LSTM classifier with two softmax outputs.

    The first input dimension is taken from the module-level ``X_train`` at
    the time this function is called.
    """
    network = Sequential([
        LSTM(128, input_shape=(X_train.shape[1], 1), return_sequences=True),
        Dropout(0.2),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(128),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(2, activation='softmax'),
    ])

    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Wrap the Keras model builder so it can be used like a scikit-learn estimator
model = KerasClassifier(model=create_model, epochs=15, batch_size=32, verbose=1)

# Hold out 20 % of the samples; they serve as validation data during fitting
# NOTE: this rebinds X_train / Y_train, which until here held the complete data set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Features as float; labels encoded Indoor -> 0 / Outdoor -> 1 and cast to float
X_train = X_train.astype(float)
X_test = X_test.astype(float)
Y_train = Y_train.map({'Indoor': 0, 'Outdoor': 1}).astype(float)
# Cast the test labels too, so both label sets share the same dtype
Y_test = Y_test.map({'Indoor': 0, 'Outdoor': 1}).astype(float)

# Append a trailing axis of size 1: (samples, features) -> (samples, features, 1)
X_train = np.expand_dims(np.array(X_train), axis=-1)
X_test = np.expand_dims(np.array(X_test), axis=-1)

print(f'Training data shape: {X_train.shape}')
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=32)

# Cross-validate the wrapped model on the training split
scores = cross_val_score(model, X_train, Y_train, cv=10)
print(f"Cross-validation scores: {scores}")

# Save the model
model_path = '../Daten/lstm_classifier.joblib'
joblib.dump(model, model_path)
print(f'Saved model to {model_path}')


Training data shape: (151, 23, 1)
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15




INFO:tensorflow:Assets written to: ram://e657ba795e914b5fadaeae7f0b0f426f/assets


INFO:tensorflow:Assets written to: ram://e657ba795e914b5fadaeae7f0b0f426f/assets


Saved model to ../Daten/lstm_classifier.joblib


## Validate the LSTM Model

In [266]:
# Rebuild the averaged validation rows together with their descriptions
validation_files = [name for name in os.listdir(validation_files_directory) if name.endswith('.csv')]

validation_averaged_data = pd.DataFrame()
description_rows = []
for file in validation_files:
    # preprocess_data drops the descriptive columns, so they are read from the raw file here
    df = pd.read_csv(validation_files_directory + file, sep=';')
    description_rows.append({
        'locationDescription': df['locationDescription'][0],
        'people': df['people'][0],
        'file': file,
    })

    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)

validation_description = pd.DataFrame(description_rows)

# Encode the labels as Indoor -> 0 / Outdoor -> 1, then cast the whole frame (label included) to float
validation_averaged_data['label'] = validation_averaged_data['label'].map({'Indoor': 0, 'Outdoor': 1})
validation_averaged_data = validation_averaged_data.astype(float)

Y_validation = validation_averaged_data['label'].astype(float)

# Append a trailing axis of size 1: (samples, features) -> (samples, features, 1)
X_validation = np.expand_dims(np.array(validation_averaged_data.drop(columns=['label'])), axis=-1)

# Load the model stored at model_path
# NOTE(review): relies on model_path still naming the LSTM file set by the training cell - confirm cell order
model = joblib.load(model_path)

# Class probabilities for every validation file
validation_predictions = model.predict_proba(X_validation)

# Print every file with its label and predicted probabilities (no confidence filter is applied)
for label, description, people, prediction, file in zip(
    validation_averaged_data['label'],
    validation_description['locationDescription'],
    validation_description['people'],
    validation_predictions,
    validation_description['file'],
):
    print(f'Label: {label}, Prediction: {prediction}, Description: {description}, People: {people},  File: {file}')

2024-04-03 10:57:19.236880: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://055adb36f5e24116b1b1a5f162e4e6d0: INVALID_ARGUMENT: ram://055adb36f5e24116b1b1a5f162e4e6d0 is a directory.


Label: 1.0, Prediction: [0.01758427 0.9824157 ], Description: Überdacht, People: viel,  File: Outdoor_2024-03-30 20_23_34.csv
Label: 0.0, Prediction: [9.994553e-01 5.447234e-04], Description: Kein Fenster, People: keine,  File: Indoor_2024-04-01 17_18_10.csv
Label: 0.0, Prediction: [9.9946827e-01 5.3180743e-04], Description: Nähe Fenster, People: weniger als 5,  File: Indoor_2024-04-01 16_33_22.csv
Label: 1.0, Prediction: [0.02851426 0.9714857 ], Description: Häuserschlucht, People: viel,  File: Outdoor_2024-03-29 13_46_20.csv
Label: 0.0, Prediction: [9.9932563e-01 6.7434151e-04], Description: Raummitte, People: viel,  File: Indoor_2024-03-29 13_43_31.csv
Label: 0.0, Prediction: [0.9889065 0.0110935], Description: Nähe Fenster, People: keine,  File: Indoor_2024-04-01 17_27_27.csv
Label: 0.0, Prediction: [9.994585e-01 5.415267e-04], Description: Raummitte, People: weniger als 5,  File: Indoor_2024-03-29 13_05_59.csv
Label: 1.0, Prediction: [0.01238049 0.9876195 ], Description: Häusersch