# Data Preprocessing, Averaging over All CSV Files, and Saving the Averaged Data into a New CSV File

In [26]:
import os
import pandas as pd
import json 
from collections import Counter

# extract the code into a function called preprocess_data so that we can use it later for new data
def preprocess_data(averaged_df, directory, file):
    """Collapse one recording CSV into a single averaged row and append it.

    The file is read with ``;`` as separator, a fixed set of columns is
    dropped, the first five rows are discarded, and the JSON columns
    ``satellites``, ``bluetoothDevices`` and ``wifiDevices`` are turned into
    per-row summary statistics. All numeric columns are then averaged over the
    remaining rows; the ``label`` of the first remaining row is kept and placed
    as the first column.

    Args:
        averaged_df: DataFrame holding the rows produced by earlier calls
            (an empty DataFrame for the first call).
        directory: Folder that contains ``file``.
        file: File name of the CSV inside ``directory``.

    Returns:
        A new DataFrame: ``averaged_df`` with the averaged row appended and
        the index renumbered.

    Raises:
        IndexError: If no rows are left after discarding the leading rows.
    """
    df = pd.read_csv(os.path.join(directory, file), sep=';')

    # Drop unnecessary and outdated columns; columns that are absent in a
    # given file are skipped silently.
    deleted_columns = [
        'timeStampNetwork', 'timeStampGPS', 'locationDescription', 'people',
        'latitudeGPS', 'longitudeGPS', 'latitudeNetwork', 'longitudeNetwork',
        'minCn0GPS', 'maxCn0GPS', 'meanCn0GPS',
        'minCn0Bluetooth', 'maxCn0Bluetooth',
        'minCn0Wifi', 'maxCn0Wifi', 'meanCn0Wifi',
        'bAccuracyNetwork', 'speedAccuracyNetwork', 'cellType',
        'networkLocationType', 'hAccuracyNetwork', 'vAccuracyNetwork',
        'nrBlDevices', 'hAccuracyGPS',
        'minCn0Bl', 'meanCn0Bl', 'maxCn0Bl',
        'bAccuracyGPS', 'speedAccuracyGPS', 'vAccuracyGPS', 'nrWifiDevices',
    ]
    df = df.drop(columns=deleted_columns, errors='ignore')

    # Remove the first rows and restart the index at 0
    removed_rows = 5
    df = df.iloc[removed_rows:].reset_index(drop=True)
    if df.empty:
        raise IndexError(
            f'{file}: no rows left after discarding the first {removed_rows} rows'
        )

    all_stats = ('min', 'max', 'mean', 'median', 'mode', 'std', 'range')
    # NOTE(review): unlike satellites and bluetooth, no std column is produced
    # for wifi. Kept that way so the feature layout stays unchanged - confirm
    # whether the omission is intentional.
    wifi_stats = ('min', 'max', 'mean', 'median', 'mode', 'range')

    _add_signal_stats(df, 'satellites', 'cn0', 'satellite_cn0', all_stats)
    _add_signal_stats(df, 'bluetoothDevices', 'rssi', 'bluetooth_rssi', all_stats)
    _add_signal_stats(df, 'wifiDevices', 'level', 'wifi_rssi', wifi_stats)

    # The raw JSON columns are no longer needed once the statistics exist
    df = df.drop(columns=['satellites', 'bluetoothDevices', 'wifiDevices'])

    # Keep the label of the first row, average every numeric column.
    # numeric_only=True skips any remaining text column instead of raising.
    label = df['label'].iloc[0]
    averaged_row = df.drop(columns=['label']).mean(numeric_only=True).to_frame().T

    # Place the label at the beginning
    averaged_row.insert(0, 'label', label)

    return pd.concat([averaged_df, averaged_row], ignore_index=True)


def _signal_stats(values):
    """Return summary statistics of a list of readings; all 0 for an empty list."""
    if not values:
        return dict.fromkeys(
            ('min', 'max', 'mean', 'median', 'mode', 'std', 'range'), 0
        )
    series = pd.Series(values)
    lowest, highest = series.min(), series.max()
    modes = series.mode()
    return {
        'min': lowest,
        'max': highest,
        'mean': series.mean(),
        'median': series.median(),
        # mode() can come back empty (e.g. only missing values); fall back to 0
        'mode': modes[0] if not modes.empty else 0,
        'std': series.std(),
        'range': highest - lowest,
    }


def _add_signal_stats(df, json_column, value_key, prefix, stat_names):
    """Parse the JSON lists in json_column and add one '<prefix>_<stat>' column per statistic."""
    per_row = [
        _signal_stats([entry[value_key] for entry in json.loads(raw)])
        for raw in df[json_column]
    ]
    for stat in stat_names:
        df[f'{prefix}_{stat}'] = [row[stat] for row in per_row]

def safe_delete(df, columns):
    """Return df without the given columns, ignoring names it does not contain.

    Args:
        df: DataFrame to remove columns from (not modified).
        columns: Iterable of column names; unknown or repeated names are fine.

    Returns:
        A DataFrame without the listed columns, or ``df`` itself when none of
        the names is present.
    """
    # dict.fromkeys removes repeated names while keeping their order
    present = [name for name in dict.fromkeys(columns) if name in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

# Folder with the raw recordings and path of the averaged result file
files_directory = '../Daten/firsttry/'
averaged_path = '../Daten/averaged_data.csv'

# Delete a result file left over from an earlier run.
# NOTE(review): none of the visible cells writes averaged_path again -
# confirm whether a to_csv call is missing.
if os.path.exists(averaged_path):
    os.remove(averaged_path)
    print(f"Removed {averaged_path}")

# Every CSV file in the recordings folder
csv_files = [name for name in os.listdir(files_directory) if name.endswith('.csv')]
print(f'Found {len(csv_files)} CSV files')

# Split the file names by their 'Indoor' / 'Outdoor' prefix (only used for the count below)
indoor_files, outdoor_files = (
    [name for name in csv_files if name.startswith(prefix)]
    for prefix in ('Indoor', 'Outdoor')
)
print(f'Found {len(indoor_files)} indoor files and {len(outdoor_files)} outdoor files')

# One averaged row per file, appended to a growing table
averaged_data = pd.DataFrame()
for file in csv_files:
    averaged_data = preprocess_data(averaged_data, files_directory, file)

# Order the rows by label and display the table
averaged_data = averaged_data.sort_values(by='label')
averaged_data

Found 174 CSV files
Found 96 indoor files and 78 outdoor files


Unnamed: 0,label,cellStrength,nrSatellitesInView,nrSatellitesInFix,satellite_cn0_min,satellite_cn0_max,satellite_cn0_mean,satellite_cn0_median,satellite_cn0_mode,satellite_cn0_std,...,bluetooth_rssi_median,bluetooth_rssi_mode,bluetooth_rssi_std,bluetooth_rssi_range,wifi_rssi_min,wifi_rssi_max,wifi_rssi_mean,wifi_rssi_median,wifi_rssi_mode,wifi_rssi_range
0,Indoor,3.000000,43.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-91.750000,-98.090909,23.236532,57.090909,-88.818182,-46.090909,-71.632172,-73.727273,-88.818182,42.727273
119,Indoor,3.913043,37.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-86.500000,-91.608696,12.470589,54.000000,-91.000000,-75.000000,-81.315789,-81.000000,-82.000000,16.000000
118,Indoor,3.739130,43.478261,4.739130,4.139130,5.182609,4.578923,4.450000,4.139130,0.414395,...,-89.260870,-90.521739,9.234602,28.913043,-90.000000,-70.000000,-80.428571,-82.500000,-86.000000,20.000000
117,Indoor,3.413793,47.965517,14.551724,16.444828,32.248276,23.493151,22.948276,18.851724,4.684899,...,-87.137931,-95.965517,12.347439,48.379310,-86.965517,-65.344828,-76.452065,-74.965517,-85.482759,21.620690
115,Indoor,1.473684,46.868421,24.631579,16.278947,44.102632,26.469973,24.807895,22.247368,7.521006,...,-74.092105,-81.105263,13.895173,40.500000,-86.789474,-64.000000,-73.657895,-71.210526,-82.526316,22.789474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,Outdoor,3.285714,45.000000,25.476190,17.385714,43.690476,27.899786,27.269048,25.104762,6.844320,...,-90.547619,-91.904762,14.953090,45.000000,-88.000000,-73.000000,-83.461538,-85.000000,-87.000000,15.000000
98,Outdoor,4.000000,51.000000,30.458333,20.216667,45.758333,33.752564,33.972917,31.233333,6.260535,...,-91.479167,-93.000000,16.794869,57.000000,-93.916667,-73.583333,-85.896465,-87.000000,-93.250000,20.333333
97,Outdoor,3.000000,55.692308,23.846154,18.296154,41.330769,30.886022,31.817308,25.180769,7.092339,...,-95.384615,-96.923077,11.538678,36.461538,-90.192308,-80.538462,-85.980769,-86.596154,-89.884615,9.653846
95,Outdoor,4.000000,52.000000,30.920000,18.628000,46.428000,34.935632,35.710000,33.048000,7.279229,...,-92.480000,-93.000000,11.675512,59.520000,-94.800000,-59.600000,-83.379705,-85.460000,-91.320000,35.200000


# Train the Random Forest Classifier

## Randomize and Split the Data for Random Forest Classifier

In [27]:
from sklearn.model_selection import train_test_split

# Shuffle all rows, then renumber the index from 0
averaged_data = averaged_data.sample(frac=1)
averaged_data = averaged_data.reset_index(drop=True)

# Target column (Y) and feature columns (X = everything except the label)
Y = averaged_data['label']
X = averaged_data.drop(columns=['label'])

# Hold out 20 % of the rows as test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')

Training data shape: (139, 23)
Testing data shape: (35, 23)


## Train a Random Forest Classifier

In [28]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with default hyperparameters (no settings are passed),
# fitted on the training split
clf = RandomForestClassifier().fit(X_train, Y_train)

# 5-fold cross-validation on the training split
scores = cross_val_score(clf, X_train, Y_train, cv=5)
print(f"Cross-validation scores: {scores}")

# Accuracy of the fitted classifier on the held-out test split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy}')

Cross-validation scores: [0.92857143 0.96428571 0.92857143 0.89285714 0.92592593]
Accuracy: 1.0


## Get the Feature Importances

In [29]:
# Table of feature names with the importance the fitted forest assigns them,
# most important first
feature_importances = (
    pd.DataFrame({'feature': X.columns, 'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,feature,importance
5,satellite_cn0_mean,0.169718
6,satellite_cn0_median,0.138555
4,satellite_cn0_max,0.129626
2,nrSatellitesInFix,0.108872
9,satellite_cn0_range,0.104865
7,satellite_cn0_mode,0.09378
8,satellite_cn0_std,0.091682
3,satellite_cn0_min,0.039882
13,bluetooth_rssi_median,0.02057
0,cellStrength,0.017656


## Save the model and try it out with new data

In [30]:
import joblib

# Save the fitted random forest (clf) to disk with joblib so it can be
# reloaded later via joblib.load(model_path)
model_path = '../Daten/random_forest_classifier.joblib'
joblib.dump(clf, model_path)
print(f'Saved model to {model_path}')

Saved model to ../Daten/random_forest_classifier.joblib


# Create a bagging classifier, run the Random Forest Classifier with different random states and average the results

In [31]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
# Base estimator: a random forest with 100 trees
base_cls = RandomForestClassifier(n_estimators=100)

# Bagging ensemble of 10 such forests, fitted on the training split
bagging_cls = BaggingClassifier(estimator=base_cls, n_estimators=10).fit(X_train, Y_train)

# Accuracy on the held-out test split
y_pred = bagging_cls.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Store the fitted ensemble on disk
model_path = '../Daten/bagging_classifier.joblib'
joblib.dump(bagging_cls, model_path)
print(f'Saved model to {model_path}')


Accuracy: 1.0
Saved model to ../Daten/bagging_classifier.joblib


# Test the model in this notebook with new data from the validation folder

In [32]:
validation_files_directory = '../Daten/validation/'
validation_averaged_path = '../Daten/validation_averaged_data.csv'
model_path = '../Daten/random_forest_classifier.joblib'

# Load the saved random forest
model = joblib.load(model_path)

validation_files = [file for file in os.listdir(validation_files_directory) if file.endswith('.csv')]

# For every validation file: remember its description (taken from the first
# row) and append its averaged feature row. The description records are
# collected in a list and turned into a DataFrame once, instead of
# concatenating a one-row frame per file.
validation_averaged_data = pd.DataFrame()
description_records = []
for file in validation_files:
    df = pd.read_csv(os.path.join(validation_files_directory, file), sep=';')
    description_records.append({
        'locationDescription': df['locationDescription'][0],
        'people': df['people'][0],
        'file': file,
    })
    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)

validation_description = pd.DataFrame(description_records)

# Predict labels and class probabilities from the same feature frame
validation_features = validation_averaged_data.drop(columns=['label'])
validation_predictions = model.predict(validation_features)
validation_probabilities = model.predict_proba(validation_features)

# Print only the files whose prediction differs from the recorded label
for label, description, people, prediction, probability, file in zip(
    validation_averaged_data['label'],
    validation_description['locationDescription'],
    validation_description['people'],
    validation_predictions,
    validation_probabilities,
    validation_description['file'],
):
    if label != prediction:
        print(f'Label: {label}, Prediction: {prediction}, Description: {description}, People: {people},  Probability: {probability}, File: {file}')
   

Label: Indoor, Prediction: Outdoor, Description: Nähe Fenster, People: viel,  Probability: [0.39 0.61], File: Indoor_2024-03-29 13_41_40.csv


# Test validation files with bagging classifier

In [33]:
validation_files_directory = '../Daten/validation/'
validation_averaged_path = '../Daten/validation_averaged_data.csv'
model_path = '../Daten/bagging_classifier.joblib'

# Load the saved bagging classifier
model = joblib.load(model_path)

validation_files = [file for file in os.listdir(validation_files_directory) if file.endswith('.csv')]

# For every validation file: remember its description (taken from the first
# row) and append its averaged feature row. The description records are
# collected in a list and turned into a DataFrame once, instead of
# concatenating a one-row frame per file.
validation_averaged_data = pd.DataFrame()
description_records = []
for file in validation_files:
    df = pd.read_csv(os.path.join(validation_files_directory, file), sep=';')
    description_records.append({
        'locationDescription': df['locationDescription'][0],
        'people': df['people'][0],
        'file': file,
    })
    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)

validation_description = pd.DataFrame(description_records)

# Predict labels and class probabilities from the same feature frame
validation_features = validation_averaged_data.drop(columns=['label'])
validation_predictions = model.predict(validation_features)
validation_probabilities = model.predict_proba(validation_features)

# Print only the files whose prediction differs from the recorded label
for label, description, people, prediction, probability, file in zip(
    validation_averaged_data['label'],
    validation_description['locationDescription'],
    validation_description['people'],
    validation_predictions,
    validation_probabilities,
    validation_description['file'],
):
    if label != prediction:
        print(f'Label: {label}, Prediction: {prediction}, Description: {description}, People: {people}, Probability: {probability}, File: {file}')
        

Label: Indoor, Prediction: Outdoor, Description: Nähe Fenster, People: viel, Probability: [0.306 0.694], File: Indoor_2024-03-29 13_41_40.csv


## Setup LSTM Model

In [34]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the LSTM model: three stacked LSTM layers (128 units each), a dense
# layer with 32 units and a 2-way softmax output, with dropout of 0.2 after
# each of the first four layers. Every feature column is fed as one time step
# with a single value.
model = Sequential([
    LSTM(128, input_shape=(X_train.shape[1], 1), return_sequences=True),
    Dropout(0.2),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax'),
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

# Cast the features to float and encode the labels as Indoor -> 0, Outdoor -> 1.
# NOTE: this overwrites X_train / X_test / Y_train / Y_test; running the cell a
# second time maps the already-encoded labels again.
label_to_index = {'Indoor': 0, 'Outdoor': 1}

X_train = X_train.astype(float)
Y_train = Y_train.map(label_to_index).astype(float)

X_test = X_test.astype(float)
# Fixed: the original cast Y_train a second time here and left Y_test uncast
Y_test = Y_test.map(label_to_index).astype(float)

# Reshape the feature tables to (samples, features, 1) as expected by the LSTM input
X_train = np.array(X_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Train for 10 epochs; the test split doubles as Keras validation data
model.fit(X_train, Y_train, epochs=10, validation_data=(X_test, Y_test), batch_size=32)

# Print the accuracy of the model on the averaged test data
print(f'Accuracy: {model.evaluate(X_test, Y_test)[1]}')



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 23, 128)           66560     
                                                                 
 dropout_4 (Dropout)         (None, 23, 128)           0         
                                                                 
 lstm_4 (LSTM)               (None, 23, 128)           131584    
                                                                 
 dropout_5 (Dropout)         (None, 23, 128)           0         
                                                                 
 lstm_5 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 32)               

## Validate the LSTM Model

In [35]:
# Encode the string labels as numbers: Indoor -> 0, Outdoor -> 1
validation_averaged_data['label'] = validation_averaged_data['label'].map({'Indoor': 0, 'Outdoor': 1})

# Cast every column (the encoded label included) to float
validation_averaged_data = validation_averaged_data.astype(float)

# Target vector and feature matrix; the features get a trailing axis of
# length 1 so the shape is (samples, features, 1)
Y_validation = validation_averaged_data['label'].astype(float)
X_validation = np.array(validation_averaged_data.drop(columns=['label']))
X_validation = X_validation.reshape(X_validation.shape[0], X_validation.shape[1], 1)

# One pair of class scores per validation row
validation_predictions = model.predict(X_validation)

# Print every row with its label, description and the class with the higher
# score (index 0 = Indoor, index 1 = Outdoor; Outdoor unless Indoor is strictly higher)
for label, description, people, prediction in zip(
    validation_averaged_data['label'],
    validation_description['locationDescription'],
    validation_description['people'],
    validation_predictions,
):
    if not (prediction[0] > prediction[1]):
        print(f'Label: {label}, Description: {description}, People: {people}, Prediction: Outdoor, Probability: {prediction[1]}')
    else:
        print(f'Label: {label}, Description: {description}, People: {people}, Prediction: Indoor, Probability: {prediction[0]}')

Label: 1.0, Description: Überdacht, People: viel, Prediction: Outdoor, Probability: 0.948621928691864
Label: 1.0, Description: Häuserschlucht, People: viel, Prediction: Outdoor, Probability: 0.9332685470581055
Label: 0.0, Description: Raummitte, People: viel, Prediction: Indoor, Probability: 0.9963526725769043
Label: 0.0, Description: Raummitte, People: weniger als 5, Prediction: Indoor, Probability: 0.9975817203521729
Label: 1.0, Description: Frei, People: viel, Prediction: Outdoor, Probability: 0.9665589928627014
Label: 0.0, Description: Nähe Fenster, People: viel, Prediction: Indoor, Probability: 0.6442140936851501
Label: 1.0, Description: Frei, People: keine, Prediction: Outdoor, Probability: 0.9665058851242065
Label: 1.0, Description: Überdacht, People: keine, Prediction: Outdoor, Probability: 0.8634873032569885
Label: 1.0, Description: Überdacht, People: viel, Prediction: Outdoor, Probability: 0.9646337032318115
Label: 0.0, Description: Raummitte, People: keine, Prediction: Indoo