# Data Preprocessing, Averaging over all csv-files and Saving the averaged data into new csv-file

In [929]:
import os
import pandas as pd
import json 
from collections import Counter

# extract the code into a function called preprocess_data so that we can use it later for new data
def preprocess_data(averaged_df, directory, file):
    df = pd.read_csv(directory + file, sep=';')
    # Drop unnecessary columns like timeStampNetwork and timeStampGPS
    df = df.drop(columns=['timeStampNetwork', 'timeStampGPS', 'bAccuracyNetwork', 'speedAccuracyNetwork', 'cellType', 'networkLocationType', 'hAccuracyNetwork', 'vAccuracyNetwork', 'speedAccuracyNetwork', 'bAccuracyNetwork', 'nrBlDevices', 'hAccuracyGPS', 'maxCn0Bl', 'bAccuracyGPS', 'speedAccuracyGPS', 'vAccuracyGPS', 'nrWifiDevices'])
    
    # Drop 'locationDescription' and 'people' if they exist
    if 'locationDescription' in df.columns:
        df = df.drop(columns=['locationDescription'])
    if 'people' in df.columns:
        df = df.drop(columns=['people'])
    if 'latitudeGPS' in df.columns:
        df = df.drop(columns=['latitudeGPS'])
    if 'longitudeGPS' in df.columns:
        df = df.drop(columns=['longitudeGPS'])
    if 'latitudeNetwork' in df.columns:
        df = df.drop(columns=['latitudeNetwork'])
    if 'longitudeNetwork' in df.columns:
        df = df.drop(columns=['longitudeNetwork'])
        
    # TODO remove this again after making changes in app
    if df['meanCn0Wifi'].isnull().values.any():
        df['meanCn0Wifi'] = 0
    
    # Remove first x rows and reset begin index to 0
    removedRows = 3
    df = df.iloc[removedRows:]
    df = df.reset_index(drop=True)
    
    # Load satellites json
    df['satellites'] = df['satellites'].apply(lambda x: json.loads(x))
    
    # Add cn0 column for easier computation of statistics
    df['satellite_cn0'] = df['satellites'].apply(lambda x: [sat['cn0'] for sat in x])
    
    # Calculate median, mode, variance, standard deviation and range of the satellite cn0
    df['satellite_cn0_median'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).median() if not pd.Series(x).empty else 0)
    df['satellite_cn0_mode'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).mode()[0] if not pd.Series(x).mode().empty else 0)
    df['satellite_cn0_std'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).std() if not pd.Series(x).empty else 0)
    df['satellite_cn0_range'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).max() - pd.Series(x).min() if not pd.Series(x).empty else 0)
    
    # load the bluetooth json and load rssi into a new column
    df['bluetoothDevices'] = df['bluetoothDevices'].apply(lambda x: json.loads(x))
    df['bluetooth_rssi'] = df['bluetoothDevices'].apply(lambda x: [device['rssi'] for device in x])
    
     # Calculate statistical figures for the bluetooth devices
    df['bluetooth_rssi_median'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).median() if not pd.Series(x).empty else 0)
    df['bluetooth_rssi_mode'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).mode()[0] if not pd.Series(x).mode().empty else 0)
    df['bluetooth_rssi_std'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).std() if not pd.Series(x).empty else 0)
    df['bluetooth_rssi_range'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).max() - pd.Series(x).min() if not pd.Series(x).empty else 0)
    
    # load the wifi json and load rssi into a new column
    df['wifiDevices'] = df['wifiDevices'].apply(lambda x: json.loads(x))
    df['wifi_rssi'] = df['wifiDevices'].apply(lambda x: [device['level'] for device in x])
    
     # Calculate statistical figures for the wifi devices
    df['wifi_rssi_median'] = df['wifi_rssi'].apply(lambda x: pd.Series(x).median() if not pd.Series(x).empty else 0)
    df['wifi_rssi_mode'] = df['wifi_rssi'].apply(lambda x: pd.Series(x).mode()[0] if not pd.Series(x).mode().empty else 0)
    df['wifi_rssi_range'] = df['wifi_rssi'].apply(lambda x: pd.Series(x).max() - pd.Series(x).min() if not pd.Series(x).empty else 0)
    
    # Drop list columns
    df.drop(columns=['satellites', 'bluetoothDevices', 'wifiDevices', 'satellite_cn0', 'bluetooth_rssi', 'wifi_rssi'], inplace=True)
    
     # Average over all columns for numeric values and take the first of non-numeric to have a single row
    df_label = df.iloc[0]['label']
    
    df.drop(columns=['label'], inplace=True)
    
    df = df.mean().to_frame().T
    
    df['label'] = df_label
    
    #place label at the beginning
    cols = list(df.columns)
    cols = [cols[-1]] + cols[:-1]
    df = df[cols]
    
    averaged_df = pd.concat([averaged_df, df], ignore_index=True)
    return averaged_df

files_directory = '../Daten/firsttry/'
averaged_path = '../Daten/averaged_data.csv'

if os.path.exists(averaged_path):
    os.remove(averaged_path)
    print(f"Removed {averaged_path}")

# Get list of CSV files
csv_files = [file for file in os.listdir(files_directory) if file.endswith('.csv')]
print(f'Found {len(csv_files)} CSV files')

# Get nr of files, where name starts with indoor and outdoor
indoor_files = [file for file in csv_files if file.startswith('Indoor')]
outdoor_files = [file for file in csv_files if file.startswith('Outdoor')]
print(f'Found {len(indoor_files)} indoor files and {len(outdoor_files)} outdoor files')

averaged_data = pd.DataFrame()

for file in csv_files:
    averaged_data = preprocess_data(averaged_data, files_directory, file)

averaged_data.sort_values(by=['label'], inplace=True)
averaged_data

Found 174 CSV files
Found 96 indoor files and 78 outdoor files


Unnamed: 0,label,cellStrength,nrSatellitesInView,nrSatellitesInFix,minCn0GPS,meanCn0GPS,maxCn0GPS,minCn0Bl,meanCn0Bl,minCn0Wifi,...,satellite_cn0_mode,satellite_cn0_std,satellite_cn0_range,bluetooth_rssi_median,bluetooth_rssi_mode,bluetooth_rssi_std,bluetooth_rssi_range,wifi_rssi_median,wifi_rssi_mode,wifi_rssi_range
0,Indoor,3.000000,42.958333,0.000000,0.000000,0.000000,0.000000,-98.000000,-78.742362,-88.500000,...,0.000000,0.000000,0.000000,-90.416667,-98.000000,23.345390,57.000000,-73.875000,-87.833333,40.833333
119,Indoor,3.920000,37.000000,0.000000,0.000000,0.000000,0.000000,-99.000000,-83.374629,-91.000000,...,0.000000,0.000000,0.000000,-86.540000,-91.800000,12.348973,54.000000,-81.000000,-82.000000,16.000000
118,Indoor,3.680000,42.720000,4.360000,3.808000,14.240000,16.640000,-96.000000,-86.117511,-90.000000,...,3.808000,0.381244,0.960000,-89.160000,-90.960000,9.139573,28.840000,-82.500000,-86.000000,20.000000
117,Indoor,3.387097,47.967742,14.258065,16.603226,22.548387,31.354839,-99.000000,-84.553654,-86.903226,...,18.854839,4.605385,15.477419,-87.129032,-95.645161,12.264947,48.032258,-74.903226,-85.451613,21.387097
115,Indoor,1.500000,46.875000,24.125000,16.612500,25.700000,43.750000,-89.475000,-71.124643,-86.875000,...,22.407500,7.482374,27.510000,-74.087500,-81.500000,13.924079,40.475000,-71.412500,-82.550000,22.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,Outdoor,3.260870,45.000000,24.869565,17.856522,27.130435,43.739130,-99.000000,-84.961734,-88.000000,...,25.152174,6.823607,25.934783,-90.586957,-91.739130,14.618854,44.565217,-85.000000,-87.000000,15.000000
98,Outdoor,4.000000,51.000000,30.230769,20.088462,32.923077,45.115385,-96.846154,-85.467216,-93.769231,...,31.423077,6.283073,25.676923,-91.442308,-92.923077,16.902139,56.846154,-86.923077,-93.153846,20.384615
97,Outdoor,3.000000,55.642857,23.714286,18.457143,30.000000,41.321429,-98.357143,-90.834902,-90.071429,...,25.742857,7.052654,23.150000,-95.357143,-96.928571,11.781860,36.357143,-86.607143,-89.750000,9.500000
95,Outdoor,4.000000,52.000000,30.666667,18.970370,34.259259,45.814815,-99.333333,-89.196863,-94.814815,...,33.470370,7.170785,27.451852,-92.462963,-93.000000,11.826576,59.333333,-85.500000,-91.518519,34.740741


# Train the Random Forest Classifier

## Randomize and Split the Data for Random Forest Classifier

In [930]:
from sklearn.model_selection import train_test_split

# Randomize the data
averaged_data = averaged_data.sample(frac=1).reset_index(drop=True)

# drop the label column
X = averaged_data.drop(columns=['label'], axis=1)
Y = averaged_data['label']

# Split the data into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')

Training data shape: (139, 22)
Testing data shape: (35, 22)


## Train a Random Forest Classifier

In [931]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train the Random Forest Classifier with regularization
clf = RandomForestClassifier()

# Train the classifier
clf.fit(X_train, Y_train)

# Cross-validation
scores = cross_val_score(clf, X_train, Y_train, cv=5)
print(f"Cross-validation scores: {scores}")

# Predict the labels of the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy}')

Cross-validation scores: [0.89285714 0.92857143 0.92857143 0.96428571 0.96296296]
Accuracy: 0.9714285714285714


## Get the Feature Importances

In [932]:
# Print feature importances of the selected features
feature_importances = clf.feature_importances_
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,feature,importance
12,satellite_cn0_mode,0.192766
14,satellite_cn0_range,0.148656
5,maxCn0GPS,0.131702
11,satellite_cn0_median,0.119207
4,meanCn0GPS,0.082761
13,satellite_cn0_std,0.079774
2,nrSatellitesInFix,0.066627
3,minCn0GPS,0.043555
0,cellStrength,0.031373
1,nrSatellitesInView,0.02161


## Save the model and tryout the model with new data

In [933]:
import joblib

# Save the model
model_path = '../Daten/random_forest_classifier.joblib'
joblib.dump(clf, model_path)
print(f'Saved model to {model_path}')

Saved model to ../Daten/random_forest_classifier.joblib


# Test the model with new data in validation folder in this notebook

In [934]:
validation_files_directory = '../Daten/validation/'
validation_averaged_path = '../Daten/validation_averaged_data.csv'
model_path = '../Daten/random_forest_classifier.joblib'

# Load the model
model = joblib.load(model_path)

validation_averaged_data = pd.DataFrame()
validation_description = pd.DataFrame()
validation_files = [file for file in os.listdir(validation_files_directory) if file.endswith('.csv')]

for file in validation_files:
    df = pd.read_csv(validation_files_directory + file, sep=';')
    locationDescription = df['locationDescription'][0]
    locationPeople = df['people'][0]
    
    validation_description = pd.concat([validation_description, pd.DataFrame({'locationDescription': [locationDescription], 'people': [locationPeople]})], ignore_index=True)

    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)

# Predict the labels of the validation data
validation_predictions = model.predict(validation_averaged_data.drop(columns=['label']))
validation_probabilities = model.predict_proba(validation_averaged_data.drop(columns=['label']))

#print label and the corresponding prediction
for label, description, people, prediction, probability in zip(validation_averaged_data['label'], validation_description['locationDescription'], validation_description['people'], validation_predictions, validation_probabilities):
    if prediction == 'Indoor':
        print(f'Label: {label}, Description: {description}, People: {people}, Prediction: {prediction}, Probability: {probability[0]}')
    else:
        print(f'Label: {label}, Description: {description}, People: {people}, Prediction: {prediction}, Probability: {probability[1]}')
        

Label: Outdoor, Description: Häuserschlucht, People: viel, Prediction: Outdoor, Probability: 0.9
Label: Indoor, Description: Raummitte, People: viel, Prediction: Indoor, Probability: 1.0
Label: Indoor, Description: Raummitte, People: weniger als 5, Prediction: Indoor, Probability: 1.0
Label: Indoor, Description: Nähe Fenster, People: viel, Prediction: Outdoor, Probability: 0.64


## Setup LSTM Model

In [935]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the LSTM model
model = Sequential()

model.add(LSTM(128, input_shape=(X_train.shape[1], 1), return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128))
model.add(Dropout(0.2))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

#convert all columns to float except the label column
X_train = X_train.astype(float)
Y_train = Y_train.map({'Indoor': 0, 'Outdoor': 1})
Y_train = Y_train.astype(float)

X_test = X_test.astype(float)
Y_test = Y_test.map({'Indoor': 0, 'Outdoor': 1})
Y_train = Y_train.astype(float)

#print accuracy of the model with the averaged data
X_train = np.array(X_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

model.fit(X_train, Y_train, epochs=10, validation_data=(X_test, Y_test), batch_size=32)

print(f'Accuracy: {model.evaluate(X_test, Y_test)[1]}')



Model: "sequential_39"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_117 (LSTM)             (None, 22, 128)           66560     
                                                                 
 dropout_157 (Dropout)       (None, 22, 128)           0         
                                                                 
 lstm_118 (LSTM)             (None, 22, 128)           131584    
                                                                 
 dropout_158 (Dropout)       (None, 22, 128)           0         
                                                                 
 lstm_119 (LSTM)             (None, 128)               131584    
                                                                 
 dropout_159 (Dropout)       (None, 128)               0         
                                                                 
 dense_80 (Dense)            (None, 32)              

## Validate the LSTM Model

In [936]:
#map the label column to 0 and 1
validation_averaged_data['label'] = validation_averaged_data['label'].map({'Indoor': 0, 'Outdoor': 1})

#convert all columns to float except the label column
validation_averaged_data = validation_averaged_data.astype(float)

X_validation = validation_averaged_data.drop(columns=['label'])
Y_validation = validation_averaged_data['label']

X_validation = np.array(X_validation)
X_validation = np.reshape(X_validation, (X_validation.shape[0], X_validation.shape[1], 1))

Y_validation = Y_validation.astype(float)

#predict the labels of the validation data for each data point
validation_predictions = model.predict(X_validation)

#print label and the corresponding prediction
for label, description, people, prediction in zip(validation_averaged_data['label'], validation_description['locationDescription'], validation_description['people'], validation_predictions):
    if prediction[0] > prediction[1]:
        print(f'Label: {label}, Description: {description}, People: {people}, Prediction: Indoor, Probability: {prediction[0]}')
    else:
        print(f'Label: {label}, Description: {description}, People: {people}, Prediction: Outdoor, Probability: {prediction[1]}')

Label: 1.0, Description: Häuserschlucht, People: viel, Prediction: Outdoor, Probability: 0.9101307988166809
Label: 0.0, Description: Raummitte, People: viel, Prediction: Indoor, Probability: 0.9893389344215393
Label: 0.0, Description: Raummitte, People: weniger als 5, Prediction: Indoor, Probability: 0.9958679676055908
Label: 0.0, Description: Nähe Fenster, People: viel, Prediction: Outdoor, Probability: 0.8353986740112305
