# Data Preprocessing: Averaging over all CSV files and saving the averaged data into a new CSV file

In [116]:
import os
import pandas as pd
import json 
from collections import Counter

# The preprocessing lives in a function so that exactly the same steps can be
# applied later to new data.

# Per-list statistics, in the order in which they become feature columns.
# The column order is part of the feature layout the classifiers are fitted on.
_STAT_NAMES = ('min', 'max', 'mean', 'median', 'mode', 'std', 'range')

# Unnecessary and outdated raw columns; each one is removed only if present.
_DROPPED_COLUMNS = [
    'timeStampNetwork', 'timeStampGPS', 'locationDescription', 'people',
    'latitudeGPS', 'longitudeGPS', 'latitudeNetwork', 'longitudeNetwork',
    'minCn0GPS', 'maxCn0GPS', 'meanCn0GPS', 'minCn0Bluetooth',
    'maxCn0Bluetooth', 'minCn0Wifi', 'maxCn0Wifi', 'meanCn0Wifi',
    'bAccuracyNetwork', 'speedAccuracyNetwork', 'cellType',
    'networkLocationType', 'hAccuracyNetwork', 'vAccuracyNetwork',
    'nrBlDevices', 'hAccuracyGPS', 'minCn0Bl', 'meanCn0Bl', 'maxCn0Bl',
    'bAccuracyGPS', 'speedAccuracyGPS', 'vAccuracyGPS', 'nrWifiDevices',
]


def _signal_stats(values):
    """Return the statistics named in ``_STAT_NAMES`` for one list of numbers.

    An empty list yields 0 for every statistic. A single value yields NaN for
    'std' (sample standard deviation), which the later column mean skips.
    """
    series = pd.Series(values, dtype='float64')
    if series.empty:
        return dict.fromkeys(_STAT_NAMES, 0)
    modes = series.mode()
    lowest, highest = series.min(), series.max()
    return {
        'min': lowest,
        'max': highest,
        'mean': series.mean(),
        'median': series.median(),
        # mode() drops NaN, so it can be empty even if the series is not.
        'mode': modes.iloc[0] if not modes.empty else 0,
        'std': series.std(),
        'range': highest - lowest,
    }


def _add_signal_stats(df, json_column, value_key, prefix, skip=()):
    """Add ``<prefix>_<stat>`` columns computed from a JSON list column.

    Each cell of ``json_column`` is parsed as a JSON list of objects; the
    ``value_key`` entry of every object is collected and summarised.
    Statistics named in ``skip`` are not written.
    """
    per_row = [
        _signal_stats([entry[value_key] for entry in json.loads(cell)])
        for cell in df[json_column]
    ]
    for stat in _STAT_NAMES:
        if stat not in skip:
            df[f'{prefix}_{stat}'] = [stats[stat] for stats in per_row]
    return df


def preprocess_data(averaged_df, directory, file):
    """Reduce one recording to a single averaged feature row and append it.

    Args:
        averaged_df: DataFrame holding the rows produced so far (may be empty).
        directory: Folder containing the recording, with or without a
            trailing path separator.
        file: File name of a ';'-separated CSV inside ``directory``.

    Returns:
        A new DataFrame: ``averaged_df`` plus one row whose first column is
        'label' (taken from the first kept row of the file) followed by the
        column-wise mean of all numeric columns.

    Raises:
        ValueError: If the file has no rows left after the leading rows are
            skipped.
    """
    df = pd.read_csv(os.path.join(directory, file), sep=';')

    # Drop unnecessary and outdated columns; absent ones are ignored.
    df = df.drop(columns=_DROPPED_COLUMNS, errors='ignore')

    # Remove the first rows and restart the index at 0.
    skipped_rows = 3
    df = df.iloc[skipped_rows:].reset_index(drop=True)
    if df.empty:
        raise ValueError(
            f"{file}: no rows left after skipping the first {skipped_rows} rows"
        )

    # Min, max, mean, median, mode, standard deviation and range of the
    # values stored in the JSON list columns.
    df = _add_signal_stats(df, 'satellites', 'cn0', 'satellite_cn0')
    df = _add_signal_stats(df, 'bluetoothDevices', 'rssi', 'bluetooth_rssi')
    # NOTE(review): the wifi statistics have never included 'std', unlike the
    # other two groups. It is left out on purpose here so the feature layout
    # stays the same -- confirm whether the omission was intended.
    df = _add_signal_stats(df, 'wifiDevices', 'level', 'wifi_rssi', skip=('std',))

    # Collapse the recording into one row: keep the first label and average
    # every numeric column. numeric_only=True keeps this working on
    # pandas >= 2.0, where mean() raises on leftover non-numeric columns.
    label = df['label'].iloc[0]
    features = df.drop(columns=['label', 'satellites', 'bluetoothDevices', 'wifiDevices'])
    averaged_row = features.mean(numeric_only=True).to_frame().T

    # Place the label at the beginning.
    averaged_row.insert(0, 'label', label)

    return pd.concat([averaged_df, averaged_row], ignore_index=True)

def safe_delete(df, columns):
    """Return ``df`` without the listed columns, ignoring names it lacks.

    Args:
        df: DataFrame to remove columns from; it is not modified.
        columns: Iterable of column names; repeated names are harmless.

    Returns:
        ``df`` itself when none of the names is present, otherwise a new
        DataFrame with the matching columns removed.
    """
    present = [name for name in dict.fromkeys(columns) if name in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

files_directory = '../Daten/firsttry/'
averaged_path = '../Daten/averaged_data.csv'

# Discard the result file of an earlier run, if there is one
if os.path.exists(averaged_path):
    os.remove(averaged_path)
    print(f"Removed {averaged_path}")

# Collect every CSV recording in the input folder
csv_files = [name for name in os.listdir(files_directory) if name.endswith('.csv')]
print(f'Found {len(csv_files)} CSV files')

# Count the recordings whose file name starts with 'Indoor' / 'Outdoor'
indoor_files = [name for name in csv_files if name.startswith('Indoor')]
outdoor_files = [name for name in csv_files if name.startswith('Outdoor')]
print(f'Found {len(indoor_files)} indoor files and {len(outdoor_files)} outdoor files')

# Build the feature table: one averaged row per recording
averaged_data = pd.DataFrame()
for file in csv_files:
    averaged_data = preprocess_data(averaged_data, files_directory, file)

# Group the rows by label for display
averaged_data = averaged_data.sort_values(by=['label'])
averaged_data

Found 177 CSV files
Found 99 indoor files and 78 outdoor files


Unnamed: 0,label,cellStrength,nrSatellitesInView,nrSatellitesInFix,satellite_cn0_min,satellite_cn0_max,satellite_cn0_mean,satellite_cn0_median,satellite_cn0_mode,satellite_cn0_std,...,bluetooth_rssi_median,bluetooth_rssi_mode,bluetooth_rssi_std,bluetooth_rssi_range,wifi_rssi_min,wifi_rssi_max,wifi_rssi_mean,wifi_rssi_median,wifi_rssi_mode,wifi_rssi_range
0,Indoor,3.000000,42.958333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-90.416667,-98.000000,23.345390,57.000000,-88.500000,-47.666667,-71.843380,-73.875000,-87.833333,40.833333
121,Indoor,3.920000,37.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-86.540000,-91.800000,12.348973,54.000000,-91.000000,-75.000000,-81.315789,-81.000000,-82.000000,16.000000
120,Indoor,3.680000,42.720000,4.360000,3.808000,4.768000,4.212610,4.094000,3.808000,0.381244,...,-89.160000,-90.960000,9.139573,28.840000,-90.000000,-70.000000,-80.428571,-82.500000,-86.000000,20.000000
119,Indoor,3.387097,47.967742,14.258065,16.603226,32.080645,23.502349,22.958065,18.854839,4.605385,...,-87.129032,-95.645161,12.264947,48.032258,-86.903226,-65.516129,-76.459766,-74.903226,-85.451613,21.387097
117,Indoor,4.000000,41.393939,21.242424,15.503030,39.257576,25.837499,24.840909,20.324242,6.997593,...,-76.848485,-86.000000,11.687739,38.636364,-86.000000,-66.000000,-76.000000,-76.000000,-86.000000,20.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,Outdoor,4.000000,46.000000,26.066667,17.606667,38.886667,27.543205,27.590000,25.416667,5.368232,...,-94.400000,-97.000000,15.298340,56.700000,-94.900000,-66.200000,-80.984248,-79.400000,-80.100000,28.700000
38,Outdoor,4.000000,47.000000,28.629630,18.892593,45.033333,33.685858,33.553704,25.759259,7.193455,...,-93.685185,-97.814815,13.860640,59.370370,-93.666667,-79.333333,-86.648077,-86.333333,-88.000000,14.333333
39,Outdoor,3.517241,45.000000,21.379310,16.596552,35.713793,25.511660,25.563793,20.765517,5.255834,...,-94.586207,-96.310345,15.064577,43.413793,-87.206897,-63.034483,-71.379310,-67.172414,-68.655172,24.172414
125,Outdoor,4.000000,47.000000,17.250000,15.800000,42.346429,28.339306,26.571429,20.028571,8.574544,...,-70.428571,-74.214286,12.382702,39.357143,-93.285714,-74.357143,-84.497059,-82.500000,-81.500000,18.928571


# Train the Random Forest Classifier

## Randomize and Split the Data for Random Forest Classifier

In [117]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and renumber them from 0
averaged_data = averaged_data.sample(frac=1)
averaged_data = averaged_data.reset_index(drop=True)

# Separate the target label (Y) from the feature columns (X)
Y = averaged_data['label']
X = averaged_data.drop(columns=['label'])

# Hold out 10 % of the rows as test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')

Training data shape: (159, 23)
Testing data shape: (18, 23)


## Train a Random Forest Classifier

In [118]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a Random Forest with the default hyper-parameters on the training split
clf = RandomForestClassifier().fit(X_train, Y_train)

# 5-fold cross-validation on the training split
scores = cross_val_score(estimator=clf, X=X_train, y=Y_train, cv=5)
print(f"Cross-validation scores: {scores}")

# Accuracy of the fitted forest on the held-out test split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Cross-validation scores: [0.90625    0.9375     0.96875    0.9375     0.96774194]
Accuracy: 0.8888888888888888


## Get the Feature Importances

In [119]:
# Table of the fitted forest's feature importances, most important first
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': clf.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,feature,importance
6,satellite_cn0_median,0.166387
4,satellite_cn0_max,0.161272
9,satellite_cn0_range,0.143178
5,satellite_cn0_mean,0.105602
2,nrSatellitesInFix,0.098595
7,satellite_cn0_mode,0.092267
8,satellite_cn0_std,0.057459
3,satellite_cn0_min,0.039255
20,wifi_rssi_median,0.027629
0,cellStrength,0.018634


## Save the model and try it out with new data

In [120]:
import joblib

# Persist the fitted Random Forest to disk
model_path = '../Daten/random_forest_classifier.joblib'
joblib.dump(value=clf, filename=model_path)
print(f'Saved model to {model_path}')

Saved model to ../Daten/random_forest_classifier.joblib


# Create a bagging classifier, run the Random Forest Classifier with different random states and average the results

In [121]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
# Bag ten Random Forests (100 trees each) and fit the ensemble on the training split
base_cls = RandomForestClassifier(n_estimators=100)
bagging_cls = BaggingClassifier(estimator=base_cls, n_estimators=10).fit(X_train, Y_train)

# Accuracy of the ensemble on the held-out test split
y_pred = bagging_cls.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

# Persist the fitted ensemble to disk
model_path = '../Daten/bagging_classifier.joblib'
joblib.dump(value=bagging_cls, filename=model_path)
print(f'Saved model to {model_path}')


Accuracy: 0.8888888888888888
Saved model to ../Daten/bagging_classifier.joblib


# Test the model in this notebook with new data from the validation folder

In [122]:
validation_files_directory = '../Daten/validation/'
validation_averaged_path = '../Daten/validation_averaged_data.csv'
model_path = '../Daten/random_forest_classifier.joblib'

# Restore the persisted classifier
model = joblib.load(model_path)

validation_files = [name for name in os.listdir(validation_files_directory) if name.endswith('.csv')]

# For every recording keep its description (read from the first row) and its averaged feature row
validation_averaged_data = pd.DataFrame()
description_rows = []
for file in validation_files:
    df = pd.read_csv(validation_files_directory + file, sep=';')
    locationDescription = df['locationDescription'][0]
    locationPeople = df['people'][0]
    description_rows.append({'locationDescription': locationDescription, 'people': locationPeople, 'file': file})

    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)

validation_description = pd.DataFrame(description_rows)

# Predicted label and class probabilities for every recording
validation_features = validation_averaged_data.drop(columns=['label'])
validation_predictions = model.predict(validation_features)
validation_probabilities = model.predict_proba(validation_features)

# Report only the recordings whose prediction differs from the label
validation_results = zip(
    validation_averaged_data['label'],
    validation_description['locationDescription'],
    validation_description['people'],
    validation_predictions,
    validation_probabilities,
    validation_description['file'],
)
for label, description, people, prediction, probability, file in validation_results:
    if label == prediction:
        continue
    print(f'Label: {label}, Prediction: {prediction}, Description: {description}, People: {people},  Probability: {probability}, File: {file}')
   

Label: Indoor, Prediction: Outdoor, Description: Nähe Fenster, People: viel,  Probability: [0.33 0.67], File: Indoor_2024-03-29 13_41_40.csv


# Test validation files with bagging classifier

In [123]:
validation_files_directory = '../Daten/validation/'
validation_averaged_path = '../Daten/validation_averaged_data.csv'
model_path = '../Daten/bagging_classifier.joblib'

# Restore the persisted classifier
model = joblib.load(model_path)

validation_files = [name for name in os.listdir(validation_files_directory) if name.endswith('.csv')]

# For every recording keep its description (read from the first row) and its averaged feature row
validation_averaged_data = pd.DataFrame()
description_rows = []
for file in validation_files:
    df = pd.read_csv(validation_files_directory + file, sep=';')
    locationDescription = df['locationDescription'][0]
    locationPeople = df['people'][0]
    description_rows.append({'locationDescription': locationDescription, 'people': locationPeople, 'file': file})

    validation_averaged_data = preprocess_data(validation_averaged_data, validation_files_directory, file)

validation_description = pd.DataFrame(description_rows)

# Predicted label and class probabilities for every recording
validation_features = validation_averaged_data.drop(columns=['label'])
validation_predictions = model.predict(validation_features)
validation_probabilities = model.predict_proba(validation_features)

# Report only the recordings whose prediction differs from the label
validation_results = zip(
    validation_averaged_data['label'],
    validation_description['locationDescription'],
    validation_description['people'],
    validation_predictions,
    validation_probabilities,
    validation_description['file'],
)
for label, description, people, prediction, probability, file in validation_results:
    if label == prediction:
        continue
    print(f'Label: {label}, Prediction: {prediction}, Description: {description}, People: {people}, Probability: {probability}, File: {file}')
        

Label: Indoor, Prediction: Outdoor, Description: Nähe Fenster, People: viel, Probability: [0.38 0.62], File: Indoor_2024-03-29 13_41_40.csv


## Setup LSTM Model

In [124]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the LSTM model: three stacked LSTM layers and one dense layer, each
# followed by dropout, and a 2-unit softmax output.
# Every sample is fed as a sequence of length n_features with one value per step.
# NOTE: this rebinds `model`, which previously held the loaded classifier.
model = Sequential()

model.add(LSTM(128, input_shape=(X_train.shape[1], 1), return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128))
model.add(Dropout(0.2))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

# Convert the features to float and encode the labels as 0 (Indoor) / 1 (Outdoor).
# NOTE: X_train, Y_train, X_test and Y_test are overwritten in place, so this
# cell must not be re-run without re-creating the train/test split first.
X_train = X_train.astype(float)
Y_train = Y_train.map({'Indoor': 0, 'Outdoor': 1})
Y_train = Y_train.astype(float)

X_test = X_test.astype(float)
Y_test = Y_test.map({'Indoor': 0, 'Outdoor': 1})
# Fixed: this line used to cast Y_train a second time and left Y_test uncast.
Y_test = Y_test.astype(float)

# Reshape the feature tables to (samples, features, 1) as expected by the LSTM input
X_train = np.array(X_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Train on the averaged data, monitoring the test split after every epoch
model.fit(X_train, Y_train, epochs=10, validation_data=(X_test, Y_test), batch_size=32)

# Print the accuracy of the model on the test split
print(f'Accuracy: {model.evaluate(X_test, Y_test)[1]}')



Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_30 (LSTM)              (None, 23, 128)           66560     
                                                                 
 dropout_40 (Dropout)        (None, 23, 128)           0         
                                                                 
 lstm_31 (LSTM)              (None, 23, 128)           131584    
                                                                 
 dropout_41 (Dropout)        (None, 23, 128)           0         
                                                                 
 lstm_32 (LSTM)              (None, 128)               131584    
                                                                 
 dropout_42 (Dropout)        (None, 128)               0         
                                                                 
 dense_20 (Dense)            (None, 32)              

## Validate the LSTM Model

In [125]:
# Map the label column to 0 (Indoor) and 1 (Outdoor).
# The mapping is applied only while the column still holds the string labels:
# this cell overwrites validation_averaged_data, and mapping the already
# numeric labels again on a re-run would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(validation_averaged_data['label']):
    validation_averaged_data['label'] = validation_averaged_data['label'].map({'Indoor': 0, 'Outdoor': 1})

# Convert all columns, including the encoded label, to float
validation_averaged_data = validation_averaged_data.astype(float)

X_validation = validation_averaged_data.drop(columns=['label'])
Y_validation = validation_averaged_data['label']

# Reshape the features to (samples, features, 1), the input shape the LSTM was built with
X_validation = np.array(X_validation)
X_validation = np.reshape(X_validation, (X_validation.shape[0], X_validation.shape[1], 1))

Y_validation = Y_validation.astype(float)

# Predict the class probabilities of every validation recording
validation_predictions = model.predict(X_validation)

# Print the recordings whose most probable class differs from the label
for label, description, people, prediction in zip(validation_averaged_data['label'], validation_description['locationDescription'], validation_description['people'], validation_predictions):
    if label != np.argmax(prediction):
        print(f'Label: {label}, Probability: {prediction},  Description: {description}, People: {people}')

Label: 0.0, Probability: [0.1663937 0.8336063],  Description: Nähe Fenster, People: viel
