# Data Preprocessing: Averaging each CSV file and saving the averaged data to a new CSV file

In [253]:
import os
import pandas as pd
import json 
from collections import Counter

# Directory that is scanned for the input CSV files.
files_directory = '../Daten/firsttry/'

# Path of the CSV file that receives the averaged data.
averaged_path = '../Daten/averaged_data.csv'


# The preprocessing lives in a function so that it can be reused later for new data.
def _summarize_readings(readings):
    """Return (median, mode, std, range) for one list of signal readings.

    An empty list yields 0 for every statistic. For a single reading pandas
    returns NaN as the (sample) standard deviation.
    """
    if len(readings) == 0:
        return 0, 0, 0, 0
    series = pd.Series(readings)
    modes = series.mode()
    # mode() ignores missing values, so it can be empty for a non-empty list; fall back to 0.
    most_common = modes[0] if not modes.empty else 0
    return series.median(), most_common, series.std(), series.max() - series.min()


def _add_reading_stats(df, readings, prefix, stats):
    """Append one '<prefix>_<stat>' column to df per requested statistic of the per-row reading lists."""
    summary = pd.DataFrame(
        [_summarize_readings(row) for row in readings],
        columns=['median', 'mode', 'std', 'range'],
        index=df.index,
    )
    for stat in stats:
        df[f'{prefix}_{stat}'] = summary[stat]


def preprocess_data(directory, averaged_path, skip_rows=3):
    """Average every CSV file in a directory into one row and save the combined table.

    Each file is read with ';' as separator, the unused network columns are
    dropped, the first ``skip_rows`` rows are discarded, and summary statistics
    of the satellite cn0, Bluetooth rssi and WiFi level lists are added. All
    numeric columns are then averaged so that each file contributes one row,
    labelled with the 'label' value of its first remaining row.

    Parameters
    ----------
    directory : str
        Directory containing the CSV files (with or without a trailing separator).
    averaged_path : str
        Path of the CSV file to write. An existing file is removed first.
    skip_rows : int, default 3
        Number of leading rows to discard from every file.

    Returns
    -------
    pandas.DataFrame
        One row per processed file, sorted by 'label', with 'label' as first column.

    Raises
    ------
    ValueError
        If the directory yields no file with rows left after trimming.
    """
    if os.path.exists(averaged_path):
        os.remove(averaged_path)
        print(f"Removed {averaged_path}")

    # Sort the names so that the row order does not depend on the file system's listing order.
    csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))
    print(f'Found {len(csv_files)} CSV files')

    # Count the files whose name starts with Indoor / Outdoor
    indoor_count = sum(file.startswith('Indoor') for file in csv_files)
    outdoor_count = sum(file.startswith('Outdoor') for file in csv_files)
    print(f'Found {indoor_count} indoor files and {outdoor_count} outdoor files')

    unused_columns = ['timeStampNetwork', 'timeStampGPS', 'bAccuracyNetwork',
                      'speedAccuracyNetwork', 'cellType', 'networkLocationType']
    list_columns = ['satellites', 'bluetoothDevices', 'wifiDevices']
    gps_accuracy_columns = ['hAccuracyGPS', 'vAccuracyGPS', 'speedAccuracyGPS', 'bAccuracyGPS']

    file_rows = []
    for file in csv_files:
        # os.path.join works whether or not `directory` ends with a path separator.
        df = pd.read_csv(os.path.join(directory, file), sep=';')
        df = df.drop(columns=unused_columns)
        # Remove the first rows and restart the index at 0
        df = df.iloc[skip_rows:].reset_index(drop=True)
        if df.empty:
            # Nothing left to average; skip instead of failing on the label lookup below.
            print(f'Skipped {file}: no rows left after removing the first {skip_rows} rows')
            continue

        # Parse the JSON columns and pull out the signal strength of every entry
        satellite_cn0 = [[sat['cn0'] for sat in json.loads(raw)] for raw in df['satellites']]
        bluetooth_rssi = [[device['rssi'] for device in json.loads(raw)] for raw in df['bluetoothDevices']]
        wifi_rssi = [[device['level'] for device in json.loads(raw)] for raw in df['wifiDevices']]

        # The column order below defines the feature order of the output and must stay stable.
        _add_reading_stats(df, satellite_cn0, 'satellite_cn0', ['median', 'mode', 'std', 'range'])
        _add_reading_stats(df, bluetooth_rssi, 'bluetooth_rssi', ['median', 'mode', 'std', 'range'])
        # No std column is produced for WiFi, so the feature set stays unchanged.
        _add_reading_stats(df, wifi_rssi, 'wifi_rssi', ['median', 'mode', 'range'])

        # Collapse the file into a single row: mean of the numeric columns plus the first label
        label = df['label'].iloc[0]
        features = df.drop(columns=list_columns + ['label'])
        averaged = features.mean(numeric_only=True).to_frame().T
        averaged.insert(0, 'label', label)

        # If the averaged number of satellites in the fix is 0, set the GPS accuracy columns to 0
        averaged.loc[averaged['nrSatellitesInFix'] == 0, gps_accuracy_columns] = 0

        file_rows.append(averaged)

    if not file_rows:
        raise ValueError(f'No usable CSV data found in {directory}')

    # Concatenate once instead of growing the frame inside the loop.
    averaged_data = pd.concat(file_rows, ignore_index=True)
    averaged_data = averaged_data.sort_values(by=['label'], kind='stable')
    averaged_data.to_csv(averaged_path, index=False)
    print(f'Saved averaged data to {averaged_path}')
    return averaged_data

# Build the averaged table (one row per CSV file) from files_directory and write it to averaged_path.
averaged_data = preprocess_data(files_directory, averaged_path)


Removed ../Daten/averaged_data.csv
Found 156 CSV files
Found 85 indoor files and 71 outdoor files
Saved averaged data to ../Daten/averaged_data.csv


# Train the Random Forest Classifier

## Randomize and Split the Data for Random Forest Classifier

In [254]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the shuffle and the split are reproducible after a kernel restart
RANDOM_STATE = 42

# Randomize the row order
averaged_data = averaged_data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Separate the features from the label column
X = averaged_data.drop(columns=['label'])
Y = averaged_data['label']

# Split the data into training and testing data
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training - confirm this is intended
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.8, random_state=RANDOM_STATE)

print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')

Training data shape: (31, 31)
Testing data shape: (125, 31)


## Train a Random Forest Classifier

In [255]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random Forest Classifier with scikit-learn's default hyperparameters (no extra
# regularization is configured). The fixed seed makes the scores below reproducible.
clf = RandomForestClassifier(random_state=42)

# Cross-validation on the training split; cross_val_score fits clones of clf,
# so it is independent of the final fit below
scores = cross_val_score(clf, X_train, Y_train, cv=5)
print(f"Cross-validation scores: {scores}")

# Train the final classifier on the whole training split
clf.fit(X_train, Y_train)

# Predict the labels of the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy}')

Cross-validation scores: [1.         0.83333333 1.         0.83333333 1.        ]
Accuracy: 0.968


## Get the Feature Importances

In [256]:
# Pair every feature name with the importance reported by the fitted classifier,
# most important feature first
feature_importances = (
    pd.DataFrame({'feature': X.columns, 'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances

Unnamed: 0,feature,importance
11,maxCn0GPS,0.158963
23,satellite_cn0_range,0.126607
10,meanCn0GPS,0.094723
21,satellite_cn0_mode,0.093422
20,satellite_cn0_median,0.090168
8,nrSatellitesInFix,0.086108
9,minCn0GPS,0.078598
0,cellStrength,0.053482
1,hAccuracyNetwork,0.030734
22,satellite_cn0_std,0.029231


## Save the model and try it out with new data

In [257]:
import joblib

# Save the fitted classifier to disk with joblib so it can be loaded again later
model_path = '../Daten/random_forest_classifier.joblib'
joblib.dump(clf, model_path)
print(f'Saved model to {model_path}')

Saved model to ../Daten/random_forest_classifier.joblib


# Test the model in this notebook with new data from the validation folder

In [258]:
validation_files_directory = '../Daten/validation/'

validation_averaged_path = '../Daten/validation_averaged_data.csv'

model_path = '../Daten/random_forest_classifier.joblib'

# Load the model
# NOTE: joblib files are pickle-based; only load model files from a trusted source.
model = joblib.load(model_path)

validation_averaged_data = preprocess_data(validation_files_directory, validation_averaged_path)

# Separate the features once and reuse them for both predictions
validation_features = validation_averaged_data.drop(columns=['label'])

# Predict the labels of the validation data and the per-class probabilities
validation_predictions = model.predict(validation_features)
validation_probabilities = model.predict_proba(validation_features)

# The columns of predict_proba follow model.classes_, so look the column up
# instead of assuming that 'Indoor' is column 0 and 'Outdoor' is column 1
class_column = {class_label: column for column, class_label in enumerate(model.classes_)}

# Print each label with its prediction and the probability of the predicted class
for label, prediction, probability in zip(validation_averaged_data['label'], validation_predictions, validation_probabilities):
    print(f'Label: {label}, Prediction: {prediction}, Probability: {probability[class_column[prediction]]}')
        

Removed ../Daten/validation_averaged_data.csv
Found 11 CSV files
Found 4 indoor files and 7 outdoor files
Saved averaged data to ../Daten/validation_averaged_data.csv
Label: Indoor, Prediction: Indoor, Probability: 0.84
Label: Indoor, Prediction: Indoor, Probability: 0.98
Label: Indoor, Prediction: Indoor, Probability: 0.99
Label: Indoor, Prediction: Indoor, Probability: 0.98
Label: Outdoor, Prediction: Outdoor, Probability: 0.99
Label: Outdoor, Prediction: Outdoor, Probability: 0.89
Label: Outdoor, Prediction: Outdoor, Probability: 0.71
Label: Outdoor, Prediction: Outdoor, Probability: 1.0
Label: Outdoor, Prediction: Outdoor, Probability: 0.82
Label: Outdoor, Prediction: Outdoor, Probability: 1.0
Label: Outdoor, Prediction: Outdoor, Probability: 0.85
