# Data Preprocessing, Averaging over all csv-files and Saving the averaged data into new csv-file

In [317]:
import os
import pandas as pd
import json 
from collections import Counter

files_directory = '../Daten/firsttry/'

averaged_path = '../Daten/averaged_data.csv'


# extract the code into a function called preprocess_data so that we can use it later for new data
def preprocess_data(directory, averaged_path):
    if os.path.exists(averaged_path):
        os.remove(averaged_path)
        print(f"Removed {averaged_path}")

    # Get list of CSV files
    csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
    print(f'Found {len(csv_files)} CSV files')
    
    # Get nr of files, where name starts with indoor and outdoor
    indoor_files = [file for file in csv_files if file.startswith('Indoor')]
    outdoor_files = [file for file in csv_files if file.startswith('Outdoor')]
    print(f'Found {len(indoor_files)} indoor files and {len(outdoor_files)} outdoor files')
    
    averaged_data = pd.DataFrame()
    
    for file in csv_files:
        df = pd.read_csv(directory + file, sep=';')
        # Drop unnecessary columns like timeStampNetwork and timeStampGPS
        df = df.drop(columns=['timeStampNetwork', 'timeStampGPS', 'bAccuracyNetwork', 'speedAccuracyNetwork', 'cellType', 'networkLocationType'])
        # Remove first x rows and reset begin index to 0
        removedRows = 3
        df = df.iloc[removedRows:]
        df = df.reset_index(drop=True)
        
        # Load satellites json
        df['satellites'] = df['satellites'].apply(lambda x: json.loads(x))
        
        '''# Add columns of nr of satellite of each constellation
        df['GPS_counts'] = df['satellites'].apply(lambda x: Counter([sat['constellation'] for sat in x if sat['constellation'] == 'GPS']).get('GPS', 0))
        
        df['BEIDOU_counts'] = df['satellites'].apply(lambda x: Counter([sat['constellation'] for sat in x if sat['constellation'] == 'BEIDOU']).get('BEIDOU', 0))
        
        df['GALILEO_counts'] = df['satellites'].apply(lambda x: Counter([sat['constellation'] for sat in x if sat['constellation'] == 'GALILEO']).get('GALILEO', 0))
        
        df['GLONASS_counts'] = df['satellites'].apply(lambda x: Counter([sat['constellation'] for sat in x if sat['constellation'] == 'GLONASS']).get('GLONASS', 0))'''
        
        # Add cn0 column for easier computation of statistics
        df['satellite_cn0'] = df['satellites'].apply(lambda x: [sat['cn0'] for sat in x])
        
        # Calculate median, mode, variance, standard deviation and range of the satellite cn0
        df['satellite_cn0_median'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).median() if not pd.Series(x).empty else 0)
        df['satellite_cn0_mode'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).mode()[0] if not pd.Series(x).mode().empty else 0)
        df['satellite_cn0_std'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).std() if not pd.Series(x).empty else 0)
        df['satellite_cn0_range'] = df['satellite_cn0'].apply(lambda x: pd.Series(x).max() - pd.Series(x).min() if not pd.Series(x).empty else 0)
        
        # load the bluetooth json and load rssi into a new column
        df['bluetoothDevices'] = df['bluetoothDevices'].apply(lambda x: json.loads(x))
        df['bluetooth_rssi'] = df['bluetoothDevices'].apply(lambda x: [device['rssi'] for device in x])
        
         # Calculate statistical figures for the bluetooth devices
        df['bluetooth_rssi_median'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).median() if not pd.Series(x).empty else 0)
        df['bluetooth_rssi_mode'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).mode()[0] if not pd.Series(x).mode().empty else 0)
        df['bluetooth_rssi_std'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).std() if not pd.Series(x).empty else 0)
        df['bluetooth_rssi_range'] = df['bluetooth_rssi'].apply(lambda x: pd.Series(x).max() - pd.Series(x).min() if not pd.Series(x).empty else 0)
        
        # load the wifi json and load rssi into a new column
        df['wifiDevices'] = df['wifiDevices'].apply(lambda x: json.loads(x))
        df['wifi_rssi'] = df['wifiDevices'].apply(lambda x: [device['level'] for device in x])
        
         # Calculate statistical figures for the wifi devices
        df['wifi_rssi_median'] = df['wifi_rssi'].apply(lambda x: pd.Series(x).median() if not pd.Series(x).empty else 0)
        df['wifi_rssi_mode'] = df['wifi_rssi'].apply(lambda x: pd.Series(x).mode()[0] if not pd.Series(x).mode().empty else 0)
        df['wifi_rssi_std'] = df['wifi_rssi'].apply(lambda x: pd.Series(x).std() if not pd.Series(x).empty else 0)
        df['wifi_rssi_range'] = df['wifi_rssi'].apply(lambda x: pd.Series(x).max() - pd.Series(x).min() if not pd.Series(x).empty else 0)
        
        # Drop list columns
        df.drop(columns=['satellites', 'bluetoothDevices', 'wifiDevices', 'satellite_cn0', 'bluetooth_rssi', 'wifi_rssi'], inplace=True)
        
         # Average over all columns for numeric values and take the first of non-numeric to have a single row
        label = df.iloc[0]['label']
        
        df.drop(columns=['label'], inplace=True)
        
        df = df.mean().to_frame().T
        
        df['label'] = label
        
        #place label at the beginning
        cols = list(df.columns)
        cols = [cols[-1]] + cols[:-1]
        df = df[cols]
        
        
        # if the nr of satellites is 0, then replace hAccurracyGPS, vAccuracyGPS, speedAccuracyGPS, bAccuracyGPS with NaN
        df.loc[df['nrSatellitesInFix'] == 0, ['hAccuracyGPS', 'vAccuracyGPS', 'speedAccuracyGPS', 'bAccuracyGPS']] = 0
        
        averaged_data = pd.concat([averaged_data, df], ignore_index=True)
        
        
    averaged_data.sort_values(by=['label'], inplace=True)
    averaged_data.to_csv(averaged_path, index=False)
    print(f'Saved averaged data to {averaged_path}')
    return averaged_data

averaged_data = preprocess_data(files_directory, averaged_path)
averaged_data.head()


Removed ../Daten/averaged_data.csv
Found 55 CSV files
Found 29 indoor files and 26 outdoor files
Saved averaged data to ../Daten/averaged_data.csv


Unnamed: 0,label,cellStrength,hAccuracyNetwork,vAccuracyNetwork,hAccuracyGPS,vAccuracyGPS,bAccuracyGPS,speedAccuracyGPS,nrSatellitesInView,nrSatellitesInFix,...,satellite_cn0_std,satellite_cn0_range,bluetooth_rssi_median,bluetooth_rssi_mode,bluetooth_rssi_std,bluetooth_rssi_range,wifi_rssi_median,wifi_rssi_mode,wifi_rssi_std,wifi_rssi_range
27,Indoor,2.965517,14.174379,1.0,0.0,0.0,0.0,0.0,37.0,0.0,...,0.0,0.0,-92.965517,-98.37931,12.799422,51.0,-72.689655,-73.62069,9.1061,25.068966
32,Indoor,1.578947,44.897211,8.357078,0.0,0.0,0.0,0.0,36.0,0.0,...,0.0,0.0,-93.052632,-67.157895,26.664825,63.578947,-78.5,-89.105263,11.209734,27.789474
31,Indoor,1.071429,16.903071,1.235335,1.9,9.9,178.3,0.9,28.571429,25.0,...,0.0,0.0,-91.0,-96.0,11.244463,52.0,-67.0,-67.0,7.14065,17.0
30,Indoor,2.708333,19.186583,2.056563,0.0,0.0,0.0,0.0,36.208333,0.0,...,0.0,0.0,-81.75,-90.75,20.431087,47.666667,-78.479167,-89.5,10.401494,26.833333
37,Indoor,2.0,13.483828,1.246068,0.0,0.0,0.0,0.0,32.37931,0.0,...,0.0,0.0,-91.775862,-95.827586,10.434339,49.137931,-74.0,-86.0,19.372172,48.0


# Train the Random Forest Classifier

## Randomize and Split the Data for Random Forest Classifier

In [318]:
from sklearn.model_selection import train_test_split

# Randomize the data
averaged_data = averaged_data.sample(frac=1, random_state=42).reset_index(drop=True)

# drop the label column
X = averaged_data.drop(columns=['label'], axis=1)
Y = averaged_data['label']

# Split the data into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

## Train a Random Forest Classifier

In [319]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train the Random Forest Classifier with regularization
clf = RandomForestClassifier(random_state=42, max_depth=10, min_samples_leaf=5)

# Print the nr of features before selection
print(f'Nr of features before selection: {X_train.shape[1]}')

# Feature selection
selector = SelectFromModel(clf,  threshold=0.01).fit(X_train, Y_train)

# Transform the data to create a new dataset containing only the most important features
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

# Print the nr of features after selection
print(f'Nr of features after selection: {X_train.shape[1]}')


# Train the classifier
clf.fit(X_train, Y_train)

# Cross-validation
scores = cross_val_score(clf, X_train, Y_train, cv=5)
print(f"Cross-validation scores: {scores}")

# Predict the labels of the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy}')

Nr of features before selection: 32
Nr of features after selection: 16
                 feature  importance
12    satellite_cn0_mode    0.170669
13     satellite_cn0_std    0.150000
5      nrSatellitesInFix    0.128789
11  satellite_cn0_median    0.125257
8              maxCn0GPS    0.120973
7             meanCn0GPS    0.119420
14   satellite_cn0_range    0.100566
6              minCn0GPS    0.029346
2           bAccuracyGPS    0.021123
3       speedAccuracyGPS    0.014743
4     nrSatellitesInView    0.010000
9             minCn0Wifi    0.008762
1           vAccuracyGPS    0.000185
0           hAccuracyGPS    0.000167
10            maxCn0Wifi    0.000000
15         wifi_rssi_std    0.000000
Cross-validation scores: [0.875 1.    1.    1.    1.   ]
Accuracy: 0.9411764705882353


## Visualizing Decision Trees

In [320]:
#Visualizing the all decision trees
'''from sklearn.tree import export_graphviz
import pydot

# Export as dot and png file
for i, tree in enumerate(clf.estimators_):
    export_graphviz(tree, out_file=f'tree{i}.dot', feature_names = X.columns, rounded = True, precision = 1)
    (graph,) = pydot.graph_from_dot_file(f'tree{i}.dot')
    graph.write_png(f'tree{i}.png')
    
#combine all trees into the decision forest
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

images = [Image.open(f'tree{i}.png') for i in range(10)]
widths, heights = zip(*(i.size for i in images))

total_width = sum(widths)
max_height = max(heights)

new_im = Image.new('RGB', (total_width, max_height))

x_offset = 0
for im in images:
  new_im.paste(im, (x_offset,0))
  x_offset += im.size[0]
  
new_im.save('decision_forest.png')
plt.imshow(np.asarray(new_im))
plt.show()'''

"from sklearn.tree import export_graphviz\nimport pydot\n\n# Export as dot and png file\nfor i, tree in enumerate(clf.estimators_):\n    export_graphviz(tree, out_file=f'tree{i}.dot', feature_names = X.columns, rounded = True, precision = 1)\n    (graph,) = pydot.graph_from_dot_file(f'tree{i}.dot')\n    graph.write_png(f'tree{i}.png')\n    \n#combine all trees into the decision forest\nfrom PIL import Image\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nimages = [Image.open(f'tree{i}.png') for i in range(10)]\nwidths, heights = zip(*(i.size for i in images))\n\ntotal_width = sum(widths)\nmax_height = max(heights)\n\nnew_im = Image.new('RGB', (total_width, max_height))\n\nx_offset = 0\nfor im in images:\n  new_im.paste(im, (x_offset,0))\n  x_offset += im.size[0]\n  \nnew_im.save('decision_forest.png')\nplt.imshow(np.asarray(new_im))\nplt.show()"

## Get the Feature Importances

In [321]:
# Get the names of the most important features and print their according importance
selected_features = X.columns[selector.get_support()]

# Print feature importances of the selected features
feature_importances = clf.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_features, 'importance': feature_importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
print(feature_importances)

                 feature  importance
12    satellite_cn0_mode    0.170669
13     satellite_cn0_std    0.150000
5      nrSatellitesInFix    0.128789
11  satellite_cn0_median    0.125257
8              maxCn0GPS    0.120973
7             meanCn0GPS    0.119420
14   satellite_cn0_range    0.100566
6              minCn0GPS    0.029346
2           bAccuracyGPS    0.021123
3       speedAccuracyGPS    0.014743
4     nrSatellitesInView    0.010000
9             minCn0Wifi    0.008762
1           vAccuracyGPS    0.000185
0           hAccuracyGPS    0.000167
10            maxCn0Wifi    0.000000
15         wifi_rssi_std    0.000000


## Save the model and tryout the model with new data

In [322]:
import joblib

# Save the model
model_path = '../Daten/random_forest_classifier.joblib'
joblib.dump(clf, model_path)
print(f'Saved model to {model_path}')

Saved model to ../Daten/random_forest_classifier.joblib


# Test the model with new data in validation folder in this notebook

In [323]:
validation_files_directory = '../Daten/validation/'

validation_averaged_path = '../Daten/validation_averaged_data.csv'

model_path = '../Daten/random_forest_classifier.joblib'

# Load the model
model = joblib.load(model_path)

validation_averaged_data = preprocess_data(validation_files_directory, validation_averaged_path)

# Predict the labels of the validation data
validation_predictions = model.predict(validation_averaged_data.drop(columns=['label']))

#print label and the corresponding prediction
for label, prediction in zip(validation_averaged_data['label'], validation_predictions):
    print(f'Label: {label}, Prediction: {prediction}')

Removed ../Daten/validation_averaged_data.csv
Found 4 CSV files
Found 2 indoor files and 2 outdoor files
Saved averaged data to ../Daten/validation_averaged_data.csv




ValueError: X has 32 features, but RandomForestClassifier is expecting 16 features as input.