# # Data Preprocessing

In [584]:
import os
import pandas as pd

# Input directory holding the raw recordings and the path of the aggregated output file.
directory = '../Daten/firsttry/'
averaged_path = '../Daten/averaged_data.csv'

# Start from a clean slate: delete the aggregated file left by a previous run, if any.
if os.path.exists(averaged_path):
    os.remove(averaged_path)
    print(f"Removed {averaged_path}")

# Every entry of the input directory whose name ends in '.csv' (order as returned by os.listdir).
csv_files = [name for name in os.listdir(directory) if name.endswith('.csv')]
print(csv_files)

# Result table; it starts empty and receives rows via pd.concat in a later cell.
averaged_data = pd.DataFrame()

# Read each semicolon-separated recording.
# NOTE(review): `df` is rebound on every iteration, so once this loop finishes only the
# frame of the last file in `csv_files` is left in `df`. The processing steps in the
# following cells presumably need to run once per file -- confirm and move them into
# the loop body (or a function called from it).
for file in csv_files:
    print(f"Processing {file}...")
    df = pd.read_csv(os.path.join(directory, file), sep=';')

Removed ../Daten/averaged_data.csv
['Outdoor_2024-03-14 21_02_59.csv', 'Indoor_2024-03-11 14_15_41.csv', 'Indoor_2024-03-14 21_14_45.csv', 'Indoor_2024-03-14 21_12_33.csv', 'Outdoor_2024-03-14 20_45_50.csv', 'Outdoor_2024-03-14 20_44_10.csv', 'Indoor_2024-03-14 21_19_23.csv', 'Outdoor_2024-03-14 21_05_28.csv', 'Outdoor_2024-03-14 21_11_04.csv', 'Outdoor_2024-03-14 21_08_18.csv', 'Indoor_2024-03-14 16_21_11.csv']
Processing Outdoor_2024-03-14 21_02_59.csv...
Processing Indoor_2024-03-11 14_15_41.csv...
Processing Indoor_2024-03-14 21_14_45.csv...
Processing Indoor_2024-03-14 21_12_33.csv...
Processing Outdoor_2024-03-14 20_45_50.csv...
Processing Outdoor_2024-03-14 20_44_10.csv...
Processing Indoor_2024-03-14 21_19_23.csv...
Processing Outdoor_2024-03-14 21_05_28.csv...
Processing Outdoor_2024-03-14 21_11_04.csv...
Processing Outdoor_2024-03-14 21_08_18.csv...
Processing Indoor_2024-03-14 16_21_11.csv...


## Drop unnecessary columns

In [585]:
    # Discard the two timestamp columns (timeStampNetwork, timeStampGPS).
    df = df.drop(['timeStampNetwork', 'timeStampGPS'], axis='columns')

## Remove first x rows

In [586]:
    # Skip the leading `removedRows` rows of the recording and renumber the
    # remaining rows so that the index starts at 0 again.
    removedRows = 3
    df = df.iloc[removedRows:].reset_index(drop=True)

## Load satellites JSON

In [587]:
    import json
    from collections import Counter
    
    # Each cell of the 'satellites' column holds JSON text; parse it into Python objects.
    df['satellites'] = df['satellites'].apply(json.loads)

## Add columns with the number of satellites per constellation

In [588]:
    # One count column per constellation: for every row, the number of entries in
    # its satellite list whose 'constellation' field equals the given name.
    constellation_columns = {
        'GPS_counts': 'GPS',
        'BEIDOU_counts': 'BEIDOU',
        'GALILEO_counts': 'GALILEO',
        'GLONASS_counts': 'GLONASS',
    }
    for count_column, constellation_name in constellation_columns.items():
        df[count_column] = df['satellites'].apply(
            lambda sats: sum(1 for sat in sats if sat['constellation'] == constellation_name)
        )

## Add cn0 column for easier computation of statistics

In [589]:
    # Collect the 'cn0' value of every satellite entry into one plain list per row.
    df['satellite_cn0'] = df['satellites'].apply(
        lambda sats: [entry['cn0'] for entry in sats]
    )

## Calculate median, mode, standard deviation and range of the satellite cn0

In [590]:
    # Per-row summary statistics of the satellite cn0 lists. Each list is wrapped in a
    # pandas Series so that median, mode, standard deviation and range can be computed.
    # (No variance column is produced.)
    cn0_statistics = {
        'satellite_cn0_median': lambda series: series.median(),
        # mode() can return several values; keep the first one, or None when it is empty.
        'satellite_cn0_mode': lambda series: series.mode()[0] if not series.mode().empty else None,
        'satellite_cn0_std': lambda series: series.std(),
        'satellite_cn0_range': lambda series: series.max() - series.min(),
    }
    for stat_column, statistic in cn0_statistics.items():
        df[stat_column] = df['satellite_cn0'].apply(lambda values: statistic(pd.Series(values)))

## Load the Bluetooth JSON and extract the RSSI values into a new column

In [591]:
    # Parse the JSON text of each row, then gather the 'rssi' value of every
    # device entry into one list per row.
    df['bluetoothDevices'] = df['bluetoothDevices'].apply(json.loads)
    df['bluetooth_rssi'] = df['bluetoothDevices'].apply(
        lambda devices: [entry['rssi'] for entry in devices]
    )

## Calculate statistical figures for the bluetooth devices

In [592]:
    # Per-row summary statistics of the bluetooth rssi lists. Each list is wrapped in a
    # pandas Series so that median, mode, standard deviation and range can be computed.
    bluetooth_statistics = {
        'bluetooth_rssi_median': lambda series: series.median(),
        # mode() can return several values; keep the first one, or None when it is empty.
        'bluetooth_rssi_mode': lambda series: series.mode()[0] if not series.mode().empty else None,
        'bluetooth_rssi_std': lambda series: series.std(),
        'bluetooth_rssi_range': lambda series: series.max() - series.min(),
    }
    for stat_column, statistic in bluetooth_statistics.items():
        df[stat_column] = df['bluetooth_rssi'].apply(lambda values: statistic(pd.Series(values)))

## Load the WiFi JSON and extract the signal levels (RSSI) into a new column

In [593]:
    # Parse the JSON text of each row, then gather the 'level' value of every
    # device entry into one list per row.
    df['wifiDevices'] = df['wifiDevices'].apply(json.loads)
    df['wifi_rssi'] = df['wifiDevices'].apply(
        lambda devices: [entry['level'] for entry in devices]
    )

## Calculate statistical figures for the wifi devices

In [594]:
    # Per-row summary statistics of the wifi rssi lists. Each list is wrapped in a
    # pandas Series so that median, mode, standard deviation and range can be computed.
    wifi_statistics = {
        'wifi_rssi_median': lambda series: series.median(),
        # mode() can return several values; keep the first one, or None when it is empty.
        'wifi_rssi_mode': lambda series: series.mode()[0] if not series.mode().empty else None,
        'wifi_rssi_std': lambda series: series.std(),
        'wifi_rssi_range': lambda series: series.max() - series.min(),
    }
    for stat_column, statistic in wifi_statistics.items():
        df[stat_column] = df['wifi_rssi'].apply(lambda values: statistic(pd.Series(values)))

## Drop list columns

In [595]:
    # Remove the parsed JSON columns and the intermediate value lists; only the
    # remaining columns are carried forward.
    list_columns = [
        'satellites', 'bluetoothDevices', 'wifiDevices',
        'satellite_cn0', 'bluetooth_rssi', 'wifi_rssi',
    ]
    df = df.drop(columns=list_columns)

## Average over all columns to have a single row

In [596]:
    # Collapse the recording into a single row: 'label', 'cellType' and
    # 'networkLocationType' take the value found in the first row, every other
    # column is replaced by its mean.
    first_row = df.iloc[0]
    label = first_row['label']
    cellType = first_row['cellType']
    networkLocationType = first_row['networkLocationType']

    # mean() returns a Series indexed by column name; to_frame().T turns it into a one-row frame.
    df = df.drop(columns=['label', 'cellType', 'networkLocationType']).mean().to_frame().T

    # 'label' becomes the first column; the other two carried-over columns go to the end.
    df.insert(0, 'label', label)
    df['cellType'] = cellType
    df['networkLocationType'] = networkLocationType

## Append to averaged_data

In [597]:
    # Add the one-row summary of the current recording to the result table.
    # NOTE(review): the recorded output of this cell reports a single row although eleven
    # files were listed by the first cell -- it looks like this step ran only once, for
    # the last file read. Confirm that the per-file steps execute inside the loop.
    averaged_data = pd.concat((averaged_data, df), ignore_index=True)
    print(f"After processing {file}, averaged_data has {averaged_data.shape[0]} rows.")


After processing Indoor_2024-03-14 16_21_11.csv, averaged_data has 1 rows.


## Save to CSV

In [598]:
# Write the aggregated table to disk without the index column, then show it as the cell output.
averaged_data.to_csv(path_or_buf=averaged_path, index=False)
averaged_data

Unnamed: 0,label,cellStrength,hAccuracyNetwork,vAccuracyNetwork,bAccuracyNetwork,speedAccuracyNetwork,hAccuracyGPS,vAccuracyGPS,bAccuracyGPS,speedAccuracyGPS,...,bluetooth_rssi_median,bluetooth_rssi_mode,bluetooth_rssi_std,bluetooth_rssi_range,wifi_rssi_median,wifi_rssi_mode,wifi_rssi_std,wifi_rssi_range,cellType,networkLocationType
0,Indoor,4.0,22.296462,8.442781,0.0,0.0,5.434615,29.484615,146.761538,3.038462,...,-80.0,-80.0,18.115287,51.0,-85.076923,-88.0,8.166664,18.384615,LTE,wifi


## Visualization

In [599]:
# test visualisation
import matplotlib.pyplot as plt

