In [25]:
import os
import pandas as pd
import glob
import json
data_path = "../data/interim/v4"
dataset_target_path = "../data/processed"
test_cases = [file.split("-")[2] for file in glob.glob(f"{data_path}/*stage_5*.csv")]
# List test cases in dataset
for i,test_case in enumerate(test_cases):
    print(i,test_case)

0 milenkosKitchenNotNearDevices1
1 milenkosRoomNearDevices1
2 milenkosRoomNearDevices


In [26]:
# read all data in dataset
data_off = {test_case: pd.read_csv(glob.glob(f"{data_path}/*stage_5*{test_case}*")[0]) for test_case in test_cases}
data_on = {test_case: pd.read_csv(glob.glob(f"{data_path}/*stage_6*{test_case}*")[0]) for test_case in test_cases}

In [27]:
def add_intensity(df: pd.DataFrame):
    """
    add Intensity as 2nd norm of 3D vector
    """
    df["Intensity"] = (df["X_UnCal"] ** 2 + df["Y_UnCal"] ** 2 + df["Z_UnCal"] ** 2) ** 0.5

for key in data_off.keys():
    add_intensity(data_off[key])
for key in data_on.keys():
    add_intensity(data_on[key])

In [28]:
def statistical_features_flat(df: pd.DataFrame, test_case_name: str, test_case_class: int) -> dict:
    

    d = dict() 

    # add test_case_name and label
    d['name'] = test_case_name
    d['label'] = test_case_class

    statistics = df[['X_UnCal', 'Y_UnCal', 'Z_UnCal', 'Intensity']].describe()
    for col_name in statistics.columns:
        for row_name, value in statistics.iterrows():

            # add one feature "count"
            if col_name == "X_UnCal" and row_name == "count":
                d["count"] = value[col_name]

            # exclude other counts, since they are the same
            if row_name != "count":
                d[f"{col_name}_{row_name}"] = value[col_name]
            

    # add features that do not change
    for feature in ['X_Bias', 'Y_Bias', 'Z_Bias', 'Accuracy']:
        d[feature] = df[feature][0]

    
    return d

In [29]:
# all_on is class 6
df_on = pd.DataFrame([statistical_features_flat(df=data, test_case_name=name, test_case_class=6)for name, data in data_on.items()])
# all_of is class 5
df_off = pd.DataFrame([statistical_features_flat(df=data, test_case_name=name, test_case_class=5)for name, data in data_off.items()])
df = df_on.append(df_off, ignore_index=True)
df

  df = df_on.append(df_off, ignore_index=True)


Unnamed: 0,name,label,count,X_UnCal_mean,X_UnCal_std,X_UnCal_min,X_UnCal_25%,X_UnCal_50%,X_UnCal_75%,X_UnCal_max,...,Intensity_std,Intensity_min,Intensity_25%,Intensity_50%,Intensity_75%,Intensity_max,X_Bias,Y_Bias,Z_Bias,Accuracy
0,milenkosKitchenNotNearDevices1,6,5998.0,4.095771,1.341844,-4.9044,3.2696,4.0504,4.9288,8.6986,...,0.818838,78.755509,82.04004,82.530658,83.006781,88.566211,0.0,0.0,0.0,1
1,milenkosRoomNearDevices1,6,5998.0,11.445347,1.417421,-3.294,10.6384,11.407,12.285399,16.2016,...,0.692225,92.093707,94.704681,95.08979,95.492666,100.326031,0.0,0.0,0.0,1
2,milenkosRoomNearDevices,6,5998.0,11.430729,1.480767,1.891,10.5774,11.467999,12.3952,16.2016,...,0.735164,92.324587,94.626308,95.084685,95.517978,101.39755,0.0,0.0,0.0,1
3,milenkosKitchenNotNearDevices1,5,5998.0,2.534175,1.23101,-9.735599,1.77205,2.44,3.2452,6.5758,...,0.619732,80.786646,82.335112,82.72954,83.119809,87.482094,0.0,0.0,0.0,1
4,milenkosRoomNearDevices1,5,5998.0,9.831416,1.384276,-7.1736,9.076799,9.7478,10.5774,13.6396,...,0.689222,92.894185,94.6729,95.029032,95.407577,102.886507,0.0,0.0,0.0,1
5,milenkosRoomNearDevices,5,5998.0,9.831416,1.384276,-7.1736,9.076799,9.7478,10.5774,13.6396,...,0.689222,92.894185,94.6729,95.029032,95.407577,102.886507,0.0,0.0,0.0,1


In [30]:
# save dataset
if not os.path.isdir(dataset_target_path):
    os.makedirs(dataset_target_path)

import time
# metadata
metadata = {
    "title": "Statistical dataset",
    "version": "v3",
    "description": "Dataset from v3 data, single device, without traffic, statistical features",
    "author": "Mihael",
    "places": [
        "room",
        "kitchen"
    ],
    "stages": 2,
    "traffic": False,
    "format": "processed",
    "created": int(time.time())
}
file_name = f"{metadata['version']}-{metadata['stages']}_stages-statistical"
df.to_csv(f"{dataset_target_path}/{file_name}.csv",index=True)
# Writing metadata
with open(f"{dataset_target_path}/{file_name}.json", "w") as outfile:
    outfile.write(json.dumps(metadata, indent=4))
