# TSFresh exploration

In [19]:
from tsfresh import select_features, extract_relevant_features, extract_features
from tsfresh.utilities.dataframe_functions import impute
from get_processed_data import processed_data_malte
import pandas as pd

In [None]:
# tsfresh expects a "flat" DataFrame: one id column identifying the time
# series, one sort (time) column, and one column per sensor channel.
# Try the conversion with a single climb first.

# First recorded climb stored under the key "3".
data = pd.DataFrame(processed_data_malte["3"][0])

# Drop PacketCounter so tsfresh does not extract features from it; the cell
# that prepares the full dataset drops the same column.
data = data.drop(columns="PacketCounter")

# A single climb is a single time series, so every row gets the same id.
data["id"] = 1

# Only the last expression of a cell is displayed automatically, so the
# preview has to be printed explicitly.
print(data.head())

# column_kind / column_value stay at their defaults (None): every column
# other than the id and sort columns is treated as its own time series.
extracted_features = extract_features(data, column_id="id", column_sort="SampleTimeFine")



## Prepare data for TSFresh
Convert data to a single pandas dataframe and generate a unique id for each climb.

Also generate a labels series with one entry per climb: 0 for an easy route and 1 for a hard route.

In [59]:
# Build one long DataFrame containing every climb in processed_data_malte,
# tagged with a unique id per climb, plus one label per id
# (0 = not hard, 1 = hard).
SKIPPED_GRADES = {"5-"}   # keys left out of the binary labelling
EASY_GRADES = {"3", "4"}  # keys labelled 0; every other kept key is labelled 1

all_climbs = []
climb_labels = []

for key, climbs in processed_data_malte.items():
    if key in SKIPPED_GRADES:
        continue

    label = 0 if key in EASY_GRADES else 1
    for climb in climbs:
        data = pd.DataFrame(climb).drop(columns="PacketCounter")
        # The id is the climb's position in all_climbs, so it lines up with
        # the default 0..n-1 index of the labels Series built below.
        data["id"] = len(all_climbs)
        all_climbs.append(data)
        climb_labels.append(label)

# Create the Series once with an explicit dtype instead of growing an empty,
# dtype-less Series element by element (whose resulting dtype depends on the
# pandas version).
labels = pd.Series(climb_labels, dtype="int64")  # 0 = not hard, 1 = hard

# Concatenate once and reuse the result for both the preview and later cells.
all_climbs_df = pd.concat(all_climbs)

print(labels)
print(all_climbs_df.head())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     1
9     1
10    1
11    1
12    1
dtype: int64
   SampleTimeFine    Quat_W    Quat_X    Quat_Y    Quat_Z  FreeAcc_X  \
0               0 -0.634064 -0.305900  0.667268 -0.243187   0.897042   
1           16667 -0.634288 -0.308343  0.665901 -0.243266   1.185700   
2           33334 -0.634115 -0.311293  0.664682 -0.243294   1.388625   
3           50001 -0.634039 -0.314479  0.663143 -0.243595   1.238289   
4           66668 -0.633950 -0.317556  0.661447 -0.244443   1.082093   

   FreeAcc_Y  FreeAcc_Z  id  
0   0.352503  -1.251197   0  
1  -0.084639  -0.808746   0  
2  -0.452435  -0.391868   0  
3  -0.543406   0.012652   0  
4  -0.170053   0.300346   0  


In [None]:
# Run tsfresh's feature extraction over all climbs at once. Rows are grouped
# by "id" and ordered by "SampleTimeFine" within each group.
tsfresh_columns = dict(
    column_id="id",
    column_sort="SampleTimeFine",
    column_kind=None,
    column_value=None,
)
extracted_features = extract_features(all_climbs_df, **tsfresh_columns)

# Keep a copy of the feature matrix on disk.
extracted_features.to_csv("extracted_features.csv")

extracted_features.head()

In [62]:
# Replace non-finite values so that select_features accepts the matrix.
# tsfresh's impute() does not drop columns: per column it replaces NaN with
# the median, -inf with the minimum and +inf with the maximum.
# impute() modifies its argument in place, so hand it a copy; this leaves
# extracted_features untouched and makes the cell safe to re-run.
extracted_features_without_nan = impute(extracted_features.copy())

# Keep only the features that tsfresh's significance tests judge relevant
# for predicting the labels.
filtered_features = select_features(extracted_features_without_nan, labels)
filtered_features

0
1
2
3
4
5
6
7
8
9
10


# Conclusion

Using the samples directly results in an empty feature array. This is likely because we have too few samples (one per climb) for the significance tests to establish the relevance of any feature. We need to generate more samples, e.g. by splitting each climb into windows.