In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features
from tsfresh.feature_selection.relevance import calculate_relevance_table
from sklearn.model_selection import train_test_split

# Apply seaborn's "darkgrid" style globally for any plots drawn later in the notebook.
sns.set(style="darkgrid")

# Menentukan direktori kerja dan file input awal

In [2]:
# All paths are resolved relative to the directory the kernel was started in.
cwd = os.getcwd()
# 'input' holds the source CSVs; 'output' receives the CSVs written by this notebook.
input_dir, output_dir = (os.path.join(cwd, folder) for folder in ('input', 'output'))
# CSV read into `df` in the next cell.
transformation_file = os.path.join(input_dir, 'result_of_transformation.csv')

## Membaca data hasil transformasi sebelumnya dan menyimpannya dalam *dataframe* baru

In [3]:
# Load the transformed time-series table (one row per sample; the displayed
# columns are id, label, time, amplitude and event_id).
df = pd.read_csv(transformation_file)

In [4]:
# Display the loaded frame (the notebook renders only the first and last rows).
df

Unnamed: 0,id,label,time,amplitude,event_id
0,1,1,0,-0.032211,1
1,1,1,1,-0.010978,1
2,1,1,2,-0.007734,1
3,1,1,3,-0.003970,1
4,1,1,4,0.004136,1
...,...,...,...,...,...
2114894,2114,2,995,-0.220835,2725
2114895,2114,2,996,-0.300042,2725
2114896,2114,2,997,-0.379990,2725
2114897,2114,2,998,-0.407046,2725


---

## Menghapus kolom yang tidak perlu

In [5]:
# Keep only the columns passed on to tsfresh: the series id, the sort key
# (time) and the measured value (amplitude).
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
# errors='ignore' makes the cell safe to re-run: `df` is rebound here, so a
# second execution would otherwise raise KeyError for the missing columns.
df = df.drop(columns=['event_id', 'label'], errors='ignore')

In [6]:
# Display the frame again to confirm that only id, time and amplitude remain.
df

Unnamed: 0,id,time,amplitude
0,1,0,-0.032211
1,1,1,-0.010978
2,1,2,-0.007734
3,1,3,-0.003970
4,1,4,0.004136
...,...,...,...
2114894,2114,995,-0.220835
2114895,2114,996,-0.300042
2114896,2114,997,-0.379990
2114897,2114,998,-0.407046


---

## Mengambil data label

In [7]:
# Read the label table with its first CSV column as the index, then keep only
# the 'label' column so that `y` is a Series.
y = pd.read_csv(os.path.join(input_dir, 'label.csv'), index_col=0)['label']
# Show the resulting Series.
y

1       1
2       1
3       1
4       1
5       1
       ..
2110    2
2111    2
2112    2
2113    2
2114    2
Name: label, Length: 2114, dtype: int64

---

## Melakukan ekstraksi *features* menggunakan `EfficientFCParameters`
https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html

In [8]:
# Worker count handed to tsfresh as `n_jobs` in the extraction and selection cells.
processor = 5
# Feature-calculator configuration passed as `default_fc_parameters` to extract_features.
settings = EfficientFCParameters()

In [9]:
# Compute one row of features per series: rows of `df` are grouped by 'id'
# and ordered by 'time' before the calculators in `settings` are applied.
# This is the slow step of the notebook (about 2.5 minutes in the recorded run).
efficient_features = extract_features(
    df,
    default_fc_parameters=settings,
    column_id='id', column_sort='time',
    n_jobs=processor,
)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 25/25 [02:26<00:00,  5.87s/it]


Melihat hasil ekstraksi *features*

In [10]:
# Display the extracted feature matrix (indexed by series id, one column per feature).
efficient_features

Unnamed: 0,amplitude__variance_larger_than_standard_deviation,amplitude__has_duplicate_max,amplitude__has_duplicate_min,amplitude__has_duplicate,amplitude__sum_values,amplitude__abs_energy,amplitude__mean_abs_change,amplitude__mean_change,amplitude__mean_second_derivative_central,amplitude__median,...,amplitude__permutation_entropy__dimension_5__tau_1,amplitude__permutation_entropy__dimension_6__tau_1,amplitude__permutation_entropy__dimension_7__tau_1,amplitude__query_similarity_count__query_None__threshold_0.0,"amplitude__matrix_profile__feature_""min""__threshold_0.98","amplitude__matrix_profile__feature_""max""__threshold_0.98","amplitude__matrix_profile__feature_""mean""__threshold_0.98","amplitude__matrix_profile__feature_""median""__threshold_0.98","amplitude__matrix_profile__feature_""25""__threshold_0.98","amplitude__matrix_profile__feature_""75""__threshold_0.98"
1,0.0,0.0,0.0,0.0,3.438250e-12,19.541841,0.070559,0.000011,-4.963242e-05,0.004209,...,3.551866,4.632335,5.572210,,0.989923,3.822515,2.233437,2.179965,1.839865,2.614556
2,0.0,0.0,0.0,0.0,2.301492e-13,7.390869,0.048051,0.000040,-1.195282e-05,0.001135,...,4.126592,5.285573,6.160600,,0.802988,3.243582,1.762846,1.733120,1.421240,2.095567
3,0.0,0.0,0.0,0.0,1.488698e-11,9.027499,0.046502,0.000010,-1.136874e-05,-0.000570,...,3.463705,4.468192,5.372275,,1.048702,4.391836,2.394916,2.259696,1.843935,2.875129
4,0.0,0.0,0.0,0.0,-6.516232e-12,5.815679,0.046589,-0.000024,6.332566e-07,0.000144,...,4.056004,5.205172,6.120090,,0.939566,3.836638,2.020023,1.979983,1.639101,2.374945
5,0.0,0.0,0.0,0.0,-6.692424e-13,49.325230,0.104913,0.000004,3.055010e-05,0.002612,...,3.329512,4.326275,5.235979,,1.108195,4.567074,2.492848,2.361321,1.898045,2.976771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,0.0,0.0,0.0,0.0,3.393397e-13,0.555332,0.006994,-0.000013,3.071015e-07,-0.000141,...,3.741362,4.930222,5.947250,,0.856938,3.191243,1.915255,1.899585,1.667645,2.163333
2111,0.0,0.0,0.0,0.0,4.679590e-12,60.000985,0.083394,0.000142,6.279844e-05,-0.003158,...,2.560175,3.214032,3.889401,,1.222361,5.041593,2.727012,2.737334,2.240628,3.142049
2112,0.0,0.0,0.0,0.0,2.000705e-12,2.113640,0.028958,0.000003,-7.890697e-06,0.000007,...,3.813289,4.933336,5.931802,,0.841534,2.942069,1.570292,1.532020,1.318993,1.772453
2113,0.0,0.0,0.0,0.0,-3.342771e-12,50.162677,0.093422,-0.000127,-1.684957e-05,-0.003489,...,3.353572,4.237889,5.020215,,0.981940,5.077616,2.365450,2.202731,1.786046,2.693299


Menyimpan hasil perhitungan awal *features*

In [11]:
# Create the output folder if it is missing; to_csv does not create parent
# directories and would fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
# Persist the raw (not yet imputed) features.
# NOTE(review): index=False drops the series id held in the index, so rows in
# the CSV can only be matched to labels by position — confirm this is intended.
efficient_features.to_csv(os.path.join(output_dir, 'efficient_features.csv'), index=False)

---

### Melakukan pemilihan *features* dan membuang *features* yang tidak relevan

Imputing dilakukan terhadap *features* yang telah diekstrak untuk memastikan tidak ada data yang bersifat *null* atau *NaN*

In [12]:
# Replace NaN/inf values in the extracted features. The return value is only
# displayed, not assigned, so the later select_features call relies on
# impute() having modified `efficient_features` in place (documented tsfresh
# behaviour — confirm for the installed version).
impute(efficient_features)



Unnamed: 0,amplitude__variance_larger_than_standard_deviation,amplitude__has_duplicate_max,amplitude__has_duplicate_min,amplitude__has_duplicate,amplitude__sum_values,amplitude__abs_energy,amplitude__mean_abs_change,amplitude__mean_change,amplitude__mean_second_derivative_central,amplitude__median,...,amplitude__permutation_entropy__dimension_5__tau_1,amplitude__permutation_entropy__dimension_6__tau_1,amplitude__permutation_entropy__dimension_7__tau_1,amplitude__query_similarity_count__query_None__threshold_0.0,"amplitude__matrix_profile__feature_""min""__threshold_0.98","amplitude__matrix_profile__feature_""max""__threshold_0.98","amplitude__matrix_profile__feature_""mean""__threshold_0.98","amplitude__matrix_profile__feature_""median""__threshold_0.98","amplitude__matrix_profile__feature_""25""__threshold_0.98","amplitude__matrix_profile__feature_""75""__threshold_0.98"
1,0.0,0.0,0.0,0.0,3.438250e-12,19.541841,0.070559,0.000011,-4.963242e-05,0.004209,...,3.551866,4.632335,5.572210,0.0,0.989923,3.822515,2.233437,2.179965,1.839865,2.614556
2,0.0,0.0,0.0,0.0,2.301492e-13,7.390869,0.048051,0.000040,-1.195282e-05,0.001135,...,4.126592,5.285573,6.160600,0.0,0.802988,3.243582,1.762846,1.733120,1.421240,2.095567
3,0.0,0.0,0.0,0.0,1.488698e-11,9.027499,0.046502,0.000010,-1.136874e-05,-0.000570,...,3.463705,4.468192,5.372275,0.0,1.048702,4.391836,2.394916,2.259696,1.843935,2.875129
4,0.0,0.0,0.0,0.0,-6.516232e-12,5.815679,0.046589,-0.000024,6.332566e-07,0.000144,...,4.056004,5.205172,6.120090,0.0,0.939566,3.836638,2.020023,1.979983,1.639101,2.374945
5,0.0,0.0,0.0,0.0,-6.692424e-13,49.325230,0.104913,0.000004,3.055010e-05,0.002612,...,3.329512,4.326275,5.235979,0.0,1.108195,4.567074,2.492848,2.361321,1.898045,2.976771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,0.0,0.0,0.0,0.0,3.393397e-13,0.555332,0.006994,-0.000013,3.071015e-07,-0.000141,...,3.741362,4.930222,5.947250,0.0,0.856938,3.191243,1.915255,1.899585,1.667645,2.163333
2111,0.0,0.0,0.0,0.0,4.679590e-12,60.000985,0.083394,0.000142,6.279844e-05,-0.003158,...,2.560175,3.214032,3.889401,0.0,1.222361,5.041593,2.727012,2.737334,2.240628,3.142049
2112,0.0,0.0,0.0,0.0,2.000705e-12,2.113640,0.028958,0.000003,-7.890697e-06,0.000007,...,3.813289,4.933336,5.931802,0.0,0.841534,2.942069,1.570292,1.532020,1.318993,1.772453
2113,0.0,0.0,0.0,0.0,-3.342771e-12,50.162677,0.093422,-0.000127,-1.684957e-05,-0.003489,...,3.353572,4.237889,5.020215,0.0,0.981940,5.077616,2.365450,2.202731,1.786046,2.693299


Mengambil *features* yang relevan.
https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_selection.html

In [13]:
# Keep only the feature columns that tsfresh judges relevant for predicting `y`.
# The target is treated as a classification label, and fdr_level=0.005 is the
# false-discovery-rate level used for the selection.
relevance_features = select_features(
    X=efficient_features, y=y,
    ml_task='classification',
    fdr_level=0.005,
    n_jobs=processor,
)

In [14]:
# Display the reduced feature matrix returned by select_features.
relevance_features

Unnamed: 0,amplitude__energy_ratio_by_chunks__num_segments_10__segment_focus_3,amplitude__kurtosis,"amplitude__agg_linear_trend__attr_""slope""__chunk_len_50__f_agg_""min""",amplitude__ratio_beyond_r_sigma__r_1,amplitude__ratio_beyond_r_sigma__r_3,amplitude__binned_entropy__max_bins_10,amplitude__ratio_beyond_r_sigma__r_0.5,"amplitude__agg_linear_trend__attr_""slope""__chunk_len_50__f_agg_""max""",amplitude__lempel_ziv_complexity__bins_100,amplitude__lempel_ziv_complexity__bins_10,...,amplitude__root_mean_square,amplitude__variance,amplitude__standard_deviation,"amplitude__fft_coefficient__attr_""real""__coeff_17",amplitude__ar_coefficient__coeff_5__k_10,"amplitude__fft_coefficient__attr_""abs""__coeff_82","amplitude__fft_coefficient__attr_""angle""__coeff_88","amplitude__fft_coefficient__attr_""angle""__coeff_98","amplitude__fft_coefficient__attr_""abs""__coeff_74","amplitude__fft_coefficient__attr_""imag""__coeff_16"
1,0.469513,5.959679,0.004841,0.200799,0.022977,1.217102,0.416583,-0.005377,0.445554,0.206793,...,0.139722,0.019522,0.139722,-0.255936,4.632095,4.781439,-174.380049,-3.787593,7.893489,0.297510
2,0.326544,10.845853,0.005782,0.166000,0.022000,1.095710,0.371000,-0.007323,0.433000,0.180000,...,0.085970,0.007391,0.085970,0.303064,0.149904,2.647566,29.291453,15.324467,0.810832,0.750789
3,0.345457,3.723390,0.002654,0.221000,0.020000,1.310342,0.470000,-0.002445,0.457000,0.213000,...,0.095013,0.009027,0.095013,1.545728,1.148375,4.472979,78.126925,142.877509,16.003414,0.244029
4,0.356715,5.284755,0.004640,0.215000,0.019000,1.142785,0.418000,-0.005138,0.449000,0.209000,...,0.076261,0.005816,0.076261,-1.038585,0.301560,2.017855,-79.612473,51.023973,2.397158,-0.162564
5,0.349177,2.932299,0.004034,0.209000,0.022000,1.614728,0.416000,-0.005391,0.469000,0.226000,...,0.222093,0.049325,0.222093,-0.031673,0.527180,8.606132,-73.521341,-61.072681,36.559995,-0.210694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,0.136688,0.001900,0.001691,0.312687,0.000999,1.884606,0.591409,-0.000901,0.494505,0.236763,...,0.023554,0.000555,0.023554,-0.855171,3.327245,0.375969,-45.478794,-37.066539,0.343040,-0.623953
2111,0.188925,1.014374,-0.011636,0.289000,0.008000,1.725398,0.512000,0.011150,0.479000,0.227000,...,0.244951,0.060001,0.244951,1.461314,2.185293,2.275094,123.750387,155.411774,5.192975,0.177391
2112,0.123255,1.956209,-0.000634,0.251000,0.014000,1.548374,0.532000,0.000686,0.492000,0.237000,...,0.045974,0.002114,0.045974,0.379760,0.341125,0.942002,61.453897,-30.846669,3.685654,-0.541400
2113,0.113944,1.044436,-0.017344,0.276000,0.008000,1.722862,0.537000,0.020907,0.465000,0.227000,...,0.223970,0.050163,0.223970,-1.606857,0.216068,8.355193,-165.544002,5.897375,20.571795,0.963679


Menyimpan *features* yang relevan ke dalam file CSV

In [15]:
# Create the output folder if it is missing; to_csv does not create parent
# directories and would fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
# Persist the selected features.
# NOTE(review): index=False drops the series id held in the index, so rows in
# the CSV can only be matched to labels by position — confirm this is intended.
relevance_features.to_csv(os.path.join(output_dir, 'relevance_features.csv'), index=False)