In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a regression problem.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data. -> 🎯 **Focus on this task**
2. Implement a more efficient **sliding window method** for time series data analysis.
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import sys
sys.path.insert(0, '/content/drive/MyDrive/PSDA')

In [6]:
!pip install tsfresh
!pip install colorlog

Collecting tsfresh
  Downloading tsfresh-0.20.2-py2.py3-none-any.whl (95 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.8/95.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting stumpy>=1.7.2 (from tsfresh)
  Downloading stumpy-1.12.0-py3-none-any.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.1/169.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: stumpy, tsfresh
Successfully installed stumpy-1.12.0 tsfresh-0.20.2
Collecting colorlog
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Installing collected packages: colorlog
Successfully installed colorlog-6.8.2


In [7]:
# source code
from src.utils import load_data, load_config
from src.data_preprocessing import create_rolling_windows_datasets

In [8]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [9]:
np.random.seed(42)

# Paths

In [10]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
#os.chdir("../") # set working directory to root of project
#os.getcwd() # check current working directory

In [10]:
PATH_TO_CONFIG = "drive/MyDrive/PSDA/configs/config.yaml"

# Load Config + Data

In [11]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [12]:
train_data_1, test_data_1, test_RUL_data_1 = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)

2024-05-19 16:45:49 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m


INFO:src.utils:Loading data set 1...


2024-05-19 16:45:50 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m


INFO:src.utils:Loaded raw data for dataset 1.


2024-05-19 16:45:50 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m


INFO:src.utils:Train Data: (20631, 26)


2024-05-19 16:45:50 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m


INFO:src.utils:Test Data: (13096, 26)


2024-05-19 16:45:50 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


INFO:src.utils:Test RUL Data: (100, 1)


In [13]:
train_data_2, test_data_2, test_RUL_data_2 = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)

2024-05-19 16:45:52 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m


INFO:src.utils:Loading data set 2...


2024-05-19 16:45:54 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m


INFO:src.utils:Loaded raw data for dataset 2.


2024-05-19 16:45:54 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m


INFO:src.utils:Train Data: (53759, 26)


2024-05-19 16:45:54 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m


INFO:src.utils:Test Data: (33991, 26)


2024-05-19 16:45:54 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


INFO:src.utils:Test RUL Data: (259, 1)


In [14]:
train_data_3, test_data_3, test_RUL_data_3 = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)

2024-05-19 16:45:55 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 3...[0m


INFO:src.utils:Loading data set 3...


2024-05-19 16:45:56 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 3.[0m


INFO:src.utils:Loaded raw data for dataset 3.


2024-05-19 16:45:56 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (24720, 26)[0m


INFO:src.utils:Train Data: (24720, 26)


2024-05-19 16:45:56 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (16596, 26)[0m


INFO:src.utils:Test Data: (16596, 26)


2024-05-19 16:45:56 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


INFO:src.utils:Test RUL Data: (100, 1)


In [15]:
train_data_4, test_data_4, test_RUL_data_4 = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)

2024-05-19 16:45:57 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m


INFO:src.utils:Loading data set 4...


2024-05-19 16:45:59 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m


INFO:src.utils:Loaded raw data for dataset 4.


2024-05-19 16:45:59 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m


INFO:src.utils:Train Data: (61249, 26)


2024-05-19 16:45:59 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m


INFO:src.utils:Test Data: (41214, 26)


2024-05-19 16:45:59 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m


INFO:src.utils:Test RUL Data: (248, 1)


# 📍 Subtask 1: Feature Extraction

### Feature Extraction with tsfresh

In [16]:
from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters, ComprehensiveFCParameters

In [17]:
features_all = ComprehensiveFCParameters()
features_eff = EfficientFCParameters()
features_minimal = MinimalFCParameters()



In [18]:
features_all.keys() ^ features_eff.keys()

{'approximate_entropy', 'sample_entropy'}

Findings: The differences between efficient and all features are two features which have a very high computational cost which is why these two features are not further investigated.

Dataset 1

There is a bug in extracting the RUL in create_rolling_windows_datasets

In [19]:
np.random.seed(seed=5)
random_indices = np.arange(1, 101)
np.random.shuffle(random_indices)

train_data_1_sample = train_data_1[train_data_1['UnitNumber'].isin(random_indices[:10])]

In [21]:
train_data_1_rolling = create_rolling_windows_datasets(train_data_1_sample, test_data_1, test_RUL_data_1, max_timeshift=20, min_timeshift=5, feature_extraction_mode='efficient')



2024-05-19 16:34:52 [[34msrc.data_preprocessing:78[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.data_preprocessing:Creating rolling windows for train data...
Rolling: 100%|██████████| 313/313 [00:05<00:00, 58.13it/s]


2024-05-19 16:34:57 [[34msrc.data_preprocessing:82[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.data_preprocessing:Extracting features for train data...
Feature Extraction:  21%|██        | 10187/48096 [09:53<36:46, 17.18it/s]


KeyboardInterrupt: 

In [None]:
X_train, y_train, X_test, y_test = train_data_1_rolling
timestamp = time.strftime("%Y%m%d-%H%M%S")
X_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_train_data_1_max_20_min_5_sample_10_{timestamp}.pkl")
y_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_train_data_1_max_20_min_5_sample_10_{timestamp}.pkl")
X_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_test_data_1_max_20_min_5_sample_10_{timestamp}.pkl")
y_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_test_data_1_max_20_min_5_sample_10_{timestamp}.pkl")

In [None]:
#Check if data is saved
pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_train_data_2_max_20_min_5_sample_10_20240515-160124.pkl')

Dataset 2

In [None]:
np.random.seed(seed=4)
random_indices = np.arange(1, 101)
np.random.shuffle(random_indices)

train_data_2_sample = train_data_2[train_data_2['UnitNumber'].isin(random_indices[:10])]

In [None]:
train_data_2_rolling = create_rolling_windows_datasets(train_data_2_sample, test_data_2, test_RUL_data_2, max_timeshift=20, min_timeshift=5, feature_extraction_mode='efficient')

In [None]:
X_train, y_train, X_test, y_test = train_data_2_rolling
timestamp = time.strftime("%Y%m%d-%H%M%S")
X_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_train_data_2_max_20_min_5_sample_10_{timestamp}.pkl")
y_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_train_data_2_max_20_min_5_sample_10_{timestamp}.pkl")
X_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_test_data_2_max_20_min_5_sample_10_{timestamp}.pkl")
y_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_test_data_2_max_20_min_5_sample_10_{timestamp}.pkl")

In [None]:
#Check if data is saved
pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_train_data_2_max_20_min_5_sample_10_20240515-160124.pkl')

Dataset 3

In [None]:
np.random.seed(seed=8743)
random_indices = np.arange(1, 101)
np.random.shuffle(random_indices)

train_data_3_sample = train_data_3[train_data_3['UnitNumber'].isin(random_indices[:10])]

In [None]:
train_data_3_rolling = create_rolling_windows_datasets(train_data_3_sample, test_data_3, test_RUL_data_3, max_timeshift=20, min_timeshift=5, feature_extraction_mode='efficient')

In [None]:
train_data_3_rolling = create_rolling_windows_datasets(train_data_3_sample, test_data_3, test_RUL_data_3, max_timeshift=20, min_timeshift=5, feature_extraction_mode='efficient')

In [None]:
X_train, y_train, X_test, y_test = train_data_3_rolling
timestamp = time.strftime("%Y%m%d-%H%M%S")
X_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_train_data_3_max_20_min_5_sample_10_{timestamp}.pkl")
y_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_train_data_3_max_20_min_5_sample_10_{timestamp}.pkl")
X_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_test_data_3_max_20_min_5_sample_10_{timestamp}.pkl")
y_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_test_data_3_max_20_min_5_sample_10_{timestamp}.pkl")

In [None]:
#Check if data is saved
pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_train_data_3_max_20_min_5_sample_10_20240516-075646.pkl')

Dataset 4

In [None]:
np.random.seed(seed=532)
random_indices = np.arange(1, 101)
np.random.shuffle(random_indices)

train_data_4_sample = train_data_4[train_data_4['UnitNumber'].isin(random_indices[:10])]

In [None]:
train_data_4_rolling = create_rolling_windows_datasets(train_data_4_sample, test_data_4, test_RUL_data_4, max_timeshift=15, min_timeshift=5, feature_extraction_mode='efficient')

In [None]:
X_train, y_train, X_test, y_test = train_data_4_rolling
timestamp = time.strftime("%Y%m%d-%H%M%S")
X_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_train_data_4_max_15_min_5_sample_10_{timestamp}.pkl")
y_train.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_train_data_4_max_15_min_5_sample_10_{timestamp}.pkl")
X_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_test_data_4_max_15_min_5_sample_10_{timestamp}.pkl")
y_test.to_pickle(f"{config['paths']['processed_data_dir']}ex2_rul_test_data_4_max_15_min_5_sample_10_{timestamp}.pkl")

In [None]:
#Check if data is saved
pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_topic_train_data_4_max_20_min_5_sample_10_20240515-160124.pkl')

___________________
Import Datasets
____________________

In [20]:
# Import uploaded data to avoid a collection everytime
#pd.read_pickle('drive/MyDrive/PSDA/data/processed/XXX')

ds1_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_train_data_1_max_20_min_5_sample_10_20240516-070658.pkl')
ds2_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_train_data_2_max_20_min_5_sample_10_20240515-160124.pkl')
ds3_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_train_data_3_max_20_min_5_sample_10_20240516-075646.pkl')
ds4_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_train_data_4_max_15_min_5_sample_10_20240519-095800.pkl')

In [21]:

ds1_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_test_data_1_max_20_min_5_sample_10_20240516-070658.pkl')
ds2_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_test_data_2_max_20_min_5_sample_10_20240515-160124.pkl')
ds3_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_test_data_3_max_20_min_5_sample_10_20240516-075646.pkl')
ds4_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_test_data_4_max_15_min_5_sample_10_20240519-095800.pkl')

In [22]:
ds1_rul_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_train_data_1_max_20_min_5_sample_10_20240519-090004.pkl')
ds2_rul_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_train_data_2_max_20_min_5_sample_10_20240519-090004.pkl')
ds3_rul_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_train_data_3_max_20_min_5_sample_10_20240519-090004.pkl')
ds4_rul_train = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_train_data_4_max_15_min_5_sample_10_20240519-095800.pkl')

In [23]:
ds1_rul_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_test_data_1_max_20_min_5_sample_10_20240516-070658.pkl')
ds2_rul_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_test_data_2_max_20_min_5_sample_10_20240515-160124.pkl')
ds3_rul_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_test_data_3_max_20_min_5_sample_10_20240516-075646.pkl')
ds4_rul_test = pd.read_pickle('drive/MyDrive/PSDA/data/processed/ex2_rul_test_data_4_max_15_min_5_sample_10_20240519-095800.pkl')

Features Analysis

In [24]:
print(ds1_train.shape, ds1_rul_train.shape)
print(ds2_train.shape, ds2_rul_train.shape)
print(ds3_train.shape, ds3_rul_train.shape)
print(ds4_train.shape, ds4_rul_train.shape)

(2004, 18648) (2054, 1)
(2048, 18648) (2098, 1)
(2101, 18648) (2151, 1)
(2153, 18648) (2153, 1)


Findings/Notes: shape for rul and data is not fitting!

In [25]:
print(ds1_test.shape, ds1_rul_test.shape)
print(ds2_test.shape, ds2_rul_test.shape)
print(ds3_test.shape, ds3_rul_test.shape)
print(ds4_test.shape, ds4_rul_test.shape)

(100, 18648) (100, 1)
(259, 18648) (259, 1)
(100, 18648) (100, 1)
(248, 18648) (248, 1)


In [26]:
print(ds4_train.shape)
ds4_train["RUL"] = ds4_rul_train
print(ds4_train.shape)


(2153, 18648)
(2153, 18649)


In [27]:
correlation = ds4_train.corr()

In [28]:
correlation.to_pickle("drive/MyDrive/PSDA/data/processed/dataset4_correlation_matrix.pkl")

In [30]:
corr_rul = []
for i in range(0,ds4_train.shape(1)-1):
  if ds4_train["RUL"].corr(ds4_train[i])

SyntaxError: expected ':' (<ipython-input-30-131ed6714ae8>, line 3)

In [31]:
corr_rul = correlation["RUL"]
cd = corr_rul.dropna()
list_corr25_ds4 = []
for i in range(0,len(cd)):
    if cd.iloc[i] < 0.25 and cd.iloc[i] > -0.25:
        list_corr25_ds4.append(cd.index[i])



print(list_corr25_ds4)
cdd = cd.drop(list_corr25_ds4, inplace=False)
cddo = cdd.abs().sort_values(kind='quicksort',ascending=False)
print(cd.shape, cdd.shape)

['Operation Setting 1__has_duplicate_max', 'Operation Setting 1__has_duplicate_min', 'Operation Setting 1__has_duplicate', 'Operation Setting 1__mean_abs_change', 'Operation Setting 1__mean_change', 'Operation Setting 1__mean_second_derivative_central', 'Operation Setting 1__median', 'Operation Setting 1__mean', 'Operation Setting 1__length', 'Operation Setting 1__standard_deviation', 'Operation Setting 1__variation_coefficient', 'Operation Setting 1__variance', 'Operation Setting 1__skewness', 'Operation Setting 1__kurtosis', 'Operation Setting 1__root_mean_square', 'Operation Setting 1__absolute_sum_of_changes', 'Operation Setting 1__longest_strike_below_mean', 'Operation Setting 1__longest_strike_above_mean', 'Operation Setting 1__count_above_mean', 'Operation Setting 1__count_below_mean', 'Operation Setting 1__last_location_of_maximum', 'Operation Setting 1__first_location_of_maximum', 'Operation Setting 1__last_location_of_minimum', 'Operation Setting 1__first_location_of_minimum'

In [32]:
corr_rul = correlation["RUL"]
cd = corr_rul.dropna()
list_corr1_ds4 = []
for i in range(0,len(cd)):
    if cd.iloc[i] < 0.1 and cd.iloc[i] > -0.1:
        list_corr1_ds4.append(cd.index[i])

print(list_corr1_ds4)
cdd = cd.drop(list_corr1_ds4, inplace=False)
cddo = cdd.abs().sort_values(kind='quicksort',ascending=False)
print(cd.shape, cdd.shape)

['Operation Setting 1__has_duplicate_max', 'Operation Setting 1__has_duplicate_min', 'Operation Setting 1__has_duplicate', 'Operation Setting 1__mean_abs_change', 'Operation Setting 1__mean_change', 'Operation Setting 1__mean_second_derivative_central', 'Operation Setting 1__standard_deviation', 'Operation Setting 1__variance', 'Operation Setting 1__kurtosis', 'Operation Setting 1__absolute_sum_of_changes', 'Operation Setting 1__count_above_mean', 'Operation Setting 1__last_location_of_maximum', 'Operation Setting 1__first_location_of_maximum', 'Operation Setting 1__last_location_of_minimum', 'Operation Setting 1__first_location_of_minimum', 'Operation Setting 1__percentage_of_reoccurring_values_to_all_values', 'Operation Setting 1__percentage_of_reoccurring_datapoints_to_all_datapoints', 'Operation Setting 1__sum_of_reoccurring_values', 'Operation Setting 1__sum_of_reoccurring_data_points', 'Operation Setting 1__ratio_value_number_to_time_series_length', 'Operation Setting 1__maximum'

In [33]:
print(len(list_corr1_ds4))
print(len(list_corr25_ds4))

5963
7843


In [34]:
print(ds4_train.iloc[:,1])
#print(ds4_train.columns[1])

UnitNumber  Cycle
5           6        0.0
            7        0.0
            8        0.0
            9        0.0
            10       0.0
                    ... 
93          192      0.0
            193      0.0
            194      0.0
            195      0.0
            196      0.0
Name: Operation Setting 1__has_duplicate_max, Length: 2153, dtype: float64


In [35]:
list_const_ds4 = []

for i in  range(0, ds4_train.shape[1]):
  if max(ds4_train.iloc[:,i]) == min(ds4_train.iloc[:,i]):
     list_const_ds4.append(ds4_train.columns[i])

In [36]:
print(list_const_ds4)
print(len(list_const_ds4))

['Operation Setting 1__variance_larger_than_standard_deviation', 'Operation Setting 1__symmetry_looking__r_0.0', 'Operation Setting 1__symmetry_looking__r_0.35000000000000003', 'Operation Setting 1__symmetry_looking__r_0.4', 'Operation Setting 1__symmetry_looking__r_0.45', 'Operation Setting 1__symmetry_looking__r_0.5', 'Operation Setting 1__symmetry_looking__r_0.55', 'Operation Setting 1__symmetry_looking__r_0.6000000000000001', 'Operation Setting 1__symmetry_looking__r_0.65', 'Operation Setting 1__symmetry_looking__r_0.7000000000000001', 'Operation Setting 1__symmetry_looking__r_0.75', 'Operation Setting 1__symmetry_looking__r_0.8', 'Operation Setting 1__symmetry_looking__r_0.8500000000000001', 'Operation Setting 1__symmetry_looking__r_0.9', 'Operation Setting 1__symmetry_looking__r_0.9500000000000001', 'Operation Setting 1__large_standard_deviation__r_0.05', 'Operation Setting 1__large_standard_deviation__r_0.1', 'Operation Setting 1__large_standard_deviation__r_0.15000000000000002'

In [37]:
print(set(list_corr25_ds4) & set(list_const_ds4))
print(len(set(list_corr25_ds4) & set(list_const_ds4)))

set()
0


In [38]:
print(len(list_corr25_ds4)+len(list_const_ds4))
print(ds4_train.shape[1])
print("Remaining Features: ", ds4_train.shape[1] - (len(list_corr25_ds4)+len(list_const_ds4)))

18604
18649
Remaining Features:  45


In [45]:
drop_list = list_const_ds4 + list_corr25_ds4
ds4_rmn_fts = ds4_train.drop(columns=drop_list, inplace=False)

print(ds4_rmn_fts)
correlation.to_pickle("drive/MyDrive/PSDA/data/processed/dataset4_remaining_features_0519.pkl")

                  Operation Setting 1__sum_values  \
UnitNumber Cycle                                    
5          6                             179.0060   
           7                             221.0056   
           8                             241.0114   
           9                             276.0146   
           10                            311.0182   
...                                           ...   
93         192                           369.0294   
           193                           411.0307   
           194                           411.0289   
           195                           406.0284   
           196                           396.0237   

                  Operation Setting 1__abs_energy  \
UnitNumber Cycle                                    
5          6                          6103.446248   
           7                          7867.412649   
           8                          8267.644682   
           9                          9492.86

In [134]:
print(ds4_rmn_fts.columns)
wanted_features = ds4_rmn_fts.columns

Index(['Operation Setting 1__sum_values', 'Operation Setting 1__abs_energy',
       'Operation Setting 1__fft_coefficient__attr_"real"__coeff_0',
       'Operation Setting 1__fft_coefficient__attr_"abs"__coeff_0',
       'Operation Setting 2__sum_values', 'Operation Setting 2__abs_energy',
       'Operation Setting 2__benford_correlation',
       'Operation Setting 2__fft_coefficient__attr_"real"__coeff_0',
       'Operation Setting 2__fft_coefficient__attr_"abs"__coeff_0',
       'Sensor Measure 6__ratio_value_number_to_time_series_length',
       'Sensor Measure 7__permutation_entropy__dimension_4__tau_1',
       'Sensor Measure 11__abs_energy', 'Sensor Measure 13__median',
       'Sensor Measure 13__first_location_of_maximum',
       'Sensor Measure 13__percentage_of_reoccurring_values_to_all_values',
       'Sensor Measure 13__percentage_of_reoccurring_datapoints_to_all_datapoints',
       'Sensor Measure 13__sum_of_reoccurring_values',
       'Sensor Measure 13__sum_of_reoccurring

In [135]:
if type(wanted_features) != list: string_list = wanted_features.copy().tolist()

for i in range(0,len(string_list)):
  string_list[i] = string_list[i].partition("__")[2]
  string_list[i] = string_list[i].partition("__")[0]
  #print(string_list[i])

wanted_features = string_list

print(wanted_features)

['sum_values', 'abs_energy', 'fft_coefficient', 'fft_coefficient', 'sum_values', 'abs_energy', 'benford_correlation', 'fft_coefficient', 'fft_coefficient', 'ratio_value_number_to_time_series_length', 'permutation_entropy', 'abs_energy', 'median', 'first_location_of_maximum', 'percentage_of_reoccurring_values_to_all_values', 'percentage_of_reoccurring_datapoints_to_all_datapoints', 'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points', 'ratio_value_number_to_time_series_length', 'maximum', 'absolute_maximum', 'quantile', 'quantile', 'quantile', 'quantile', 'change_quantiles', 'agg_linear_trend', 'maximum', 'absolute_maximum', 'quantile', 'quantile', 'binned_entropy', 'agg_linear_trend', 'sum_values', 'abs_energy', 'median', 'cwt_coefficients', 'cwt_coefficients', 'cwt_coefficients', 'cwt_coefficients', 'cwt_coefficients', 'cwt_coefficients', 'fft_coefficient', 'fft_coefficient', '']


In [137]:
from tsfresh.feature_extraction import feature_calculators
ds4_FCParameter = EfficientFCParameters().copy()

#for fname in feature_calculators.__dict__.keys():
#    if fname in ds4_FCParameter and fname in wanted_features:
#        print(fname)

for fname in feature_calculators.__dict__.keys():
    if fname in ds4_FCParameter and not fname in wanted_features:
        del ds4_FCParameter[fname]





sum_values
abs_energy
median
first_location_of_maximum
percentage_of_reoccurring_values_to_all_values
percentage_of_reoccurring_datapoints_to_all_datapoints
sum_of_reoccurring_values
sum_of_reoccurring_data_points
ratio_value_number_to_time_series_length
fft_coefficient
cwt_coefficients
change_quantiles
binned_entropy
permutation_entropy
quantile
maximum
absolute_maximum
agg_linear_trend
benford_correlation
