# Feature Extraction and Selection

In [1]:
import numpy as np
import pandas as pd
import pickle
import tsfresh
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tsfresh.feature_extraction.settings import from_columns
from tsfresh.feature_selection.relevance import calculate_relevance_table

The *input_id* selects which dataset is used in this analysis (see the *Input* directory for more information).

In [2]:
# Identifier of the dataset under analysis; it picks which Input_<id>.csv is read
# and is reused in the names of every file this notebook writes.
input_id = 1

# Location of the raw CSV, relative to the notebook's working directory.
PATH = '../Input/Input_%i.csv' % input_id

### Load dataset

In [3]:
# Parse the whole comma-separated file into one 2-D numeric array
# (one row per measurement, as the shape printed in the next cell shows).
full_data = np.genfromtxt(
    PATH,
    delimiter=',',
)

In [4]:
# Column 0 holds the series id and column 1 the time step; both appear to start
# at 1, so their maxima give the number of series and the samples per series.
n_timeseries = int(full_data[:, 0].max())
n_measures = int(full_data[:, 1].max())

# The last column is the per-row label. Summing it and dividing by the series
# length counts the series labelled 1 (reported below as "Inadequate Condition").
# This assumes every row of a series repeats the same 0/1 label.
label_total = full_data[:, -1].sum()
inadequate_condition = int(label_total / n_measures)
adequate_condition = n_timeseries - inadequate_condition

separator = 30 * '-'

print(separator)
print('Shape:', full_data.shape)
print(separator)
print('Number of Time Series:', n_timeseries)
print('    Adequate Condition:', adequate_condition)
print('    Inadequate Condition:', inadequate_condition)
print(separator)
print('Measurments per Time Serie:', n_measures)
print(separator)

------------------------------
Shape: (444000, 9)
------------------------------
Number of Time Series: 592
    Adequate Condition: 352
    Inadequate Condition: 240
------------------------------
Measurments per Time Serie: 750
------------------------------


### Split between Train and Test

In [5]:
# One entry per time series: its zero-based position in `full_data` and its label.
# NOTE: this relies on the ids in column 0 being 1..n_timeseries, stored in
# ascending order, with exactly `n_measures` consecutive rows per series.
index = full_data[::n_measures, 0].astype(int) - 1
target = full_data[::n_measures, -1]

L, W = full_data.shape


def _series_rows(series_idx, n_measures):
    """Return the row numbers of `full_data` that belong to the given series.

    Series `k` occupies rows `k*n_measures` .. `(k+1)*n_measures - 1`. The rows
    come back series by series, following the order of `series_idx`.
    """
    starts = np.asarray(series_idx, dtype=np.intp)[:, None] * n_measures
    return (starts + np.arange(n_measures)).ravel()


# Stratified 70/30 split at the series level, so that all measurements of one
# series land on the same side of the split.
train_idx, test_idx, train_target, test_target = train_test_split(
    index, target, test_size=.3, stratify=target, random_state=12)

# Sort the series positions and apply the same permutation to the labels, so
# that `train_target[i]` / `test_target[i]` still describe `train_idx[i]` /
# `test_idx[i]` (sorting the ids alone would silently misalign them).
train_order = np.argsort(train_idx)
train_idx, train_target = train_idx[train_order], train_target[train_order]
test_order = np.argsort(test_idx)
test_idx, test_target = test_idx[test_order], test_target[test_order]

# Expand series positions into the row numbers of their measurements.
L_train = train_idx.shape[0]
train_index = _series_rows(train_idx, n_measures)
L_test = test_idx.shape[0]
test_index = _series_rows(test_idx, n_measures)

train_data = full_data[train_index, :]
test_data = full_data[test_index, :]

# Per-subset class counts: the label column summed over all rows, divided by
# the series length, gives the number of series labelled 1.
train_inadequate_condition = int(train_data[:, -1].sum() / n_measures)
train_adequate_condition = L_train - train_inadequate_condition

test_inadequate_condition = int(test_data[:, -1].sum() / n_measures)
test_adequate_condition = L_test - test_inadequate_condition

print(30*'-')
print('Train shape:', train_data.shape)
print('    Number of Time Series:', L_train)
print('        Adequate Condition:', train_adequate_condition)
print('        Inadequate Condition:', train_inadequate_condition)
print(30*'-')
print('Test shape:', test_data.shape)
print('    Number of Time Series:', L_test)
print('        Adequate Condition:', test_adequate_condition)
print('        Inadequate Condition:', test_inadequate_condition)
print(30*'-')

# Persist both subsets; the `Subsets` directory must already exist.
np.savetxt('Subsets/Input_{}_Train.csv'.format(input_id), train_data, delimiter=',')
np.savetxt('Subsets/Input_{}_Test.csv'.format(input_id), test_data, delimiter=',')

------------------------------
Train shape: (310500, 9)
    Number of Time Series: 414
        Adequate Condition: 246
        Inadequate Condition: 168
------------------------------
Test shape: (133500, 9)
    Number of Time Series: 178
        Adequate Condition: 106
        Inadequate Condition: 72
------------------------------


### Normalize dataset

In [6]:
# Split dataset in 3 parts
info = train_data[:,0:2] # [id, time]
data = train_data[:,2:-1] # [Voltage A, Voltage B, Voltage C, Current A, Current B, Current C]
target = train_data[::n_measures,-1] #[target]

# Normalizing each column within (0,1)
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

columns = ['id', 'time', 'voltage_A', 'voltage_B', 'voltage_C', 'current_A', 'current_B', 'current_C']
df = pd.DataFrame(np.concatenate((info,data), axis=1), columns=columns)

df.head()

Unnamed: 0,id,time,voltage_A,voltage_B,voltage_C,current_A,current_B,current_C
0,1.0,1.0,0.5,0.456,0.524823,0.366407,0.71549,0.347956
1,1.0,2.0,0.478873,0.44,0.539007,0.292153,0.718299,0.579262
2,1.0,3.0,0.485915,0.448,0.539007,0.414721,0.69382,0.622134
3,1.0,4.0,0.478873,0.44,0.546099,0.46725,0.290128,0.393154
4,1.0,5.0,0.478873,0.448,0.546099,0.421855,0.64687,0.262213


### Feature Extraction

In [7]:
# Run tsfresh feature extraction on every signal column, grouping rows by `id`
# and ordering them by `time`. No feature settings are passed, so tsfresh's
# defaults apply. Slow step: the recorded run took about ten minutes.
extracted_features = tsfresh.extract_features(
    df,
    column_id="id",
    column_sort="time",
    n_jobs=4,
)

Feature Extraction: 100%|██████████| 20/20 [09:36<00:00, 28.83s/it]


### Checking NaN Features

In [8]:
features = extracted_features.columns

# A feature is "invalid" as soon as a single series produced NaN for it.
nan_columns = [col for col in features if extracted_features.loc[:, col].hasnans]
rejected = set(nan_columns)
valid_features = [col for col in features if col not in rejected]

n_total = len(features)
n_valid = len(valid_features)
n_invalid = len(nan_columns)
separator = 30 * '-'

print(separator)
print('Number of Features: ', n_total)
print('Number of Valid Features: ', n_valid)
print('Number of Invalid Features: ', n_invalid)
print(separator)
print('Percentage of Valid Features:  {:.6f}%'.format(n_valid*100/n_total))
print('Percentage of Invalid Features: {:.6f}%'.format(n_invalid*100/n_total))
print(separator)

# tsfresh settings dictionary rebuilt from the names of the NaN-free features;
# it is pickled at the end of the notebook.
valid_features_dict = from_columns(valid_features)

------------------------------
Number of Features:  4722
Number of Valid Features:  4701
Number of Invalid Features:  21
------------------------------
Percentage of Valid Features:  99.555273%
Percentage of Invalid Features: 0.444727%
------------------------------


### Feature Selection

In [9]:
# Feature matrix restricted to the columns that contain no NaN.
X = extracted_features.drop(columns=nan_columns)

# Labels carried on X's index so the two can be passed to tsfresh together.
# Assumes `target` is ordered like the rows of X (ascending series id) - verify
# if the split logic changes.
y = pd.Series(target, index=X.index)

In [10]:
# Score each feature against the label with tsfresh's relevance test.
relevance_table = calculate_relevance_table(X, y)

# Keep the names of the features flagged as relevant...
is_relevant = relevance_table['relevant']
relevant_features = relevance_table.loc[is_relevant, 'feature']

# ...and the matching columns of the feature matrix.
selected_features = X.loc[:, relevant_features]

# Settings dictionary rebuilt from the selected column names; it is pickled at
# the end of the notebook alongside the selected features.
kind_to_fc_parameters = from_columns(selected_features)

In [11]:
# Before/after feature counts for the selection step.
separator = 30 * '-'
n_extracted = len(features)
n_selected = selected_features.shape[1]

print(separator)
print('Number of Extracted Features: ', n_extracted)
print('Number of Selected Features: ', n_selected)
print(separator)

------------------------------
Number of Extracted Features:  4722
Number of Selected Features:  994
------------------------------


### Saving files for next stages

In [13]:
# Objects handed over to the next stages, keyed by their pickle path template
# (the placeholder receives `input_id`). The `Kernel` directory must already exist.
artifacts = {
    'Kernel/scaler__{}.pkl': scaler,
    'Kernel/valid_features_dict__{}.pkl': valid_features_dict,
    'Kernel/final_target_{}.pkl': target,
    'Kernel/kind_to_fc_parameters_{}.pkl': kind_to_fc_parameters,
    'Kernel/columns_{}.pkl': selected_features.columns.to_list(),
    'Kernel/selected_features_{}.pkl': selected_features,
}

# Written one file per object, in the order listed above.
for path_template, artifact in artifacts.items():
    with open(path_template.format(input_id), 'wb') as f:
        pickle.dump(artifact, f)