In [None]:
## General imports
import sys, os, joblib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from time import time

## sklearn imports
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, SelectFromModel
from sklearn.model_selection import GroupKFold, cross_validate


# Define local directories
dir_current = os.getcwd()
dir_base = os.path.dirname(dir_current)

: 

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_selected = X[self.attribute_names]
        if self.dtype:
            return X_selected.astype(self.dtype).values
        return X_selected.values

class CorrelationThreshold(BaseEstimator, TransformerMixin):
    
    """Feature selector that removes all correlated features.

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.
    
    Parameters
    ----------
    threshold : float, default=0.95
        Features with a training-set correlation higher than this threshold will
        be removed. The default is to keep all features with non-zero variance,
        i.e. remove the features that have the same value in all samples.

    Returns
    ----------
    selected_features_ : list, shape (n_features)
        Returns a list with the selected feature names.

    """

    def __init__(self, threshold = 0.95):
        self.threshold = threshold
        self.to_drop = None
        self.to_keep = None

    def fit (self, X, y = None ): 
        '''
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute variances.
        y : any, default=None
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.
        Returns
        -------
        self
        '''
        X = pd.DataFrame(X)
        corr_matrix = X.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        self.to_drop = [column for column in upper.columns if any(upper[column] > self.threshold)]
        self.to_keep = list(set(X.columns) - set(self.to_drop))
        return self
        
    def transform(self, X, y = None):
        X_selected = X[self.to_keep]
        return X_selected
    
    def get_support(self):
        return self.to_keep


# Importing datasets

In [3]:
# select dataframe
df_name = 'df-all-dev'
print('Dataset name:', df_name)
# directory where the datasets are located
df_path = os.path.join(dir_base, 'data', 'tsfel', "{}.csv".format(df_name))
# imports all datasets in directory
df = pd.read_csv( df_path, 
        index_col = ['Timestamp'], 
        parse_dates = ['Timestamp'],
        dayfirst = True,
        date_parser = lambda x: pd.to_datetime(x, format = '%Y-%m-%d %H:%M:%S.%f'))
print("Dataset shape:", df.shape)

Dataset name: df-all-dev
Dataset shape: (26338, 3335)


In [4]:
# insert a random variable for feature importance comparison
df.insert(0,                # position
            'random',       # column name
            np.random.RandomState(1234).uniform(low=0, high=1, size = (df.shape[0]),)) 


In [5]:
# # Feature preprocessing 
# # numeric
# numeric_features = df.columns[:-5]
# print("Numeric Features:",  numeric_features)
# print("Number of numeric features:", len(numeric_features))
# numeric_transformer = Pipeline([
#         ('ft', DataFrameSelector(numeric_features,'float64')),
#         ('var', VarianceThreshold()),
#         ('cor', CorrelationThreshold())
#      ])

# # categorical
# categorical_features = ["Breed"]
# print("Categorical features:", categorical_features)
# categorical_transformer = Pipeline([
#         ('ft', DataFrameSelector(categorical_features,'category')),
#         ('enc', OneHotEncoder(handle_unknown="ignore"))
#     ])

# # Preprocessor
# preprocessor = ColumnTransformer([
#         ("num", numeric_transformer, numeric_features),
#         ("cat", categorical_transformer, categorical_features),
#     ])

# # Random forest pipeline for feature selection
# rf_pipe = Pipeline([
#         ('prep', preprocessor),
#         ('clf', RandomForestClassifier(random_state= 42))
#     ])


Numeric Features: Index(['random', 'Back.Acc.X_Absolute energy',
       'Back.Acc.X_Area under the curve', 'Back.Acc.X_Autocorrelation',
       'Back.Acc.X_Centroid', 'Back.Acc.X_ECDF Percentile Count_0',
       'Back.Acc.X_ECDF Percentile Count_1', 'Back.Acc.X_ECDF Percentile_0',
       'Back.Acc.X_ECDF Percentile_1', 'Back.Acc.X_ECDF_0',
       ...
       'Neck.Gyr.Z_Wavelet variance_0', 'Neck.Gyr.Z_Wavelet variance_1',
       'Neck.Gyr.Z_Wavelet variance_2', 'Neck.Gyr.Z_Wavelet variance_3',
       'Neck.Gyr.Z_Wavelet variance_4', 'Neck.Gyr.Z_Wavelet variance_5',
       'Neck.Gyr.Z_Wavelet variance_6', 'Neck.Gyr.Z_Wavelet variance_7',
       'Neck.Gyr.Z_Wavelet variance_8', 'Neck.Gyr.Z_Zero crossing rate'],
      dtype='object', length=3331)
Number of numeric features: 3331
Categorical features: ['Breed']


In [7]:
feat = df.columns[:-5]
pipe = Pipeline([
        ('ft', DataFrameSelector(feat,'float64')),
        ('var', VarianceThreshold()),
        #('cor', CorrelationThreshold()),
        ('clf', RandomForestClassifier(random_state= 42))
    ])

In [10]:
from sklearn.model_selection import GroupKFold, GridSearchCV

start_time = time()
# using df_train to check on the Feature Importances for the RF classifier
# this will help me pick an optimal  number for the feature selection algorithm
# rf_cv = cross_validate(
#             estimator = rf_pipe, 
#             X = df.loc[:, feat].values, 
#             y = df.loc[:, 'Position'].values, 
#             groups = df.loc[:,'Dog'],
#             cv= GroupKFold(n_splits = 3), 
#             scoring = 'f1_weighted', 
#             return_train_score= True,
#             return_estimator = True,
#             n_jobs = -1
#         )

# prepare dataframe for evaluation: select features, label,
#   cv strategy (group = dogs, stractified folds labels proportion)
X = df.loc[:, feat]
y = df.loc[:, 'Position'].values

groups = df.loc[:,'Dog']
params = dict()
cv = GroupKFold(n_splits = 10).split(X, y, groups = groups)
gs = GridSearchCV(pipe, param_grid = params, 
        scoring = 'f1_weighted', \
        n_jobs = 40, cv = cv, return_train_score = True)
gs.fit(X,y, groups = groups)

end_time = time()
duration = end_time - start_time
print("\n\n--- %s seconds ---\n\n" % (duration))



--- 622.0925557613373 seconds ---




In [32]:
class gs_results:
    # Storing Grid Search results
    def __init__(self, gs):
        self.cv_results_ = gs.cv_results_
        self.best_estimator_ = gs.best_estimator_
        self.best_params_ = gs.best_params_
        self.best_score_ = gs.best_score_
        
print(gs)
run = "TSFEL-SELECT.pkl"
# save gs results to pickle file
gs_path = os.path.join(dir_current, run)
print(gs_path)
joblib.dump(gs_results(gs), gs_path, compress = 1)  

GridSearchCV(cv=<generator object _BaseKFold.split at 0x2b4ebda49d50>,
             estimator=Pipeline(steps=[('ft',
                                        DataFrameSelector(attribute_names=Index(['random', 'Back.Acc.X_Absolute energy',
       'Back.Acc.X_Area under the curve', 'Back.Acc.X_Autocorrelation',
       'Back.Acc.X_Centroid', 'Back.Acc.X_ECDF Percentile Count_0',
       'Back.Acc.X_ECDF Percentile Count_1', 'Back.Acc.X_ECDF Percen...
       'Neck.Gyr.Z_Wavelet variance_4', 'Neck.Gyr.Z_Wavelet variance_5',
       'Neck.Gyr.Z_Wavelet variance_6', 'Neck.Gyr.Z_Wavelet variance_7',
       'Neck.Gyr.Z_Wavelet variance_8', 'Neck.Gyr.Z_Zero crossing rate'],
      dtype='object', length=3331),
                                                          dtype='float64')),
                                       ('var', VarianceThreshold()),
                                       ('clf',
                                        RandomForestClassifier(random_state=42))]),
             n_jo

['/ichec/home/users/mmarcato/dog_posture/jupyter/TSFEL-SELECT.pkl']

In [35]:
importance_rf = pd.DataFrame({
        'features': feat[np.where(gs.best_estimator_['var'].variances_ != 0)], 
        'importance':  gs.best_estimator_['clf'].feature_importances_
        })

In [36]:
joblib.dump(importance_rf, os.path.join(dir_current, 'TSFEL-FS.pkl'), compress = 1)

['/ichec/home/users/mmarcato/dog_posture/jupyter/TSFEL-FS.pkl']

In [71]:
importance_threshold = importance_rf.loc[importance_rf['features'] == 'random','importance'].values[0]
type(importance_threshold)

numpy.float64

In [75]:
importance_rf.sort_values('importance', ascending = False, ignore_index = True, inplace = True)

In [73]:
importance_rf['keep'] = importance_rf['importance'] > importance_threshold


In [80]:
print('Number of features in original dataset: ', len(feat),
      '\nNumber of features (Variance): ', importance_rf.shape[0], 
      '\nNumber of features (Random threshold):', importance_rf['keep'].sum())

Number of features in original dataset:  3331 
Number of features (Variance):  3150 
Number of features (Random threshold): 2344


In [82]:
joblib.dump(importance_rf.loc[importance_rf['keep'], 'features'], os.path.join(dir_current, 'TSFEL-FEATURES.pkl'), compress = 1)

['/ichec/home/users/mmarcato/dog_posture/jupyter/TSFEL-FEATURES.pkl']

In [83]:
feat = joblib.load(os.path.join(dir_current, 'TSFEL-FEATURES.pkl'))

In [85]:
type(feat)

pandas.core.series.Series

In [21]:
# class cv_results:
#     # Storing Cross Validate results and the feature names
#     def __init__(self, cv, feat):
#         self.test_score = cv.test_score
#         self.train_score = cv.train_score
#         self.fit_time = cv.fit_time
#         self.score_time = cv.score_time
#         self.estimator = cv.estimator
#         self.feat = feat

# print(rf_cv)
# run = "TSFEL-SELECT.pkl"
# # save gs results to pickle file
# gs_path = os.path.join(dir_current, run)
# print(gs_path)
# joblib.dump(cv_results(rf_cv, feat), gs_path, compress = 1)

(7000,)