<h1> [Sleep-2-Learn] Part I - Before Learning </h1>

In [142]:
## Check environment
try:
    from google.colab import drive

    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Not connected to a GPU')
    else:
        print(gpu_info)

    from psutil import virtual_memory
    ram_gb = virtual_memory().total / 1e9
    print(
        'Runtime has {:.1f} GB of available RAM\n'.format(ram_gb))

    drive.mount('/content/drive', force_remount=True)
    COLAB = True
    LAB_PATH = '/content/drive/MyDrive/+Research/__SLEEP__Workspace/[Sleep-2-Learn]_dev/'
    print("Note: Runing Google CoLab")
except:
    print("Note: Not using Google CoLab")
    COLAB = False
    LAB_PATH = ''


Note: Not using Google CoLab


# Sleep-2-Learn: Before Learning (Feature Engineering)

## Import Libraries

In [143]:
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from collections import Counter

from tsfresh import select_features as SelectFeatures
from tsfresh import extract_features as ExtractFeatures

import warnings
from warnings import simplefilter


In [144]:
# Get rid of warnings
# First silence FutureWarning explicitly ...
simplefilter(action='ignore', category=FutureWarning)

# ... then install a blanket "ignore" filter for every warning category.
# NOTE(review): this second filter already covers FutureWarning and also hides
# deprecation notices raised by pandas/tsfresh for the rest of the session.
warnings.simplefilter("ignore")


## Setup

### Config

In [145]:
class Config:
    """User-tunable settings read by the processing classes below."""

    # Unit handed to pd.to_datetime when the raw time index is converted.
    time_unit = 's'

    # Frequency string handed to DataFrame.resample for every signal.
    timeinterval = '1S'

    # Epoch size: window length used when epoch ids are assigned to rows.
    epoch = 30

### Global Variables

In [146]:

class Var:
    """Mutable, notebook-wide state shared between the pipeline stages.

    The attributes are class-level containers; the pipeline classes read and
    fill them directly (e.g. ``Var.subject_ids``) instead of passing them around.
    """

    ## --- Data Preparation ---
    subject_ids = list()                    # subject identifiers found in the data folder
    subject_joined_data = dict()
    subject_time_series_dictionary = dict() # subject_id -> synchronized time series
    subject_formatted_time_series = dict()  # subject_id -> time series with epoch_id/subject_id

    ## --- Feature Extraction ---
    all_psg_df = pd.DataFrame()                 # combined epoch -> PSG label map
    all_extracted_features_df = pd.DataFrame()  # combined extracted features

    ## --- Feature Selection ---
    all_selected_features_df = pd.DataFrame()


### Constants

In [147]:
class Constants(object):
    """Derived constants and the directory layout used for inputs and outputs.

    Every path is a plain string built by concatenation on top of ``LAB_PATH``
    and ends with a ``/`` so that file names can be appended directly.
    """

    EPOCH_SIZE    = Config.epoch
    TIME_INTERVAL = 1       # Time interval in second -- no more use

    DATASET_PATH = LAB_PATH + 'data/'             # Raw data path (Walch's data)
    CROPPED_PATH = LAB_PATH + 'outputs/cropped/'  # Walch's cropped data
    FEATURE_PATH = LAB_PATH + 'outputs/features/' # Walch's features path

    # Path to construct feature by tsfresh
    FEATURE_TSFRESH = LAB_PATH + '_features/feature-set/tsfresh/epoch/'

    #--- Uncomment for testing -> set to small dataset ---
    #CROPPED_PATH      = LAB_PATH + 'outputs/small-set/'
    #FEATURE_TSFRESH   = LAB_PATH + '_features/feature-set/test/'

    # FEATURE_TSFRESH already ends with '/', so no separator is added here
    # (the previous '/size' produced '.../epoch//size30/').
    FEATURE_TS_EPOCH        = FEATURE_TSFRESH + 'size' + str(EPOCH_SIZE) + '/'

    DATA_PROCESSED_PATH     = FEATURE_TS_EPOCH + 'data-processing/'     # Path data processing feature by tsfresh
    FEATURE_EXTRACTED_PATH  = FEATURE_TS_EPOCH + 'extracted-features/'  # Path extracted feature by tsfresh
    FEATURE_SELECTED_PATH   = FEATURE_TS_EPOCH + 'selected-features/'   # Path selected feature by tsfresh

    


## Utilities

In [148]:
class Utils(object):
    """CSV export helpers shared by the processing, extraction and selection steps.

    All helpers are static: they are called on the class
    (``Utils.dataframe_to_csv(...)``) and keep no state.
    """

    @staticmethod
    def _ensure_parent_dir(csv_filename):
        """Create the directory that will contain ``csv_filename`` if it is missing.

        File-like targets and bare file names (no directory part) are left alone.
        """
        if not isinstance(csv_filename, (str, os.PathLike)):
            return
        parent_dir = os.path.dirname(os.fspath(csv_filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    @staticmethod
    def dict_to_csv(dictionary, csv_filename):
        """Write a dict of (possibly unequal-length) lists to CSV, one column per key.

        Args:
            dictionary: Mapping ``column name -> list of values``.
            csv_filename: Destination path; missing parent directories are created.
        """
        # orient='index' tolerates lists of different lengths (short ones are
        # padded); the transpose turns the keys back into columns.
        export_df = pd.DataFrame.from_dict(dictionary, orient='index')
        export_df = export_df.transpose()
        Utils._ensure_parent_dir(csv_filename)
        export_df.to_csv(csv_filename, index=True, header=True)

    @staticmethod
    def dataframe_to_csv(data_df, csv_filename):
        """Write ``data_df`` (anything ``pd.DataFrame`` accepts) to CSV with index and header.

        Args:
            data_df: DataFrame or DataFrame-compatible data.
            csv_filename: Destination path; missing parent directories are created.
        """
        df = pd.DataFrame(data_df)
        Utils._ensure_parent_dir(csv_filename)
        df.to_csv(csv_filename, index=True, header=True)

    # Check number of features extracted/selected from each raw data
    @staticmethod
    def check_number_of_features(csv_file, export_filename):
        """Count the feature columns of ``csv_file`` per raw signal and export the result.

        A column belongs to the first of ``accel_x``, ``accel_y``, ``accel_z``,
        ``heart_rate`` whose name occurs in the column name; other columns are
        ignored. One summary line per signal is printed, and a CSV is written
        whose columns are the signals, whose first row is the count and whose
        remaining rows are the matching column names.

        Args:
            csv_file: CSV whose header row lists the feature columns.
            export_filename: Destination path of the summary CSV.
        """
        feature_types = ['accel_x', 'accel_y', 'accel_z', 'heart_rate']
        features_by_type = {feature_type: [] for feature_type in feature_types}

        # nrows=0 reads only the header, so large feature files stay cheap.
        list_of_features = pd.read_csv(csv_file, nrows=0).columns.tolist()

        for feature in list_of_features:
            for feature_type in feature_types:
                if feature_type in feature:
                    features_by_type[feature_type].append(feature)
                    break

        features_dict = {}
        for feature_type in feature_types:
            matched_features = features_by_type[feature_type]
            check_numbers = feature_type + ' => ' + str(len(matched_features)) + ' features'
            print(check_numbers)
            # The count goes first so it becomes the first row of the export.
            features_dict[feature_type] = [len(matched_features)] + matched_features

        Utils.dict_to_csv(features_dict, export_filename)
   


## Module: Time Series Data

### Data Processing

In [149]:
class DataProcessing(object):
    """Load one subject's recordings and turn them into aligned time series.

    All helpers are static and are called on the class itself, e.g.
    ``DataProcessing.read_subject_data(path, subject_id)``.
    """

    # ----- Load data -----
    # Read sleep data from cropped data and parse to time series
    @staticmethod
    def read_subject_data(path, subject_id):
        """Read the motion, heart-rate and PSG files of one subject.

        Args:
            path: Directory prefix of the ``<id>_cleaned_*.out`` files. It is
                concatenated with the file name, so it must end with a separator.
            subject_id: Subject identifier used as file-name prefix.

        Returns:
            Tuple ``(motion_df, hr_df, psg_df)``, each indexed by its ``time``
            column. PSG labels equal to ``-1`` (unscored) are replaced by NaN.
        """
        # NOTE(review): the time column looks numeric (it is converted with a
        # unit in convert_timeseries_index) -- confirm parse_dates is needed.
        motion_df = pd.read_csv('{}{}_cleaned_motion.out'.format(
            path, subject_id), sep=' ', names=["time", "accel_x", "accel_y", "accel_z"],
                                parse_dates=['time'], index_col='time')

        hr_df = pd.read_csv('{}{}_cleaned_hr.out'.format(
            path, subject_id), sep=' ', names=["time", "heart_rate"],
            parse_dates=['time'], index_col='time')

        psg_df = pd.read_csv('{}{}_cleaned_psg.out'.format(
            path, subject_id), sep=' ', names=["time", "psg_label"],
            parse_dates=['time'], index_col='time')

        # replace psg '-1' to 'nan' (unscored psg data)
        psg_df.replace(-1, float('nan'), inplace=True)

        return (motion_df, hr_df, psg_df)

    # Convert index to time series index (timestamp)
    @staticmethod
    def convert_timeseries_index(dataframe, unit=Config.time_unit):
        """Replace the index of ``dataframe`` by timestamps and return the frame.

        The frame is modified in place: its index is passed through
        ``pd.to_datetime`` with the given ``unit``.
        """
        dataframe.index = pd.to_datetime(dataframe.index, unit=unit)
        return dataframe

    # Process motion to time series data
    # Resample (downsampling) from ~50Hz to specific timeinterval (1-second)
    # Downsampling method -> average between row
    @staticmethod
    def process_motion_timeseries_data(df, timeinterval=Config.timeinterval):
        """Resample the motion frame to ``timeinterval`` using the mean of each bin."""
        df = DataProcessing.convert_timeseries_index(df)
        return df.resample(timeinterval).mean()

    # Process heart rate (bpm) to time series data
    # Resample (upsampling) from ~5-second to specific timeinterval (1-second)
    # Upsampling method -> linear interpolation
    @staticmethod
    def process_hr_timeseries_data(df, timeinterval=Config.timeinterval):
        """Resample heart rate to ``timeinterval`` and fill the gaps linearly.

        The interpolated values are cast to ``int`` (truncated, not rounded).
        """
        df = DataProcessing.convert_timeseries_index(df)
        hr_df = df.resample(timeinterval).mean()
        hr_df = hr_df.interpolate(method='linear').astype('int')
        return hr_df

    # Process PSG label to time series data
    # Resample (upsampling) from ~30-second to specific timeinterval (1-second)
    # Upsampling method -> copy from the nearest value (this does not change the original data)
    @staticmethod
    def process_psg_timeseries_data(df, timeinterval=Config.timeinterval):
        """Drop unscored PSG rows and upsample the labels with nearest-value fill."""
        df = DataProcessing.convert_timeseries_index(df)
        df = df.dropna() # Drop unscored psg
        psg_df = df.resample(timeinterval).nearest()
        return psg_df

    # Synchronize data that already format to specific timeinterval (1-second) time series
    @staticmethod
    def synchronize_data(motion_df, hr_df, psg_df):
        """Join the three resampled signals on their timestamps.

        Motion and heart rate are outer-joined and gap-filled; the result is
        inner-joined with the PSG labels, so only scored timestamps remain.

        Returns:
            DataFrame with a fresh integer index; the timestamps are kept in a
            ``timestamp`` column.
        """
        # Merge motion and hr then fill the missing data
        synced_df = pd.merge(motion_df, hr_df,
                             how='outer', left_index=True, right_index=True)
        # Fill missing data: forward fill from above, then backward fill if the 1st row is NaN
        # (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling)
        synced_df = synced_df.ffill().bfill()
        synced_df = pd.merge(synced_df, psg_df, how='inner',
                             left_index=True, right_index=True)
        # Extract timestamp to column and reset index to normal index value
        synced_df['timestamp'] = synced_df.index
        synced_df.reset_index(drop=True, inplace=True)
        return synced_df



### Making TimeSeries

In [150]:
class TimeSeries(object):
    """Build per-subject, epoch-labelled time series for feature extraction.

    All helpers are static and communicate through the containers on ``Var``.
    """

    # Join subject time series data and map to dictionary
    @staticmethod
    def map_joined_data_to_dict(path):
        """Discover the subjects in ``path`` and build their synchronized series.

        The subject id is the part of each file name before the first ``_``.
        Side effects: extends ``Var.subject_ids``, writes
        ``<id>_joined_time_series_data.csv`` under
        ``Constants.DATA_PROCESSED_PATH`` and fills
        ``Var.subject_time_series_dictionary``.

        Args:
            path: Directory prefix (ending with a separator) of the cropped data.
        """
        for filename in os.listdir(path):

            terms = filename.split('_')
            subject_id = terms[0]

            # '.DS' (from '.DS_Store') and 'desktop.ini' are OS artefacts, not subjects.
            if (subject_id not in Var.subject_ids) and (subject_id != '.DS') and (subject_id != 'desktop.ini'):
                Var.subject_ids.append(subject_id)

        # Sort the subject list
        sorted_subject_ids = sorted(Var.subject_ids)

        print("Total subjects: ", len(sorted_subject_ids))
        print(sorted_subject_ids)

        for subject_id in tqdm(sorted_subject_ids):
            motion_df, hr_df, psg_df = DataProcessing.read_subject_data(
                path, subject_id)

            # Process time series data for each data set
            motion_df = DataProcessing.process_motion_timeseries_data(
                motion_df)
            hr_df = DataProcessing.process_hr_timeseries_data(hr_df)
            psg_df = DataProcessing.process_psg_timeseries_data(psg_df)

            joined_data_df = DataProcessing.synchronize_data(motion_df, hr_df, psg_df)

            # Export csv of each subject
            subject_filename = Constants.DATA_PROCESSED_PATH + \
                str(subject_id) + '_joined_time_series_data.csv'
            Utils.dataframe_to_csv(joined_data_df, subject_filename)

            Var.subject_time_series_dictionary[subject_id] = joined_data_df

    # Convert timestamp to time in seconds -> combine to dataframe
    # Note: no more use
    @staticmethod
    def timestamp_to_timesecond(df):
        """Add a ``time_sec`` column: seconds since midnight taken from the index.

        ``df`` is modified in place and returned.
        """
        df['time_sec'] = df.index
        df_time = pd.to_datetime(df['time_sec'])
        df['time_sec'] = (df_time.dt.hour*60+df_time.dt.minute) * 60 + df_time.dt.second

        return df

    ### --- Formatting data for feature extraction ---
    @staticmethod
    def format_time_series_data():
        """Tag every row of every subject with an ``epoch_id`` and ``subject_id``.

        Rows are bucketed by their index value: bucket ``k`` holds the index
        values in ``[k * Config.epoch, (k + 1) * Config.epoch)``. Every
        non-empty bucket gets the next value of a counter that keeps running
        across subjects, so epoch ids are unique over the whole data set.

        Side effects: writes ``<id>_formated_time_series_data.csv`` under
        ``Constants.DATA_PROCESSED_PATH`` and fills
        ``Var.subject_formatted_time_series``.
        """
        print('+-+-+-+- Format Time Series Data +-+-+-+')
        epoch_id = 0

        # Preparing for feature extraction: add epoch_id based on epoch size
        # to assign id to each epoch
        for subject_id, df in tqdm(Var.subject_time_series_dictionary.items()):

            print("---------------", subject_id, "-----------------")
            print("--- check if nan exist ---")
            print(df.shape)
            df = df.dropna()
            print("--- cleaned data ---")
            print(df.shape)

            #df = TimeSeries.timestamp_to_timesecond(df)

            # One pass over the frame: groupby yields only the non-empty
            # buckets, in ascending order, with rows in their original order.
            epoch_buckets = np.asarray(df.index) // Config.epoch
            epoch_frames = []

            # Assign epoch id based on epoch size
            for bucket, rows_in_epoch_df in df.groupby(epoch_buckets, sort=True):
                if bucket < 0:
                    # Windows start at index 0; negative index values never matched.
                    continue

                epoch_frames.append(rows_in_epoch_df.assign(
                    epoch_id=epoch_id, subject_id=subject_id))
                epoch_id += 1

            # Concatenate once instead of growing a frame inside the loop.
            if epoch_frames:
                new_df = pd.concat(epoch_frames, axis=0)
            else:
                # No valid rows: keep the expected columns so later steps can
                # still address them.
                new_df = pd.DataFrame(
                    columns=list(df.columns) + ["epoch_id", "subject_id"])

            # Export csv of each subject
            subject_filename = Constants.DATA_PROCESSED_PATH + str(subject_id) + '_formated_time_series_data.csv'
            Utils.dataframe_to_csv(new_df, subject_filename)

            Var.subject_formatted_time_series[subject_id] = new_df

    # Create psg map of each epoch for feature selection and final feature-file
    @staticmethod
    def map_subject_id_psg_from_epoch(subject_id, df):
        """Build and export the ``epoch_id -> PSG label`` table of one subject.

        For every epoch the most frequent ``psg_label`` and the most frequent
        ``timestamp`` are kept; on equal counts ``Counter.most_common`` returns
        the value encountered first.

        Side effect: writes ``<id>_psg_map_epoch.csv`` under
        ``Constants.DATA_PROCESSED_PATH``.

        Args:
            subject_id: Identifier stored in the ``subject_id`` column.
            df: Formatted time series with ``epoch_id``, ``psg_label`` and
                ``timestamp`` columns.

        Returns:
            DataFrame indexed by epoch id with the columns ``subject_id``,
            ``timestamp``, ``psg_label`` and ``epoch_id``, sorted by ``epoch_id``.
        """
        subjects_epoch_to_psg_map = {}

        # Get psg of each epoch (one grouped pass instead of filtering the
        # whole frame twice per epoch)
        for epoch_id, rows_in_epoch in df.groupby('epoch_id', sort=True):
            psg_label_in_epoch = Counter(rows_in_epoch['psg_label']).most_common(1)[0][0]
            timestamp = Counter(rows_in_epoch['timestamp']).most_common(1)[0][0]

            subjects_epoch_to_psg_map[epoch_id] = [subject_id, timestamp, psg_label_in_epoch, epoch_id]

        # Export csv of each subject psg map
        print("----- ID: [", str(subject_id),"] did map PSG to each session ---")
        ## --- psg ---
        subject_psg_map_epoch_df = pd.DataFrame.from_dict(subjects_epoch_to_psg_map, orient='index', columns=['subject_id', 'timestamp', 'psg_label', 'epoch_id'])
        subject_psg_map_epoch_df = subject_psg_map_epoch_df.sort_values(by=['epoch_id'])
        psg_filename = Constants.DATA_PROCESSED_PATH + str(subject_id) + '_psg_map_epoch.csv'
        Utils.dataframe_to_csv(subject_psg_map_epoch_df, psg_filename)
        print("  PSG map df: ", subject_psg_map_epoch_df.shape)

        return subject_psg_map_epoch_df


## Module: Feature Engineering

### Feature Extraction

In [151]:
class Extraction(object):
    """Per-subject tsfresh feature extraction."""

    # Extract features using tsfresh (each subjects)
    @staticmethod
    def extract_features_each_subject(dictionary):
        """Extract tsfresh features for every subject and combine the results.

        For each subject the epoch -> PSG map is built, the non-signal columns
        are dropped, features are extracted per ``epoch_id``, feature columns
        containing NaN are removed and the result is exported as
        ``<id>_extracted_features.csv`` under ``Constants.FEATURE_EXTRACTED_PATH``.

        Args:
            dictionary: Mapping ``subject_id -> formatted time series`` with the
                columns ``epoch_id``, ``subject_id``, ``psg_label`` and
                ``timestamp`` next to the signal columns.

        Returns:
            Tuple ``(combined_subjects, combined_psg_epoch)``: the per-subject
            feature frames and the per-subject PSG maps stacked row-wise. Both
            are empty DataFrames when ``dictionary`` is empty.
        """
        extracted_frames = []
        psg_epoch_frames = []

        # Extract features from each subject
        for subject_id, formatted_time_series_df in tqdm(dictionary.items()):

            print("---------------------------------------------------")
            print("======================", subject_id, "======================")
            print("---------------------------------------------------")

            # Map psg of valid epoch before feature extraction process
            psg_map_epoch_df = TimeSeries.map_subject_id_psg_from_epoch(subject_id, formatted_time_series_df)

            # Drop psg and non-feature data to extract feature
            no_psg_df = formatted_time_series_df.drop(
                columns=["psg_label", "subject_id", "timestamp"]).dropna()

            no_psg_df["epoch_id"] = no_psg_df["epoch_id"].astype(str)

            # Extract feature by tsfresh
            # required input: dataframe of data to extract features, column_id (epoch_id)
            extracted_features = ExtractFeatures(no_psg_df,
                                                 column_id="epoch_id")

            print('_____ Before Clean N/A columns _____')
            print(extracted_features.shape)
            # NOTE(review): columns are dropped per subject, so subjects can end
            # up with different feature sets; the row-wise combination below
            # then holds NaN for the features a subject lacks.
            extracted_features = extracted_features.dropna(axis='columns')

            print('_____ After Clean N/A columns _____')
            print(extracted_features.shape)

            # Sort index properly (epoch ids were cast to str for tsfresh)
            extracted_features.index = extracted_features.index.astype(int)
            extracted_features = extracted_features.sort_index()

            # Export csv of each subject
            subject_filename = Constants.FEATURE_EXTRACTED_PATH + str(subject_id) + '_extracted_features.csv'
            Utils.dataframe_to_csv(extracted_features, subject_filename)

            # Collect the extracted features of each subject for the next process (selecting feature)
            extracted_frames.append(extracted_features)
            psg_epoch_frames.append(psg_map_epoch_df)

        # Concatenate once at the end: DataFrame.append was removed in
        # pandas 2.0 and re-copied the accumulated frame on every iteration.
        combined_subjects = pd.concat(extracted_frames) if extracted_frames else pd.DataFrame()
        combined_psg_epoch = pd.concat(psg_epoch_frames) if psg_epoch_frames else pd.DataFrame()

        return combined_subjects, combined_psg_epoch
  


### Feature Selection

In [152]:
class Selection(object):
    """tsfresh feature selection over the combined features of all subjects."""

    # Select features from combined extracted features of subjects
    @staticmethod
    def select_features_all_subjects(extracted_df, psg_epoch_df):
        """Select the relevant features and attach the PSG information.

        Args:
            extracted_df: Combined extracted features.
            psg_epoch_df: Combined epoch -> PSG map; its ``psg_label`` column is
                the selection target and its ``subject_id`` column is used to
                split the result per subject.

        Returns:
            ``psg_epoch_df`` and the selected features joined column-wise
            (aligned on the index). The same frame is also exported per subject
            through ``split_subject_selected_features``.
        """
        print("--------------------------------------------------")
        print("=============== Selecting Features ===============")
        print("--------------------------------------------------")

        # Get target data to select relevant features
        target_label = psg_epoch_df['psg_label']

        print('_____ Before Select Features _____')
        print(extracted_df.shape)

        # Perform feature selection by tsfresh
        selected_features_df = SelectFeatures(extracted_df, target_label)

        print('_____ After Select Features _____')
        print(selected_features_df.shape)

        # Combine back subject_id and psg_label after selected features
        selected_features_with_psg = pd.concat([psg_epoch_df, selected_features_df], axis=1)
        print('  After combined psg - selected features: ', selected_features_with_psg.shape)

        # Separate selected features of each subject and export csv
        Selection.split_subject_selected_features(selected_features_with_psg)

        return selected_features_with_psg

    # Split seleted features by subject_id
    @staticmethod
    def split_subject_selected_features(df):
        """Export one ``<id>_selected_features.csv`` per subject in ``Var.subject_ids``.

        Files are written under ``Constants.FEATURE_SELECTED_PATH``.

        Raises:
            KeyError: If a subject listed in ``Var.subject_ids`` has no rows in ``df``.
        """
        # The grouping does not depend on the subject, so build it once.
        subjects_group = df.groupby(df.subject_id)

        for subject_id in Var.subject_ids:
            subject_selected_df = subjects_group.get_group(subject_id)

            subject_selected_filename = Constants.FEATURE_SELECTED_PATH + str(subject_id) + '_selected_features.csv'
            Utils.dataframe_to_csv(subject_selected_df, subject_selected_filename)


## Feature Engineering Method

### Prepare data

In [153]:
# Directory prefix of the cropped per-subject recordings.
data_path = Constants.CROPPED_PATH

# Discover the subjects in data_path, build each subject's synchronized time
# series, export it as CSV and cache it in Var.subject_time_series_dictionary.
TimeSeries.map_joined_data_to_dict(data_path)


Total subjects:  31
['1066528', '1360686', '1449548', '1455390', '1818471', '2598705', '2638030', '3509524', '3997827', '4018081', '4314139', '4426783', '46343', '5132496', '5383425', '5498603', '5797046', '6220552', '759667', '7749105', '781756', '8000685', '8173033', '8258170', '844359', '8530312', '8686948', '8692923', '9106476', '9618981', '9961348']


100%|██████████| 31/31 [00:33<00:00,  1.09s/it]


##### Check original data

In [171]:
# Original data: re-read the files of subject '46343' as loaded from disk
# (before any resampling) for a manual sanity check.
motion_check, hr_check, psg_check = DataProcessing.read_subject_data(data_path, '46343')


In [172]:
# motion: accel_x / accel_y / accel_z samples indexed by time
motion_check

Unnamed: 0_level_0,accel_x,accel_y,accel_z
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.000721,-0.234650,0.905975,0.362747
0.017162,-0.231232,0.893265,0.371613
0.037183,-0.227814,0.915848,0.369049
0.057108,-0.240524,0.919159,0.352890
0.076929,-0.240448,0.889175,0.350143
...,...,...,...
16979.905934,-0.442413,-0.527573,0.723450
16979.925889,-0.441925,-0.527573,0.721985
16979.945849,-0.441437,-0.526108,0.721985
16979.965927,-0.443405,-0.525116,0.723434


In [188]:
# First 50 rows of the motion samples
motion_check.head(50)

Unnamed: 0_level_0,accel_x,accel_y,accel_z
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.000721,-0.23465,0.905975,0.362747
0.017162,-0.231232,0.893265,0.371613
0.037183,-0.227814,0.915848,0.369049
0.057108,-0.240524,0.919159,0.35289
0.076929,-0.240448,0.889175,0.350143
0.097206,-0.230652,0.898056,0.354538
0.11689,-0.24437,0.888184,0.349152
0.136826,-0.245895,0.891678,0.355942
0.171897,-0.246414,0.904938,0.355866
0.176799,-0.249878,0.899094,0.362213


In [187]:
# Last 50 rows of the motion samples
motion_check.tail(50)

Unnamed: 0_level_0,accel_x,accel_y,accel_z
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16979.007057,-0.441925,-0.527573,0.72345
16979.027061,-0.442902,-0.527084,0.721985
16979.047005,-0.442413,-0.526092,0.722946
16979.066979,-0.442917,-0.524628,0.722946
16979.08705,-0.441925,-0.526092,0.72345
16979.107146,-0.44194,-0.526566,0.725388
16979.127233,-0.441437,-0.526108,0.722473
16979.14722,-0.441925,-0.525604,0.722946
16979.167381,-0.441925,-0.524139,0.721481
16979.187068,-0.443893,-0.525116,0.722458


In [175]:
# heart rate: heart_rate values indexed by time
hr_check

Unnamed: 0_level_0,heart_rate
time,Unnamed: 1_level_1
8.47840,97.0
13.47840,95.0
18.47840,96.0
19.47840,95.0
24.47840,95.0
...,...
16954.47226,73.0
16959.47229,72.0
16964.47229,74.0
16970.47229,75.0


In [186]:
# First 50 rows of the heart-rate samples
hr_check.head(50)

Unnamed: 0_level_0,heart_rate
time,Unnamed: 1_level_1
8.4784,97.0
13.4784,95.0
18.4784,96.0
19.4784,95.0
24.4784,95.0
29.4784,95.0
34.4784,95.0
43.4784,86.0
44.4784,86.0
53.4784,85.0


In [185]:
# Last 50 rows of the heart-rate samples
hr_check.tail(50)

Unnamed: 0_level_0,heart_rate
time,Unnamed: 1_level_1
16726.47226,75.0
16731.47229,76.0
16736.47229,79.0
16741.47229,79.0
16746.47229,77.0
16751.47229,76.0
16757.47229,73.0
16763.47229,71.0
16767.47229,69.0
16776.47229,72.0


In [178]:
# PSG label: psg_label indexed by time; NaN marks unscored rows
# (read_subject_data replaces -1 by NaN)
psg_check

Unnamed: 0_level_0,psg_label
time,Unnamed: 1_level_1
0.0,
30.0,
60.0,
90.0,
120.0,
...,...
16830.0,0.0
16860.0,0.0
16890.0,0.0
16920.0,0.0


In [184]:
# First 50 rows of the PSG labels
psg_check.head(50)

Unnamed: 0_level_0,psg_label
time,Unnamed: 1_level_1
0.0,
30.0,
60.0,
90.0,
120.0,
150.0,
180.0,
210.0,
240.0,
270.0,


In [180]:
# Last 50 rows of the PSG labels
psg_check.tail(50)

Unnamed: 0_level_0,psg_label
time,Unnamed: 1_level_1
16380.0,0.0
16410.0,0.0
16440.0,0.0
16470.0,0.0
16500.0,0.0
16530.0,0.0
16560.0,0.0
16590.0,0.0
16620.0,0.0
16650.0,0.0


##### Check synced time series data

In [168]:
# Check synced data: the synchronized frame of subject '46343' that
# TimeSeries.map_joined_data_to_dict stored earlier
synced_check = Var.subject_time_series_dictionary['46343']

In [183]:
# Display the synchronized frame (signals, psg_label and timestamp per row)
synced_check

Unnamed: 0,accel_x,accel_y,accel_z,heart_rate,psg_label,timestamp
0,-0.424678,0.922138,-0.094113,90.0,0.0,1970-01-01 00:06:30
1,-0.448256,0.816442,-0.228871,90.0,0.0,1970-01-01 00:06:31
2,-0.463808,0.767302,-0.429698,90.0,0.0,1970-01-01 00:06:32
3,-0.494360,0.798880,-0.076882,90.0,0.0,1970-01-01 00:06:33
4,-0.416413,0.860588,-0.092435,91.0,0.0,1970-01-01 00:06:34
...,...,...,...,...,...,...
16556,-0.441267,-0.525272,0.724310,73.0,0.0,1970-01-01 04:42:26
16557,-0.441876,-0.525352,0.724083,73.0,0.0,1970-01-01 04:42:27
16558,-0.442227,-0.525543,0.723603,73.0,0.0,1970-01-01 04:42:28
16559,-0.441982,-0.525540,0.723801,73.0,0.0,1970-01-01 04:42:29


In [181]:
# First 50 rows of the synchronized frame
synced_check.head(50)

Unnamed: 0,accel_x,accel_y,accel_z,heart_rate,psg_label,timestamp
0,-0.424678,0.922138,-0.094113,90.0,0.0,1970-01-01 00:06:30
1,-0.448256,0.816442,-0.228871,90.0,0.0,1970-01-01 00:06:31
2,-0.463808,0.767302,-0.429698,90.0,0.0,1970-01-01 00:06:32
3,-0.49436,0.79888,-0.076882,90.0,0.0,1970-01-01 00:06:33
4,-0.416413,0.860588,-0.092435,91.0,0.0,1970-01-01 00:06:34
5,-0.502803,0.736835,-0.418127,92.0,0.0,1970-01-01 00:06:35
6,-0.544052,0.644129,-0.527199,93.0,0.0,1970-01-01 00:06:36
7,-0.574588,0.426891,-0.686528,94.0,0.0,1970-01-01 00:06:37
8,-0.566095,0.50652,-0.649637,95.0,0.0,1970-01-01 00:06:38
9,-0.5819,0.530933,-0.616732,96.0,0.0,1970-01-01 00:06:39


In [182]:
# Last 50 rows of the synchronized frame
synced_check.tail(50)

Unnamed: 0,accel_x,accel_y,accel_z,heart_rate,psg_label,timestamp
16511,-0.442028,-0.525613,0.723343,77.0,0.0,1970-01-01 04:41:41
16512,-0.442004,-0.526078,0.722974,77.0,0.0,1970-01-01 04:41:42
16513,-0.44157,-0.525652,0.723532,77.0,0.0,1970-01-01 04:41:43
16514,-0.441573,-0.525554,0.723404,77.0,0.0,1970-01-01 04:41:44
16515,-0.44186,-0.526223,0.723386,77.0,0.0,1970-01-01 04:41:45
16516,-0.442304,-0.526015,0.723586,77.0,0.0,1970-01-01 04:41:46
16517,-0.442224,-0.525976,0.72345,77.0,0.0,1970-01-01 04:41:47
16518,-0.441274,-0.525687,0.724028,77.0,0.0,1970-01-01 04:41:48
16519,-0.441922,-0.525718,0.723675,77.0,0.0,1970-01-01 04:41:49
16520,-0.442331,-0.526046,0.723344,76.0,0.0,1970-01-01 04:41:50


### Format data - get valid epoch

In [155]:
# Format time series data for feature extraction: adds an epoch_id (windows of
# Config.epoch rows) and the subject_id to every row, exports one CSV per
# subject and fills Var.subject_formatted_time_series.
TimeSeries.format_time_series_data()

+-+-+-+- Format Time Series Data +-+-+-+


  0%|          | 0/31 [00:00<?, ?it/s]

--------------- 1066528 -----------------
--- check nan data ---
(28484, 6)
--- clean data ---
(28484, 6)


  3%|▎         | 1/31 [00:01<00:31,  1.04s/it]

--------------- 1360686 -----------------
--- check nan data ---
(28411, 6)
--- clean data ---
(28411, 6)


  6%|▋         | 2/31 [00:02<00:29,  1.03s/it]

--------------- 1449548 -----------------
--- check nan data ---
(28561, 6)
--- clean data ---
(28561, 6)


 10%|▉         | 3/31 [00:03<00:28,  1.03s/it]

--------------- 1455390 -----------------
--- check nan data ---
(28621, 6)
--- clean data ---
(28621, 6)


 13%|█▎        | 4/31 [00:04<00:28,  1.07s/it]

--------------- 1818471 -----------------
--- check nan data ---
(28711, 6)
--- clean data ---
(28711, 6)


 16%|█▌        | 5/31 [00:05<00:27,  1.06s/it]

--------------- 2598705 -----------------
--- check nan data ---
(28591, 6)
--- clean data ---
(28591, 6)


 19%|█▉        | 6/31 [00:06<00:26,  1.05s/it]

--------------- 2638030 -----------------
--- check nan data ---
(28411, 6)
--- clean data ---
(28411, 6)


 23%|██▎       | 7/31 [00:07<00:25,  1.07s/it]

--------------- 3509524 -----------------
--- check nan data ---
(12451, 6)
--- clean data ---
(12451, 6)


 26%|██▌       | 8/31 [00:07<00:19,  1.16it/s]

--------------- 3997827 -----------------
--- check nan data ---
(28711, 6)
--- clean data ---
(28711, 6)


 29%|██▉       | 9/31 [00:08<00:20,  1.06it/s]

--------------- 4018081 -----------------
--- check nan data ---
(15241, 6)
--- clean data ---
(15241, 6)


 32%|███▏      | 10/31 [00:09<00:17,  1.23it/s]

--------------- 4314139 -----------------
--- check nan data ---
(28801, 6)
--- clean data ---
(28801, 6)


 35%|███▌      | 11/31 [00:10<00:17,  1.12it/s]

--------------- 4426783 -----------------
--- check nan data ---
(29341, 6)
--- clean data ---
(29341, 6)


 39%|███▊      | 12/31 [00:11<00:18,  1.04it/s]

--------------- 46343 -----------------
--- check nan data ---
(16561, 6)
--- clean data ---
(16561, 6)


 42%|████▏     | 13/31 [00:12<00:15,  1.18it/s]

--------------- 5132496 -----------------
--- check nan data ---
(13891, 6)
--- clean data ---
(13891, 6)


 45%|████▌     | 14/31 [00:12<00:12,  1.36it/s]

--------------- 5383425 -----------------
--- check nan data ---
(29281, 6)
--- clean data ---
(29281, 6)


 48%|████▊     | 15/31 [00:13<00:13,  1.20it/s]

--------------- 5498603 -----------------
--- check nan data ---
(22291, 6)
--- clean data ---
(22291, 6)


 52%|█████▏    | 16/31 [00:14<00:12,  1.20it/s]

--------------- 5797046 -----------------
--- check nan data ---
(28141, 6)
--- clean data ---
(28141, 6)


 55%|█████▍    | 17/31 [00:15<00:12,  1.11it/s]

--------------- 6220552 -----------------
--- check nan data ---
(28591, 6)
--- clean data ---
(28591, 6)


 58%|█████▊    | 18/31 [00:16<00:12,  1.05it/s]

--------------- 759667 -----------------
--- check nan data ---
(14191, 6)
--- clean data ---
(14191, 6)


 61%|██████▏   | 19/31 [00:17<00:09,  1.22it/s]

--------------- 7749105 -----------------
--- check nan data ---
(27982, 6)
--- clean data ---
(27982, 6)


 65%|██████▍   | 20/31 [00:18<00:09,  1.14it/s]

--------------- 781756 -----------------
--- check nan data ---
(29371, 6)
--- clean data ---
(29371, 6)


 68%|██████▊   | 21/31 [00:19<00:09,  1.06it/s]

--------------- 8000685 -----------------
--- check nan data ---
(28711, 6)
--- clean data ---
(28711, 6)


 71%|███████   | 22/31 [00:20<00:08,  1.01it/s]

--------------- 8173033 -----------------
--- check nan data ---
(28621, 6)
--- clean data ---
(28621, 6)


 74%|███████▍  | 23/31 [00:21<00:08,  1.03s/it]

--------------- 8258170 -----------------
--- check nan data ---
(29101, 6)
--- clean data ---
(29101, 6)


 77%|███████▋  | 24/31 [00:22<00:07,  1.04s/it]

--------------- 844359 -----------------
--- check nan data ---
(26881, 6)
--- clean data ---
(26881, 6)


 81%|████████  | 25/31 [00:23<00:06,  1.03s/it]

--------------- 8530312 -----------------
--- check nan data ---
(28441, 6)
--- clean data ---
(28441, 6)


 84%|████████▍ | 26/31 [00:24<00:05,  1.05s/it]

--------------- 8686948 -----------------
--- check nan data ---
(28621, 6)
--- clean data ---
(28621, 6)


 87%|████████▋ | 27/31 [00:25<00:04,  1.05s/it]

--------------- 8692923 -----------------
--- check nan data ---
(28021, 6)
--- clean data ---
(28021, 6)


 90%|█████████ | 28/31 [00:26<00:03,  1.06s/it]

--------------- 9106476 -----------------
--- check nan data ---
(28771, 6)
--- clean data ---
(28771, 6)


 94%|█████████▎| 29/31 [00:27<00:02,  1.05s/it]

--------------- 9618981 -----------------
--- check nan data ---
(28291, 6)
--- clean data ---
(28291, 6)


 97%|█████████▋| 30/31 [00:29<00:01,  1.06s/it]

--------------- 9961348 -----------------
--- check nan data ---
(21541, 6)
--- clean data ---
(21541, 6)


100%|██████████| 31/31 [00:29<00:00,  1.04it/s]


#### Check formatted data

In [156]:
# Formatted time series data -> specify epoch_id for each epoch
# (first 50 rows of subject '46343')
Var.subject_formatted_time_series['46343'].head(50)


Unnamed: 0,accel_x,accel_y,accel_z,heart_rate,psg_label,timestamp,epoch_id,subject_id
0,-0.424678,0.922138,-0.094113,90.0,0.0,1970-01-01 00:06:30,10489,46343
1,-0.448256,0.816442,-0.228871,90.0,0.0,1970-01-01 00:06:31,10489,46343
2,-0.463808,0.767302,-0.429698,90.0,0.0,1970-01-01 00:06:32,10489,46343
3,-0.49436,0.79888,-0.076882,90.0,0.0,1970-01-01 00:06:33,10489,46343
4,-0.416413,0.860588,-0.092435,91.0,0.0,1970-01-01 00:06:34,10489,46343
5,-0.502803,0.736835,-0.418127,92.0,0.0,1970-01-01 00:06:35,10489,46343
6,-0.544052,0.644129,-0.527199,93.0,0.0,1970-01-01 00:06:36,10489,46343
7,-0.574588,0.426891,-0.686528,94.0,0.0,1970-01-01 00:06:37,10489,46343
8,-0.566095,0.50652,-0.649637,95.0,0.0,1970-01-01 00:06:38,10489,46343
9,-0.5819,0.530933,-0.616732,96.0,0.0,1970-01-01 00:06:39,10489,46343


In [157]:
# Last 50 rows of the formatted time series of subject '46343'
Var.subject_formatted_time_series['46343'].tail(50)


Unnamed: 0,accel_x,accel_y,accel_z,heart_rate,psg_label,timestamp,epoch_id,subject_id
16511,-0.442028,-0.525613,0.723343,77.0,0.0,1970-01-01 04:41:41,11039,46343
16512,-0.442004,-0.526078,0.722974,77.0,0.0,1970-01-01 04:41:42,11039,46343
16513,-0.44157,-0.525652,0.723532,77.0,0.0,1970-01-01 04:41:43,11039,46343
16514,-0.441573,-0.525554,0.723404,77.0,0.0,1970-01-01 04:41:44,11039,46343
16515,-0.44186,-0.526223,0.723386,77.0,0.0,1970-01-01 04:41:45,11039,46343
16516,-0.442304,-0.526015,0.723586,77.0,0.0,1970-01-01 04:41:46,11039,46343
16517,-0.442224,-0.525976,0.72345,77.0,0.0,1970-01-01 04:41:47,11039,46343
16518,-0.441274,-0.525687,0.724028,77.0,0.0,1970-01-01 04:41:48,11039,46343
16519,-0.441922,-0.525718,0.723675,77.0,0.0,1970-01-01 04:41:49,11039,46343
16520,-0.442331,-0.526046,0.723344,76.0,0.0,1970-01-01 04:41:50,11039,46343


### Extract features

In [158]:
# Run tsfresh extraction for every formatted subject; keeps the combined
# feature frame and the combined epoch -> PSG map on Var for the selection step.
Var.all_extracted_features_df, Var.all_psg_df = Extraction.extract_features_each_subject(Var.subject_formatted_time_series)


  0%|          | 0/31 [00:00<?, ?it/s]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 1066528 ] did map PSG to each session ---
  PSG map df:  (950, 4)


Feature Extraction: 100%|██████████| 20/20 [00:31<00:00,  1.60s/it]


_____ Before Clean N/A columns _____
(950, 3148)
_____ After Clean N/A columns _____
(950, 1388)


  3%|▎         | 1/31 [00:36<18:01, 36.04s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 1360686 ] did map PSG to each session ---
  PSG map df:  (948, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.63s/it]


_____ Before Clean N/A columns _____
(948, 3148)
_____ After Clean N/A columns _____
(948, 868)


  6%|▋         | 2/31 [01:12<17:25, 36.06s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 1449548 ] did map PSG to each session ---
  PSG map df:  (953, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


_____ Before Clean N/A columns _____
(953, 3148)
_____ After Clean N/A columns _____
(953, 868)


 10%|▉         | 3/31 [01:48<16:49, 36.05s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 1455390 ] did map PSG to each session ---
  PSG map df:  (955, 4)


Feature Extraction: 100%|██████████| 20/20 [00:31<00:00,  1.58s/it]


_____ Before Clean N/A columns _____
(955, 3148)
_____ After Clean N/A columns _____
(955, 868)


 13%|█▎        | 4/31 [02:23<16:01, 35.62s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 1818471 ] did map PSG to each session ---
  PSG map df:  (958, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


_____ Before Clean N/A columns _____
(958, 3148)
_____ After Clean N/A columns _____
(958, 868)


 16%|█▌        | 5/31 [02:58<15:26, 35.64s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 2598705 ] did map PSG to each session ---
  PSG map df:  (954, 4)


Feature Extraction: 100%|██████████| 20/20 [00:31<00:00,  1.55s/it]


_____ Before Clean N/A columns _____
(954, 3148)
_____ After Clean N/A columns _____
(954, 868)


 19%|█▉        | 6/31 [03:33<14:45, 35.42s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 2638030 ] did map PSG to each session ---
  PSG map df:  (948, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.63s/it]


_____ Before Clean N/A columns _____
(948, 3148)
_____ After Clean N/A columns _____
(948, 868)


 23%|██▎       | 7/31 [04:10<14:16, 35.69s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 3509524 ] did map PSG to each session ---
  PSG map df:  (416, 4)


Feature Extraction: 100%|██████████| 20/20 [00:15<00:00,  1.27it/s]


_____ Before Clean N/A columns _____
(416, 3148)
_____ After Clean N/A columns _____
(416, 868)


 26%|██▌       | 8/31 [04:27<11:26, 29.87s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 3997827 ] did map PSG to each session ---
  PSG map df:  (958, 4)


Feature Extraction: 100%|██████████| 20/20 [00:31<00:00,  1.58s/it]


_____ Before Clean N/A columns _____
(958, 3148)
_____ After Clean N/A columns _____
(958, 868)


 29%|██▉       | 9/31 [05:02<11:33, 31.54s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 4018081 ] did map PSG to each session ---
  PSG map df:  (509, 4)


Feature Extraction: 100%|██████████| 20/20 [00:17<00:00,  1.11it/s]


_____ Before Clean N/A columns _____
(509, 3148)
_____ After Clean N/A columns _____
(509, 868)


 32%|███▏      | 10/31 [05:22<09:48, 28.00s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 4314139 ] did map PSG to each session ---
  PSG map df:  (961, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


_____ Before Clean N/A columns _____
(961, 3148)
_____ After Clean N/A columns _____
(961, 868)


 35%|███▌      | 11/31 [05:59<10:12, 30.61s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 4426783 ] did map PSG to each session ---
  PSG map df:  (979, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


_____ Before Clean N/A columns _____
(979, 3148)
_____ After Clean N/A columns _____
(979, 868)


 39%|███▊      | 12/31 [06:35<10:14, 32.33s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 46343 ] did map PSG to each session ---
  PSG map df:  (553, 4)


Feature Extraction: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


_____ Before Clean N/A columns _____
(553, 3148)
_____ After Clean N/A columns _____
(553, 868)


 42%|████▏     | 13/31 [06:57<08:44, 29.12s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 5132496 ] did map PSG to each session ---
  PSG map df:  (464, 4)


Feature Extraction: 100%|██████████| 20/20 [00:16<00:00,  1.21it/s]


_____ Before Clean N/A columns _____
(464, 3148)
_____ After Clean N/A columns _____
(464, 868)


 45%|████▌     | 14/31 [07:15<07:21, 25.94s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 5383425 ] did map PSG to each session ---
  PSG map df:  (977, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


_____ Before Clean N/A columns _____
(977, 3148)
_____ After Clean N/A columns _____
(977, 868)


 48%|████▊     | 15/31 [07:52<07:45, 29.11s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 5498603 ] did map PSG to each session ---
  PSG map df:  (744, 4)


Feature Extraction: 100%|██████████| 20/20 [00:26<00:00,  1.30s/it]


_____ Before Clean N/A columns _____
(744, 3148)
_____ After Clean N/A columns _____
(744, 868)


 52%|█████▏    | 16/31 [08:21<07:16, 29.10s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 5797046 ] did map PSG to each session ---
  PSG map df:  (939, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


_____ Before Clean N/A columns _____
(939, 3148)
_____ After Clean N/A columns _____
(939, 868)


 55%|█████▍    | 17/31 [08:56<07:14, 31.05s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 6220552 ] did map PSG to each session ---
  PSG map df:  (954, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


_____ Before Clean N/A columns _____
(954, 3148)
_____ After Clean N/A columns _____
(954, 868)


 58%|█████▊    | 18/31 [09:33<07:04, 32.67s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 759667 ] did map PSG to each session ---
  PSG map df:  (474, 4)


Feature Extraction: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]


_____ Before Clean N/A columns _____
(474, 3148)
_____ After Clean N/A columns _____
(474, 868)


 61%|██████▏   | 19/31 [09:52<05:43, 28.65s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 7749105 ] did map PSG to each session ---
  PSG map df:  (933, 4)


Feature Extraction: 100%|██████████| 20/20 [00:25<00:00,  1.28s/it]


_____ Before Clean N/A columns _____
(933, 3148)
_____ After Clean N/A columns _____
(933, 1486)


 65%|██████▍   | 20/31 [10:22<05:18, 28.96s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 781756 ] did map PSG to each session ---
  PSG map df:  (980, 4)


Feature Extraction: 100%|██████████| 20/20 [00:33<00:00,  1.68s/it]


_____ Before Clean N/A columns _____
(980, 3148)
_____ After Clean N/A columns _____
(980, 868)


 68%|██████▊   | 21/31 [10:59<05:14, 31.44s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 8000685 ] did map PSG to each session ---
  PSG map df:  (958, 4)


Feature Extraction: 100%|██████████| 20/20 [00:33<00:00,  1.67s/it]


_____ Before Clean N/A columns _____
(958, 3148)
_____ After Clean N/A columns _____
(958, 868)


 71%|███████   | 22/31 [11:36<04:57, 33.10s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 8173033 ] did map PSG to each session ---
  PSG map df:  (955, 4)


Feature Extraction: 100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


_____ Before Clean N/A columns _____
(955, 3148)
_____ After Clean N/A columns _____
(955, 868)


 74%|███████▍  | 23/31 [12:13<04:34, 34.31s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 8258170 ] did map PSG to each session ---
  PSG map df:  (971, 4)


Feature Extraction: 100%|██████████| 20/20 [00:33<00:00,  1.65s/it]


_____ Before Clean N/A columns _____
(971, 3148)
_____ After Clean N/A columns _____
(971, 868)


 77%|███████▋  | 24/31 [12:50<04:05, 35.13s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 844359 ] did map PSG to each session ---
  PSG map df:  (897, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.60s/it]


_____ Before Clean N/A columns _____
(897, 3148)
_____ After Clean N/A columns _____
(897, 868)


 81%|████████  | 25/31 [13:26<03:31, 35.20s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 8530312 ] did map PSG to each session ---
  PSG map df:  (949, 4)


Feature Extraction: 100%|██████████| 20/20 [00:31<00:00,  1.60s/it]


_____ Before Clean N/A columns _____
(949, 3148)
_____ After Clean N/A columns _____
(949, 868)


 84%|████████▍ | 26/31 [14:01<02:56, 35.36s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 8686948 ] did map PSG to each session ---
  PSG map df:  (955, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.62s/it]


_____ Before Clean N/A columns _____
(955, 3148)
_____ After Clean N/A columns _____
(955, 868)


 87%|████████▋ | 27/31 [14:37<02:22, 35.58s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 8692923 ] did map PSG to each session ---
  PSG map df:  (935, 4)


Feature Extraction: 100%|██████████| 20/20 [00:31<00:00,  1.58s/it]


_____ Before Clean N/A columns _____
(935, 3148)
_____ After Clean N/A columns _____
(935, 868)


 90%|█████████ | 28/31 [15:13<01:46, 35.53s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 9106476 ] did map PSG to each session ---
  PSG map df:  (960, 4)


Feature Extraction: 100%|██████████| 20/20 [00:33<00:00,  1.65s/it]


_____ Before Clean N/A columns _____
(960, 3148)
_____ After Clean N/A columns _____
(960, 868)


 94%|█████████▎| 29/31 [15:50<01:11, 35.89s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 9618981 ] did map PSG to each session ---
  PSG map df:  (944, 4)


Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.64s/it]


_____ Before Clean N/A columns _____
(944, 3148)
_____ After Clean N/A columns _____
(944, 868)


 97%|█████████▋| 30/31 [16:26<00:36, 36.08s/it]

---------------------------------------------------
---------------------------------------------------
----- ID: [ 9961348 ] did map PSG to each session ---
  PSG map df:  (719, 4)


Feature Extraction: 100%|██████████| 20/20 [00:25<00:00,  1.26s/it]


_____ Before Clean N/A columns _____
(719, 3148)
_____ After Clean N/A columns _____
(719, 868)


100%|██████████| 31/31 [16:54<00:00, 32.73s/it]


In [159]:
# Show the combined extracted-features frame stored on Var (rendered as the
# cell's last expression; pandas truncates the view for large frames).
Var.all_extracted_features_df

Unnamed: 0,accel_x__variance_larger_than_standard_deviation,accel_x__has_duplicate_max,accel_x__has_duplicate_min,accel_x__has_duplicate,accel_x__sum_values,accel_x__abs_energy,accel_x__mean_abs_change,accel_x__mean_change,accel_x__mean_second_derivative_central,accel_x__median,...,"accel_x__fft_coefficient__attr_""abs""__coeff_11","accel_x__fft_coefficient__attr_""angle""__coeff_8","accel_x__fft_coefficient__attr_""angle""__coeff_9","accel_x__fft_coefficient__attr_""angle""__coeff_10","accel_x__fft_coefficient__attr_""angle""__coeff_11",accel_x__fourier_entropy__bins_2,accel_x__fourier_entropy__bins_3,accel_x__fourier_entropy__bins_5,accel_x__fourier_entropy__bins_10,accel_x__fourier_entropy__bins_100
0,0.0,0.0,0.0,0.0,12.142583,4.914785,0.000521,-0.000069,-0.000005,0.404833,...,,,,,,,,,,
1,0.0,0.0,0.0,0.0,12.119595,4.896154,0.000173,-0.000012,-0.000011,0.404009,...,,,,,,,,,,
2,0.0,0.0,0.0,0.0,12.094308,4.875753,0.000300,-0.000110,-0.000031,0.403221,...,,,,,,,,,,
3,0.0,0.0,0.0,0.0,12.017801,4.814292,0.000563,0.000105,-0.000039,0.401116,...,,,,,,,,,,
4,0.0,0.0,0.0,0.0,12.022661,4.818164,0.000292,0.000021,-0.000014,0.401101,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26745,0.0,0.0,0.0,0.0,-10.358294,4.048534,,,,-0.385863,...,,,,,,,,,,
26746,0.0,0.0,0.0,0.0,-9.582493,3.214328,,,,-0.284248,...,,,,,,,,,,
26747,0.0,0.0,0.0,0.0,-7.860814,2.060029,,,,-0.260443,...,,,,,,,,,,
26748,0.0,0.0,0.0,0.0,-7.863563,2.061455,,,,-0.260910,...,,,,,,,,,,


In [160]:
# Show the PSG frame stored on Var; the rendered output lists the columns
# subject_id, timestamp, psg_label and epoch_id.
Var.all_psg_df

Unnamed: 0,subject_id,timestamp,psg_label,epoch_id
0,1066528,1970-01-01 00:00:00,0.0,0
1,1066528,1970-01-01 00:00:30,0.0,1
2,1066528,1970-01-01 00:01:00,0.0,2
3,1066528,1970-01-01 00:01:30,0.0,3
4,1066528,1970-01-01 00:02:00,0.0,4
...,...,...,...,...
26745,9961348,1970-01-01 05:57:00,0.0,26745
26746,9961348,1970-01-01 05:57:30,0.0,26746
26747,9961348,1970-01-01 05:58:00,0.0,26747
26748,9961348,1970-01-01 05:58:30,0.0,26748


### Select features

In [161]:
# Report the shape of the combined extracted-features frame, drop every
# column that contains at least one N/A value, then report the shape again.
print('  Before clean N/A from Combined extracted features ')
print('  => ', Var.all_extracted_features_df.shape)

# axis=1 targets columns; how='any' (the pandas default) removes a column as
# soon as a single missing value is found in it.
Var.all_extracted_features_df = Var.all_extracted_features_df.dropna(
    axis=1,
    how='any',
)

print('  After clean N/A from Combined extracted features ')
print('  => ', Var.all_extracted_features_df.shape)


  Before clean N/A from Combined extracted features 
  =>  (26750, 1486)
  After clean N/A from Combined extracted features 
  =>  (26750, 868)


In [162]:
# Export the combined extracted features together with the PSG labels that
# belong to them. Both target paths are built first, then each frame is
# handed to Utils.dataframe_to_csv.
all_extracted_features_filename = (
    Constants.FEATURE_EXTRACTED_PATH + "all_extracted_features.csv"
)
all_psg_features_filename = (
    Constants.FEATURE_EXTRACTED_PATH + "all_psg_for_features.csv"
)

Utils.dataframe_to_csv(Var.all_extracted_features_df, all_extracted_features_filename)
Utils.dataframe_to_csv(Var.all_psg_df, all_psg_features_filename)


In [163]:
# Run feature selection on the cleaned, combined extracted features using the
# PSG frame, and keep the result on Var for the later display/export cells.
Var.all_selected_features_df = Selection.select_features_all_subjects(
    Var.all_extracted_features_df,
    Var.all_psg_df,
)


--------------------------------------------------
--------------------------------------------------
_____ Before Select Features _____
(26750, 868)
_____ After Select Features _____
(26750, 558)
  After combined psg - selected features:  (26750, 562)


In [164]:
# Show the selected-features frame stored on Var (rendered as the cell's last
# expression; pandas truncates the view for large frames).
Var.all_selected_features_df


Unnamed: 0,subject_id,timestamp,psg_label,epoch_id,"accel_y__change_quantiles__f_agg_""var""__isabs_False__qh_0.2__ql_0.0","accel_y__change_quantiles__f_agg_""var""__isabs_True__qh_0.2__ql_0.0",accel_x__cid_ce__normalize_True,accel_y__cid_ce__normalize_True,"accel_x__change_quantiles__f_agg_""var""__isabs_True__qh_0.2__ql_0.0","accel_y__change_quantiles__f_agg_""mean""__isabs_True__qh_0.2__ql_0.0",...,heart_rate__cid_ce__normalize_True,heart_rate__symmetry_looking__r_0.05,"heart_rate__fft_aggregated__aggtype_""centroid""",accel_x__range_count__max_1000000000000.0__min_0,accel_x__count_above__t_0,accel_x__count_below__t_0,"accel_x__agg_autocorrelation__f_agg_""var""__maxlag_40","heart_rate__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.6",heart_rate__index_mass_quantile__q_0.6,accel_y__range_count__max_1__min_-1
0,1066528,1970-01-01 00:00:00,0.0,0,1.604580e-09,1.310589e-09,3.626645,1.959643,2.522157e-07,0.000036,...,2.857143,1.0,0.140288,30.0,1.000000,0.000000,0.254807,0.000000,0.633333,30.0
1,1066528,1970-01-01 00:00:30,0.0,1,0.000000e+00,0.000000e+00,5.466670,5.044820,0.000000e+00,0.000294,...,1.242118,0.0,0.218842,30.0,1.000000,0.000000,0.731362,0.285714,0.600000,30.0
2,1066528,1970-01-01 00:01:00,0.0,2,0.000000e+00,0.000000e+00,4.104515,3.308974,4.866016e-07,0.000015,...,1.816348,1.0,0.170308,30.0,1.000000,0.000000,0.813601,0.000000,0.633333,30.0
3,1066528,1970-01-01 00:01:30,0.0,3,2.179462e-08,2.179462e-08,4.177129,4.581555,8.384268e-08,0.000178,...,3.594254,0.0,0.100301,30.0,1.000000,0.000000,0.146028,0.000000,0.600000,30.0
4,1066528,1970-01-01 00:02:00,0.0,4,6.336852e-08,9.366799e-09,2.685677,3.332420,0.000000e+00,0.000232,...,1.151518,0.0,0.480428,30.0,1.000000,0.000000,0.746472,0.571429,0.633333,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26745,9961348,1970-01-01 05:57:00,0.0,26745,2.650976e-02,1.659943e-02,4.419813,4.542659,3.053737e-06,0.162818,...,2.096143,1.0,0.172392,2.0,0.066667,0.933333,0.037064,0.000000,0.600000,30.0
26746,9961348,1970-01-01 05:57:30,0.0,26746,1.505345e-02,1.505345e-02,3.577219,5.875899,5.024317e-05,0.094461,...,1.379632,0.0,0.214307,0.0,0.000000,1.000000,0.336281,0.454545,0.633333,30.0
26747,9961348,1970-01-01 05:58:00,0.0,26747,5.750511e-09,5.750511e-09,5.116522,5.358162,5.215440e-06,0.000123,...,3.251226,1.0,0.215803,0.0,0.000000,1.000000,0.063019,0.000000,0.600000,30.0
26748,9961348,1970-01-01 05:58:30,0.0,26748,6.698735e-06,2.236967e-06,4.608577,5.864141,3.126384e-06,0.002135,...,1.743715,0.0,0.111722,0.0,0.000000,1.000000,0.123103,0.166667,0.633333,30.0


## Summary: Features Data

In [165]:
# Export the selected features. The path variable stays at notebook scope
# because the feature-count check cell reads it afterwards.
all_selected_features_filename = (
    Constants.FEATURE_SELECTED_PATH + "all_selected_features.csv"
)
Utils.dataframe_to_csv(Var.all_selected_features_df, all_selected_features_filename)


In [166]:
# Check the number of features in the exported selected-features CSV.
# `all_selected_features_filename` is defined by the export cell that precedes
# this one, so that cell must have been run first.
check_selected_features_filename = (
    Constants.FEATURE_SELECTED_PATH + "all_selected_features_type.csv"
)
Utils.check_number_of_features(
    all_selected_features_filename,
    check_selected_features_filename,
)


accel_x => 148 features
accel_y => 164 features
accel_z => 146 features
heart_rate => 100 features


In [167]:
# Final marker line so the end of a complete run is visible in the output.
print("-------- END of [Sleep-2-Learn] Part I - Before Learning (Feature Engineering) -------- ")


-------- END of [Sleep-2-Learn] Part I - Before Learning (Feature Engineering) -------- 


#### Finish