# Data processing & feature extraction

This notebook does the following:

1. Opens file index to retrieve data file paths for all participants
2. Selects participants for processing (manual input of participant IDs for inclusion)
3. Generates a CSV file for each included participant that contains downsampled, synchronised, filtered and normalised data
4. Plots overview of all signals for both conditions for each participant
5. Extracts and saves whole-condition (air vs CO2) and time-windowed features for each participant
6. Merges extracted features from all participants into one file

Input: `PARTICIPANTS_TO_PROCESS` - list of participant IDs to include, entered manually in the participant selection cell below

Output: 

1. 'temp/synced_participant_data/' - synchronised, downsampled, normalised and filtered data for each participant
2. 'segment_features/' - extracted features for each participant for entire segment (co2 and air separately)
3. 'windowed_features/' - extracted features for each participant for each time window
4. 'segment_features.csv' - merged segment features from all participants
5. 'windowed_features.csv' - merged time windowed features from all participants

**Note:** Due to a bug in the latest versions of NeuroKit2, version 0.2.5 must be installed (https://github.com/neuropsychology/NeuroKit/issues/961)

In [None]:
# Imports
import os

# When the kernel starts inside a 'notebooks' folder, step up one level before
# anything else runs: SYNCED_DATA_DIRECTORY below is built from os.getcwd(), and
# the project imports (utils/, classes/) are presumably resolved relative to the
# working directory as well. Only the last path component is checked so the
# guard is a no-op once the move has happened (safe to re-run this cell).
if 'notebooks' in os.path.basename(os.getcwd()):
    os.chdir('..')

import pandas as pd

from utils.constants import AirFiles, CO2Files
from utils.timestamps import read_unix
from classes.Participant import Participant
from classes.DataHandler import DataHandler
from utils.plots import Plots

# Directory holding one synced CSV per participant, relative to the working directory
SYNCED_DATA_DIRECTORY = os.path.join(os.getcwd(), 'temp', 'synced_participant_data')

In [None]:
# Imports
import json
import math
import os

# Step up one level when the kernel starts inside a 'notebooks' folder, so the
# cwd-relative paths below (and presumably the utils/ and classes/ imports)
# resolve against the parent directory. Only the last path component is
# inspected: checking the whole path would also match an ancestor directory
# whose name contains 'notebooks' and would keep moving up on every re-run.
if 'notebooks' in os.path.basename(os.getcwd()):
    os.chdir('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from utils.constants import AirFiles, CO2Files, DATA_COLUMNS, FREQUENCIES
from utils.load_data import load_data_with_event_matching
from utils.timestamps import read_unix, read_j2000, j2000_to_unix, generate_biopac_unix_timestamps
from classes.Participant import Participant
from classes.DataHandler import DataHandler
from utils.plots import Plots
from utils.normalisation import eye_tracking as normalise_pupil_size

# Directory holding one synced CSV per participant, relative to the working directory
SYNCED_DATA_DIRECTORY = os.path.join(os.getcwd(), 'temp', 'synced_participant_data')

In [None]:
# Read the file index JSON from the working directory's temp folder and order
# its rows by index label
file_index = pd.read_json(os.path.join(os.getcwd(), 'temp/file_index.json')).sort_index()
file_index_ids = file_index.index
print(f'File index contains entries for {len(file_index_ids)} participants')

In [None]:
# Participants to process. The numbers are entered as written in the tuple and
# each is reduced by one, because the processing loop uses the resulting values
# as lookup keys into the file index.
PARTICIPANTS_TO_PROCESS = [
    participant_number - 1
    for participant_number in (
        2, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28,
        29, 32, 33, 34, 35, 36, 37, 38, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54,
        55, 57, 59, 60, 61, 62, 63,
    )
]

print(f'Selected {len(PARTICIPANTS_TO_PROCESS)} participants to be processed')

In [None]:
# For every selected participant: load the existing synced CSV or build it from
# the raw recordings, then filter, normalise, plot and extract features.
for participant_id in PARTICIPANTS_TO_PROCESS:
    # Entry for this participant in the first column of the file index; it is
    # indexed below by the AirFiles / CO2Files enum values and by 'id'.
    participant_file_index = file_index[0][participant_id]
    participant = Participant(participant_file_index['id'])

    synced_participant_file = os.path.join(SYNCED_DATA_DIRECTORY, str(participant.id)) + '.csv'
    if os.path.exists(synced_participant_file):
        # A synced file already exists on disk: load it instead of rebuilding
        print('Synced file for participant: ' + str(participant.id) + ' found. Loading existing file.')
        participant.set_synced_data(pd.read_csv(synced_participant_file))
    else:
        print('Generating new synced participant file for participant: ' + str(participant.id))

        # AIR
        if participant_file_index[AirFiles.MASK.value] is None:
            # NOTE(review): no air data is loaded on this path, yet the
            # downsampling step below still reads participant.air_synced_data.
            # Confirm that Participant / DataHandler tolerate the missing
            # condition, or skip the participant here.
            print('Air mask file missing')
        else:
            print('Loading Air condition Data')
            air_mask_file = participant_file_index[AirFiles.MASK.value]
            air_event_file = participant_file_index[AirFiles.EVENT.value]
            air_eyetracking_file = participant_file_index[AirFiles.EYE.value]
            air_biopac_file = participant_file_index[AirFiles.BIOPAC.value]
            air_biopac_start_unix = participant_file_index[AirFiles.BIOPAC_UNIX_START_TIME.value]
            print(read_unix(air_biopac_start_unix))

            # Load mask data
            participant.set_air_mask_data(
                DataHandler.load_mask_data(air_mask_file, air_event_file, participant.id)
            )
            # Load eye tracking data
            participant.set_air_eye_data(
                DataHandler.load_eyetracking_data(air_eyetracking_file, participant.id, 'air')
            )
            # Load biopac data
            participant.set_air_biopac_data(
                DataHandler.load_biopac_data(air_biopac_file, air_biopac_start_unix, participant.id)
            )
            # Sync mask, eye tracking and biopac data
            participant.set_air_synced_data(
                DataHandler.sync_signal_data(
                    participant.air_mask_data,
                    participant.air_eye_data,
                    participant.air_biopac_data,
                    air_biopac_start_unix,
                )
            )

        # CO2 (loaded unconditionally; unlike air there is no missing-file check)
        print('Loading CO2 condition Data')
        co2_mask_file = participant_file_index[CO2Files.MASK.value]
        co2_event_file = participant_file_index[CO2Files.EVENT.value]
        co2_eyetracking_file = participant_file_index[CO2Files.EYE.value]
        co2_biopac_file = participant_file_index[CO2Files.BIOPAC.value]
        co2_biopac_start_unix = participant_file_index[CO2Files.BIOPAC_UNIX_START_TIME.value]

        # Load mask data
        participant.set_co2_mask_data(
            DataHandler.load_mask_data(co2_mask_file, co2_event_file, participant.id)
        )
        # Load eye tracking data
        participant.set_co2_eye_data(
            DataHandler.load_eyetracking_data(co2_eyetracking_file, participant.id, 'co2')
        )
        # Load biopac data
        participant.set_co2_biopac_data(
            DataHandler.load_biopac_data(co2_biopac_file, co2_biopac_start_unix, participant.id)
        )
        # Sync mask, eye tracking and biopac data
        participant.set_co2_synced_data(
            DataHandler.sync_signal_data(
                participant.co2_mask_data,
                participant.co2_eye_data,
                participant.co2_biopac_data,
                co2_biopac_start_unix,
            )
        )

        print('Downsampling and combining data')
        # Label both conditions, then downsample and combine them into one
        # dataset (per the original note, this data is also saved by the call)
        participant.set_synced_data(
            DataHandler.downsample_participant_data(
                participant.id,
                DataHandler.label_data(participant.air_synced_data),
                DataHandler.label_data(participant.co2_synced_data),
            )
        )

    # Filter data
    filtered_data = DataHandler.filter_data(participant.synced_data)
    # Normalise data per participant
    normalised_data = DataHandler.normalise_data(filtered_data)

    Plots.participant_overview(normalised_data, True)

    # Feature extraction: time-windowed features and whole-condition features
    windowed_features = DataHandler.extract_features(normalised_data)
    segment_features = DataHandler.extract_features_entire_condition(normalised_data)

    # str() keeps this consistent with the other messages above and avoids a
    # TypeError when participant.id is not a string
    print('Finished data processing for participant: ' + str(participant.id))



In [None]:
# Merge the individual per-participant windowed feature files into one dataset.
# NOTE(review): per the notebook header this should combine the files under
# 'windowed_features/' into 'windowed_features.csv' — confirm in DataHandler.
combined_features_windowed = DataHandler.merge_participant_windowed_feature_files()

In [None]:
# Merge the individual per-participant segment (whole-condition) feature files
# into one dataset.
# NOTE(review): per the notebook header this should combine the files under
# 'segment_features/' into 'segment_features.csv' — confirm in DataHandler.
combined_features_segments = DataHandler.merge_participant_segment_feature_files()