# Data Preprocess

## Import Module

In [1]:
import os
import glob
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import math
import scipy.signal as signal
import seaborn as sns

In [2]:
# Protocol timing for one trial, in seconds. The original note reads
# "5s rest, 30s trail, 5s feedback, 20s recovery"; the values below equal the
# cumulative end time of each of those phases (5, 5+30, 35+5, 40+20).
# NOTE(review): "trail" is presumably a misspelling of "trial"; the names are
# kept as they are because other cells refer to them.
t_base = 5  # end of the initial 5 s rest
t_trail = 35  # end of the 30 s task phase
t_feedback = 40  # end of the 5 s feedback phase
t_rest = 60  # end of the 20 s recovery phase
trail_times = 23  # used by ave() as the divisor when averaging over trials
transfer_times = 8  # not referenced in the visible code; presumably the number of transfer trials -- confirm

## Calculate Oxy

In [3]:
def Oxy_Cal(df, age):
    '''
    return: dataframe with CH1..CH4 '_Oxy' / '_Deoxy' columns plus 'Trails',
            without the first ten rows (they are used as the baseline)

    inputs:
        df: df of LED information; the first eight columns are read by
            position as (730, 850) pairs for channels 1-4, and a 'Trails'
            column is required
        age: age of the patients

    NOTE(review): the constants look like extinction coefficients and an
    age / wavelength dependent differential pathlength factor for a modified
    Beer-Lambert conversion, with 0.03 as the optode distance -- confirm
    against the method reference.
    '''
    def pathlength_factor(wavelength):
        # Same polynomial for both wavelengths; only the wavelength differs.
        return 223.3+0.05624*(age**0.8493)-5.723*10**(-7)*(wavelength**3)+0.001245*(wavelength**2)+(-0.9025)*wavelength

    hhb_730, hhb_850 = 1.102, 0.691
    hbo_730, hbo_850 = 0.39, 1.058
    dpf_730 = pathlength_factor(730)
    dpf_850 = pathlength_factor(850)
    # Shared denominator of the Oxy and Deoxy expressions.
    denominator = (hhb_730*hbo_850 - hhb_850*hbo_730)*0.03

    result = pd.DataFrame()
    for channel in range(1, 5):
        raw_730 = df.iloc[:, 2*channel - 2]
        raw_850 = df.iloc[:, 2*channel - 1]

        # Baseline = mean of the first ten samples of each wavelength.
        od_730 = np.log10(raw_730 / raw_730.iloc[:10].mean())
        od_850 = np.log10(raw_850 / raw_850.iloc[:10].mean())
        scaled_730 = od_730/dpf_730
        scaled_850 = od_850/dpf_850

        result[f'CH{channel}_Oxy'] = 10*(hhb_730*scaled_850 - hhb_850*scaled_730) / denominator
        result[f'CH{channel}_Deoxy'] = 10*(hbo_850*scaled_730 - hbo_730*scaled_850) / denominator

    result['Trails'] = df.Trails

    # Drop the rows that were consumed as the baseline.
    return result.iloc[10:]

## Fill NaN

In [4]:
def fill_nan(df):
    '''
    return: new df in which +/-inf and NaN are replaced by the previous
            valid value of the same column (forward fill)

    inputs:
        df: uncleared dataframe (left unmodified)

    Values at the very start of a column have no previous sample to copy
    from, so a leading NaN / inf stays NaN.
    '''
    # replace() returns a new frame, so the caller's data is not altered.
    without_inf = df.replace([np.inf, -np.inf], np.nan)

    # time series -> forward fill (ffill() instead of the deprecated
    # fillna(method='ffill'))
    return without_inf.ffill()

## Filter

In [5]:
def filtering(df, *, order=4, cutoff=0.1, fs=17):
    '''
    return: filtered dataframe (same index and column order as df, with
            'Trails' copied through unfiltered)

    inputs:
        df: dataframe without filter; must contain a 'Trails' column
        order: order of the Butterworth filter (default 4)
        cutoff: low-pass cutoff, in the same unit as fs (default 0.1)
        fs: sampling rate used to normalise the cutoff (default 17)
    '''
    # Butterworth low-pass filter; the design does not depend on the column,
    # so it is computed once.
    nyq = 0.5 * fs
    b, a = signal.butter(order, cutoff/nyq, 'low')

    signals = df.drop(columns='Trails')
    # filtfilt runs the filter forwards and backwards (zero phase shift).
    filtered = pd.DataFrame(
        {col: signal.filtfilt(b, a, signals[col].to_numpy()) for col in signals.columns},
        index=df.index,
    )
    filtered['Trails'] = df['Trails']
    return filtered

## Average

In [6]:
def ave(df, n_trials=None, n_samples=1010, duration=60):
    '''
    return: mean of all trails, indexed by n_samples evenly spaced points
            from 0 to duration

    inputs:
        df: dataframe of all trails; must contain a 'Trails' column
            numbered 1..n_trials
        n_trials: number of trails to average; defaults to the notebook-level
            trail_times so the loop bound and the divisor always agree
        n_samples: number of trailing samples taken from every trail
            (default 1010)
        duration: last value of the output index (default 60)

    raises:
        ValueError: if n_trials or n_samples is smaller than 1, or a trail
            holds fewer than n_samples rows
    '''
    if n_trials is None:
        n_trials = trail_times
    if n_trials < 1 or n_samples < 1:
        raise ValueError('n_trials and n_samples must both be at least 1')

    signals = df.drop(columns = 'Trails')
    total = None
    for trial in range(1, n_trials + 1):
        # Only the last n_samples rows of each trail are used.
        block = signals[df['Trails'] == trial].iloc[-n_samples:].to_numpy()
        if len(block) != n_samples:
            # Fail loudly instead of letting numpy broadcast a short trail.
            raise ValueError(
                f'trail {trial} has {len(block)} samples, expected {n_samples}'
            )
        total = block if total is None else total + block

    time_axis = np.linspace(0, duration, n_samples)
    OwO_trail_sum = pd.DataFrame(total, index = time_axis, columns = signals.columns)
    return OwO_trail_sum/n_trials

## Normalize

In [7]:
def normalize(df):
    '''
    return: dataframe with every column rescaled by its own minimum and
            maximum, i.e. (value - min) / (max - min)

    inputs:
        df: dataframe
    '''
    col_min = df.min()
    col_span = df.max() - col_min
    return (df - col_min) / col_span

## Main
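Note: sub07 week8 needs special handling (sub07 week8 要特別處理): its recording is split across two CSV files, which are merged in the "修資料" section at the end of this notebook.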

In [None]:
# load patients information
patients = pd.read_csv(r'path\ts_patient_list.csv')
# Index by case once, instead of re-indexing for every file.
patients_by_case = patients.set_index('case')

# get all directory
sub_dir = []
for dirPath, dirNames, fileNames in os.walk(r'path\TS_original_data'):
    sub_dir.append(dirPath)

# get all csv
for sub in sub_dir:
    # The last path component is used as the case name in the patient list.
    split_list = sub.split('\\')
    data_link = os.path.join('.\\' , split_list[-2], split_list[-1], "*.csv")
    file_list = glob.glob(data_link)

    # load data
    for file in file_list:
        data = pd.read_csv(file)
        # get useful information: the LED columns, the trail number and a
        # time axis relative to sample 10. The slice is copied first so the
        # added columns land on an independent frame, and the time index is
        # set last so it is not overwritten.
        LED_Data = data.loc[:, 'CH1_PD730' : 'CH4_PD850'].copy()
        LED_Data['Trails'] = data['trail_times']
        LED_Data['Time'] = data['Time_Arduino']  - data.loc[10, 'Time_Arduino']
        LED_Data = LED_Data.set_index('Time')

        # get age and group
        age = patients_by_case.loc[split_list[-1], 'age']

        # preprocess
        try:
            OwO = Oxy_Cal(LED_Data, age)
            OwO_wona = fill_nan(OwO)
            OwO_filterd = filtering(OwO_wona)
            OwO_ave = ave(OwO_filterd)
            OwO_normalized = normalize(OwO_ave)
            # NOTE(review): 'file_name' looks like a placeholder; as written
            # every processed file overwrites the same output.
            OwO_normalized.to_csv('file_name')
        except Exception as err:
            # Best effort: report the failing file and its error, then carry
            # on. Exception (not a bare except) lets Ctrl-C stop the run.
            print(file)
            print(f'    {type(err).__name__}: {err}')

## Data repair (修資料): merge the two sub07 week8 recordings

In [None]:
# Work from the sub07 folder so the two file names below resolve against it.
# Raw string: '\s' is not a valid escape sequence and triggers a warning on
# recent Python versions; the resulting path is unchanged.
path = r'path\sub07'
os.chdir(path)
# The sub07 week8 session is stored as two separate recordings.
file_name_1 = 'sub07_week8_Trail1-26.csv'
data_1 = pd.read_csv(file_name_1)
file_name_2 = 'sub07_week8_Trail27-32.csv'
data_2 = pd.read_csv(file_name_2)

In [None]:
# First recording: LED columns + trail number, indexed by time relative to
# sample 10. The slice is copied so the column assignments below do not act
# on a view of data_1 (avoids SettingWithCopyWarning).
LED_Data = data_1.loc[:, 'CH1_PD730' : 'CH4_PD850'].copy()
LED_Data['Trails'] = data_1.trail_times
LED_Data['Time'] = data_1['Time_Arduino'] - data_1.loc[10, 'Time_Arduino']
LED_Data = LED_Data.set_index('Time')
# NOTE(review): `age` is not set in this cell; it is whatever an earlier cell
# left behind -- confirm it is sub07's age before running.
OwO = Oxy_Cal(LED_Data, age)
OwO_wona = fill_nan(OwO)
OwO_filterd_1 = filtering(OwO_wona)

In [None]:
# Second recording, processed like the first one. The slice is copied so the
# column assignments below do not act on a view of data_2.
LED_Data = data_2.loc[:, 'CH1_PD730' : 'CH4_PD850'].copy()
# Shift the trail numbers by 26 so they continue after the first recording.
# NOTE: this rewrites data_2 in place, so running the cell twice shifts the
# numbers twice -- reload data_2 before re-running.
data_2['trail_times'] = data_2['trail_times'] + 26
LED_Data['Trails'] = data_2.trail_times
LED_Data['Time'] = data_2['Time_Arduino'] - data_2.loc[10, 'Time_Arduino']
LED_Data = LED_Data.set_index('Time')
# NOTE(review): `age` is not set in this cell; it is whatever an earlier cell
# left behind -- confirm it is sub07's age before running.
OwO = Oxy_Cal(LED_Data, age)
OwO_wona = fill_nan(OwO)
OwO_filterd_2 = filtering(OwO_wona)

In [None]:
# Stitch the two filtered sub07 week8 recordings into one frame.
# Keep only the rows of the first recording whose trail number is below 26.
OwO_1 = OwO_filterd_1[OwO_filterd_1['Trails'] < 26]
# Label-based slice on the time index: keep the second recording from 5.61 on.
# NOTE(review): 5.61 and 1558.7 are hand-picked values; presumably the start of
# the usable data and the time offset that lines the second recording up with
# the first -- confirm against the raw files.
OwO_2 = OwO_filterd_2.loc[5.61:]
idd = OwO_2.index
# Shift every time stamp of the second recording by 1558.7.
timeOwO = [x + 1558.7 for x in idd]
OwO_2.index = timeOwO
# Append the shifted second part after the first part.
OwO_combine = pd.concat([OwO_1, OwO_2])

In [None]:
# Write the merged sub07 week8 frame to CSV.
# NOTE(review): 'file_name' looks like a placeholder output path -- confirm.
OwO_combine.to_csv('file_name')