# CARWatch – Saliva Data Cleaning and Processing

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

import biopsykit as bp
from biopsykit.utils.time import time_to_datetime
from biopsykit.utils.dataframe_handling import multi_xs, int_from_str_idx, camel_to_snake

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [None]:
# Close all open matplotlib figures and apply seaborn's "ticks" style to subsequent plots.
plt.close('all')
sns.set(style='ticks')

## Load Saliva Data

In [None]:
# Relative locations of the raw data folder, the export folder, and the
# questionnaire data folder (a sub-folder of the raw data folder).
data_path = Path("../../../../Data")
export_path = Path("../..") / "exports"
quest_path = data_path / "Questionnaire_Data"

In [None]:
# Load the CARWatch questionnaire export via biopsykit, using the 'subject' column
# as subject identifier, and preview the first rows.
quest = bp.io.load_questionnaire_data(quest_path.joinpath("Questionnaire_Data_CARWatch.csv"), subject_col='subject')
quest.head()

In [None]:
# Reshape the wide questionnaire columns sharing the 'cort' stub into long format
# with the additional index levels 'night' and 'sample'.
cort_long = bp.utils.dataframe_handling.wide_to_long(quest, stubname='cort', levels=["night", "sample"])
cort_long = cort_long.rename(columns={'cort': 'cortisol', 'cortTime': 'time_abs'})
# Turn the night labels into integers, shifted by -1.
# The pattern is a raw string because "\w" is not a valid escape sequence in a
# plain string literal and triggers a warning there; the regex itself is unchanged.
cort_long = int_from_str_idx(cort_long, "night", r"N(\w)", lambda x: x-1)

### Load Merged Sleep & Questionnaire Data

In [None]:
# Read the previously exported merge of IMU and questionnaire data, indexed by (subject, night).
chrono_sleep = pd.read_csv(export_path.joinpath("imu_questionnaire_merged.csv"), index_col=['subject', 'night'])

### Convert Saliva Times into Minutes relative to S0

In [None]:
# Wake onset from the merged sleep data, parsed as timedelta.
# NOTE(review): chrono_sleep is indexed by (subject, night) while cort_long also has a
# 'sample' level; this relies on pandas aligning the two on assignment — confirm.
cort_long['wake_onset_time'] = pd.to_timedelta(chrono_sleep['wake_onset_time'])
# Derive the relative sample times from 'time_abs' BEFORE that column is overwritten below
# (presumably minutes, judging by the helper's name — verify against biopsykit).
cort_long['time'] = bp.saliva.utils.sample_times_datetime_to_minute(cort_long['time_abs'])
# Replace the raw 'time_abs' values with the output of biopsykit's time_to_datetime.
cort_long['time_abs'] = time_to_datetime(cort_long['time_abs'])

### Assign Recording Dates to Nights

In [None]:
# Parse each subject's study start date (day.month.two-digit-year) and name the series 'date'.
date_start = pd.to_datetime(quest['dateStart'], format="%d.%m.%y").rename('date')
# Join only once so that re-running the cell does not fail on an already existing column.
if 'date' not in cort_long.columns:
    cort_long = cort_long.join(date_start, on='subject')

In [None]:
# Offset every row's date by the value of its 'night' index level, interpreted as days.
# pd.to_timedelta is used instead of pd.TimedeltaIndex(..., unit=...) because the `unit`
# argument of the TimedeltaIndex constructor is deprecated in recent pandas versions.
# NOTE: this cell is not idempotent - running it twice adds the offset twice.
date_delta = pd.to_timedelta(cort_long.index.get_level_values('night'), unit='D')
date = cort_long['date'] + date_delta
cort_long['date'] = date

In [None]:
# Preview the long-format saliva data including the assigned dates.
cort_long.head()

## Descriptives

**Number of Subjects**

In [None]:
# Count the distinct values of the 'subject' index level of the questionnaire data.
subject_ids = quest.index.get_level_values('subject').unique()
num_subjects = len(subject_ids)
print("Number of Subjects: {}".format(num_subjects))

**Age**

In [None]:
# Mean and standard deviation of the 'age' column.
quest[["age"]].agg(['mean', 'std'])

**Gender**

In [None]:
# Share of each 'gender' value among all counted entries, in percent.
gender_counts = quest[["gender"]].value_counts()
gender = gender_counts / gender_counts.sum() * 100
gender

## Data Cleaning

### Remove SA and Night 2 (last night of Study without CAR)

In [None]:
# Drop sample 'SA' and night 2; errors='ignore' keeps the cell re-runnable once they are gone.
cort_long = cort_long.drop('SA', level='sample', errors='ignore')
cort_long = cort_long.drop(2, level='night', errors='ignore')
# Round trip through the wide format to discard rows that are empty in every column.
cort_long = cort_long.unstack().dropna(how='all').stack()
print("Data after remove SA and Night 2: {}".format(len(cort_long.unstack('sample'))))
cort_long.head()

In [None]:
# Keep an independent copy of the data as it is before the cleaning steps below.
cort_uncleaned = cort_long.copy()

In [None]:
# Number of rows of the uncleaned data in wide format (one row per index entry without 'sample').
print("Data before cleaning: {}".format(len(cort_uncleaned.unstack('sample'))))

### Remove CARs with any missing cortisol values (or any cortisol value < 0.1 nmol/l)

In [None]:
# One row per CAR, one column per sample.
cortisol_wide = cort_long['cortisol'].unstack('sample')
# Flag CARs that contain at least one missing value or at least one value below 0.1.
has_missing_sample = cortisol_wide.isna().any(axis=1)
has_too_low_sample = (cortisol_wide < 0.1).any(axis=1)
missing_mask = has_missing_sample | has_too_low_sample
cort_long = cort_long.loc[~missing_mask]
print("CARs removed because of missing cortisol values. Resulting: {}".format(len(cort_long.unstack('sample'))))

### Remove CARs with no valid recording date, wake onset or sample time information

In [None]:
# Rows lacking any of these pieces of information are discarded.
required_columns = ['date', 'time_abs', 'wake_onset_time']
cort_long = cort_long.dropna(subset=required_columns)
print("CARs removed because of no valid recording dates, wake onset or sample time information. Resulting: {}".format(len(cort_long.unstack('sample'))))

### Remove CARs with Differences >5 min between Wake Onset and S0

In [None]:
# Wake onset and absolute sampling time of the S0 sample of every CAR.
times = cort_long.xs('S0', level='sample')[['wake_onset_time', 'time_abs']]

# diff(axis=1) subtracts the left column from the right one, so the 'time_abs' entry holds
# time_abs - wake_onset_time; CARs where this exceeds 5 min in either direction are flagged.
# NOTE(review): assumes both columns hold comparable time values so that their
# difference is a timedelta — confirm against the output of time_to_datetime.
wo_mask = np.abs(times.diff(axis=1)['time_abs']) > pd.Timedelta("5min")

# Keep only the CARs that were not flagged.
cort_long = cort_long.loc[~wo_mask]
print("Data after remove Wake Onset Difference: {}".format(len(cort_long.unstack('sample'))))

### Remove CARs where the interval between two consecutive saliva samples deviates by >5 min from the intended 15 min

In [None]:
# Spacing between neighbouring samples (columns = samples) and its deviation from 15.
sample_spacing = cort_long['time'].unstack(level='sample').diff(axis=1)
spacing_error = (sample_spacing - 15).abs()
# Flag CARs in which any spacing deviates by more than 5.
time_mask = (spacing_error > 5).any(axis=1)
cort_long = cort_long.loc[~time_mask]
print("Data after remove Saliva Time Difference: {}".format(len(cort_long.unstack('sample'))))

### Remove Statistical Outliers ($> 3 \sigma$)

Remove CARs where any cortisol sample differs more than 3 standard deviations from the mean

In [None]:
# z-score each cortisol sample column across all CARs and flag every CAR in which any
# sample lies more than 3 standard deviations away from that column's mean.
# Only the cortisol values are screened, as stated in the section description; the
# remaining columns of cort_long (dates and times) no longer enter the outlier test.
cortisol_wide = cort_long['cortisol'].unstack('sample')
cortisol_z = (cortisol_wide - cortisol_wide.mean()) / cortisol_wide.std()
outlier_mask = (cortisol_z.abs() > 3.0).any(axis=1)
cort_long = cort_long.loc[~outlier_mask]
print("Data after remove Statistical Outlier: {}".format(len(cort_long.unstack('sample'))))

### Remove Physiological Outliers (Cortisol > 70 nmol/l)

In [None]:
# Flag CARs in which at least one cortisol sample exceeds 70.
cortisol_by_sample = cort_long.unstack('sample')['cortisol']
phys_mask = (cortisol_by_sample > 70).any(axis=1)
cort_long = cort_long.loc[~phys_mask]
print("Data after remove Physiological Outlier: {}".format(len(cort_long.unstack('sample'))))

In [None]:
# Display the data remaining after all cleaning steps.
cort_long

## Adding Categorical Variables

### Wakeup Sources

In [None]:
col = "wakeupSource"

# Select all questionnaire columns whose name contains the stub and reshape them
# into long format with 'night' as additional index level.
wakeup_source = quest.filter(like=col)
wakeup_source = bp.questionnaires.utils.wide_to_long(wakeup_source, col, levels='night')
# Raw string: "\w" is not a valid escape sequence in a plain string literal and
# triggers a warning there; the regex itself is unchanged.
wakeup_source = int_from_str_idx(wakeup_source, 'night', r"N(\w)", lambda x: x-1)
# Missing entries are replaced by 0.
wakeup_source = wakeup_source.fillna(0)
# Join only once so that the cell can be re-run.
if col not in cort_long.columns:
    cort_long = cort_long.join(wakeup_source)

### Condition

In [None]:
col = 'condition'

# Select all questionnaire columns whose name contains the stub and reshape them
# into long format with 'night' as additional index level.
condition = quest.filter(like=col)
condition = bp.questionnaires.utils.wide_to_long(condition, col, levels='night')
# Raw string: "\w" is not a valid escape sequence in a plain string literal and
# triggers a warning there; the regex itself is unchanged.
condition = int_from_str_idx(condition, 'night', r"N(\w)", lambda x: x-1)

# Join only once so that the cell can be re-run.
if col not in cort_long.columns:
    cort_long = cort_long.join(condition)

### Weekend

In [None]:
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
is_weekend = cort_long['date'].dt.weekday.isin([5, 6])
cort_long['weekend'] = is_weekend.astype(int)

### Wakeup Hour

In [None]:
# Add the wake onset offset to today's midnight and read off the hour of the result.
midnight_today = pd.Timestamp('today').normalize()
wake_onset_clock = midnight_today + cort_long['wake_onset_time']
cort_long['wakeup_hour'] = wake_onset_clock.dt.hour

### Chronotype

In [None]:
# Copy columns from the merged sleep/questionnaire data (target column -> source column).
chrono_columns = {
    'MEQ': 'MEQ',
    'chronotype': 'chronotype_coarse',
    'within_ideal_bed_time': 'within_ideal_bed_time',
}
for target_col, source_col in chrono_columns.items():
    cort_long[target_col] = chrono_sleep[source_col]

### Apply Codebook: Rename Index Codes, Set Index Levels, Reorder Columns

In [None]:
# Convert all column names to snake_case, leaving the acronym 'MEQ' untouched.
snake_case_columns = [
    name if name == "MEQ" else camel_to_snake(name)
    for name in cort_long.columns
]
cort_long.columns = snake_case_columns

Set desired Index Order

In [None]:
# Columns that become index levels, in the desired level order.
index_cols = ['condition', 'subject', 'chronotype', 'MEQ', 'night', 'within_ideal_bed_time', 'wakeup_source', 'date', 'weekend', 'wakeup_hour', 'sample']

In [None]:
# Move the current index into columns, rebuild the index in the desired order,
# and keep only the two data columns.
cort_long = cort_long.reset_index().set_index(index_cols)
cort_long = cort_long[['cortisol', 'time']]

In [None]:
# Read the codebook (indexed by its 'variable' column) and apply it to the data.
codebook_file = quest_path.joinpath("Codebook_CARWatch.xlsx")
codebook = pd.read_excel(codebook_file, index_col="variable")
cort_long = bp.utils.dataframe_handling.apply_codebook(codebook, cort_long)
cort_long.head()

## Feature Computation

In [None]:
# Compute biopsykit's AUC features for cortisol; remove_s0=False is passed explicitly.
cort_auc = bp.saliva.auc(cort_long, saliva_type='cortisol', remove_s0=False)
cort_auc.head()

In [None]:
# Compute biopsykit's maximum-increase feature for cortisol; remove_s0=False is passed explicitly.
cort_inc = bp.saliva.max_increase(cort_long, saliva_type='cortisol', remove_s0=False)
cort_inc.head()

In [None]:
# Slopes between sample indices 0 and 3 and between 0 and 4, joined into one frame.
slope_idx_0_3 = bp.saliva.slope(cort_long, sample_idx=[0, 3], saliva_type='cortisol')
slope_idx_0_4 = bp.saliva.slope(cort_long, sample_idx=[0, 4], saliva_type='cortisol')
cort_slope = slope_idx_0_3.join(slope_idx_0_4)
#cort_slope = cort_slope.join(bp.saliva.slope(cort_long, sample_idx=[2, 4], biomarker_type='cortisol'))
cort_slope.head()

In [None]:
# Highest cortisol value across the samples of each CAR, as a one-column frame.
max_per_car = cort_long['cortisol'].unstack('sample').max(axis=1)
cort_max = max_per_car.to_frame('cortisol_cmax')
cort_max.head()

In [None]:
# Cortisol value of the S0 sample of each CAR, stored under the name 'cortisol_cini'.
s0_samples = cort_long.xs('S0', level='sample')
cort_cini = s0_samples[['cortisol']].rename(columns={'cortisol': "cortisol_cini"})
cort_cini.head()

In [None]:
# Place all feature frames side by side, then convert them into long format.
feature_frames = [cort_auc, cort_inc, cort_slope, cort_cini, cort_max]
cort_feat = pd.concat(feature_frames, axis=1)
cort_feat = bp.saliva.utils.saliva_feature_wide_to_long(cort_feat, "cortisol")
cort_feat.head()

Drop CARs that don't fulfill the criterion by Weitzman et al. (1971): the maximum increase of the CAR should be at least 2.5 nmol/l (*currently not applied due to the recommendations of the CAR expert consensus guidelines*)

In [None]:
#car_inc_mask = cort_feat['cortisol'].xs('cortisol_max_inc', level='biomarker') >= 2.5
#cort_feat = cort_feat.loc[car_inc_mask]
#cort_long = cort_long.unstack(level='sample').loc[car_inc_mask].stack()

### Remove Statistical Outliers ($> 3 \sigma$)

(*currently not applied because the raw samples were already checked for statistical outliers*)

In [None]:
#outlier_mask = (cort_feat.unstack('biomarker').transform(lambda df: (df - df.mean()) / df.std()).abs() > 3.0)
#cort_feat = cort_feat.loc[~outlier_mask.stack()['cortisol']]
#cort_feat

In [None]:
#print("Number of nights before statistical outlier removal: {}".format(len(cort_long.unstack('sample'))))
#print("Number of nights after statistical outlier removal: {}".format(len(cort_long_out.unstack('sample'))))

## Export

In [None]:
# Write the cleaned samples and the derived features to the export folder.
cort_long.to_csv(export_path / 'cortisol_samples_cleaned.csv')
cort_feat.to_csv(export_path / 'cortisol_features_cleaned.csv')
#cort_long_out.to_csv(export_path.joinpath('cortisol_samples_outlier_removed.csv'))
#cort_feat_out.to_csv(export_path.joinpath('cortisol_features_outlier_removed.csv'))