# Saliva Data Cleaning

In [None]:
from pathlib import Path
import json

import pandas as pd
import numpy as np
import pingouin as pg

import biopsykit as bp
from biopsykit.utils.time import time_to_timedelta
from biopsykit.utils.dataframe_handling import multi_xs, int_from_str_idx, camel_to_snake

from fau_colors import cmaps

import matplotlib.pyplot as plt
import seaborn as sns

from carwatch_analysis.datasets import CarWatchDatasetProcessed
from carwatch_analysis.data_cleaning.saliva import (
    clean_missing_values,
    clean_missing_date_information,
    clean_s0_after_wake_onset,
    clean_sampling_time_difference,
    clean_statistical_outlier,
    clean_physiological_outlier
)

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [None]:
# Close any figures that are still open from a previous run of the notebook.
plt.close("all")

# Use the FAU faculties colormap as the default seaborn palette.
palette = sns.color_palette(cmaps.faculties)
sns.set_theme(context="notebook", style="ticks", palette=palette)

# Matplotlib overrides, applied after the seaborn theme (same order as before).
plt.rcParams.update(
    {
        "figure.figsize": (8, 4),
        "pdf.fonttype": 42,
        "mathtext.default": "regular",
    }
)

# Round pingouin result tables to 4 decimals.
pg.options["round"] = 4

# Trailing expression: render the palette as the cell output.
palette

## Setup Paths

In [None]:
# Key used to select the matching entry of config.json when the data path is built.
deploy_type = "local"

In [None]:
# Build the path to the data folder.
# The configuration is read via read_text() so that the file handle is closed
# right away (json.load on a bare .open() left it open until garbage collection).
config_dict = json.loads(Path("../../../config.json").read_text(encoding="utf-8"))
# "base_path" of the selected deployment is resolved relative to the parent folder.
base_path = Path("..").joinpath(config_dict[deploy_type]["base_path"])
base_path

In [None]:
# Create the processed CARWatch dataset rooted at base_path; the trailing
# expression renders its representation as the cell output.
dataset = CarWatchDatasetProcessed(base_path)
dataset

## Load Data

### Questionnaire

In [None]:
# Merged sleep information; the "MEQ", "chronotype_coarse" and
# "within_ideal_bed_time" columns are read from it further down.
sleep_info = dataset.sleep_information_merged

### Saliva

In [None]:
# Raw cortisol samples; later cells unstack them by the "sample" index level
# to get one row per CAR.
cort_samples = dataset.cortisol_samples
cort_samples.head()

In [None]:
# Cortisol features; later cells unstack them by the "saliva_feature" index level.
cort_features = dataset.cortisol_features
cort_features.head()

## Data Cleaning

In [None]:
# One row per CAR once the "sample" level is moved into the columns.
print(f"Number of CARs before cleaning: {len(cort_samples.unstack('sample'))}")

### Remove CARs with any missing cortisol values

In [None]:
# Cleaning step 1: per the section heading, drops CARs with any missing cortisol value.
# NOTE(review): exact criteria live in carwatch_analysis.data_cleaning.saliva — confirm there.
cort_samples = clean_missing_values(cort_samples)

### Remove CARs with missing date information
Missing date information is defined as no valid data for any of:
* recording date
* wake onset
* sample time information

In [None]:
# Cleaning step 2: per the section text, drops CARs lacking a valid recording date,
# wake onset, or sample time information.
cort_samples = clean_missing_date_information(cort_samples)

### Remove CARs with Differences >5 min between Wake Onset and S0

In [None]:
# Cleaning step 3: per the section heading, drops CARs where wake onset and S0
# are more than 5 min apart.
cort_samples = clean_s0_after_wake_onset(cort_samples)

### Remove CARs with absolute difference between two consecutive saliva samples of >5 min from the actual time

In [None]:
# Cleaning step 4: per the section heading, drops CARs whose spacing between
# consecutive samples is off by more than 5 min.
# NOTE(review): the reference timing is defined inside the helper — confirm there.
cort_samples = clean_sampling_time_difference(cort_samples)

### Remove Statistical Outlier ($> 3 \sigma$)

Remove CARs where any cortisol sample differs by more than 3 standard deviations from the mean.

In [None]:
# Cleaning step 5: per the section text, drops CARs containing a cortisol sample
# more than 3 standard deviations from the mean.
cort_samples = clean_statistical_outlier(cort_samples)

### Remove Physiological Outlier (Cortisol > 70 nmol/l)

In [None]:
# Cleaning step 6: per the section heading, drops CARs with cortisol above 70 nmol/l.
cort_samples = clean_physiological_outlier(cort_samples)

In [None]:
# Preview the samples that remain after all cleaning steps.
cort_samples.head()

## Apply Index to Cortisol Features

In [None]:
# Restrict the features to the CARs that survived cleaning: go to one row per
# CAR, align to the per-CAR index of the cleaned samples, then return to long format.
cort_features = (
    cort_features.unstack("saliva_feature")
    .reindex(cort_samples.unstack("sample").index)
    .stack()
)
cort_features.head()

## Adding Categorical Variables

### Wakeup Sources

In [None]:
# Wakeup source from the questionnaire; missing entries are filled with 0 so
# that the column can be cast to int.
# NOTE(review): the meaning of code 0 is not visible here — confirm against the codebook.
wakeup_source = dataset.questionnaire["wakeup_source"].fillna(0).astype(int)

### Weekend

In [None]:
# NOTE: weekday 4 = Friday, 5 = Saturday. "date" corresponds to the day when the night *started*,
# i.e., the nights from Friday to Saturday and from Saturday to Sunday are considered weekend.
weekend = dataset.date["date"].dt.weekday.isin([4, 5]).astype(int).rename("weekend")

### Wakeup Hour

In [None]:
# Dividing by a one-hour Timedelta gives wake onset in fractional hours;
# floor keeps only the full hour.
wakeup_hour = np.floor(
    dataset.sleep_information_merged["wake_onset_time"] / pd.Timedelta(hours=1)
).rename("wakeup_hour")

### Chronotype

In [None]:
# Per-subject variables taken from the merged sleep information.
meq = sleep_info["MEQ"]
# rename() returns a new Series labelled "chronotype"; assigning to `.name`
# would instead relabel the Series object handed out by sleep_info in place.
chronotype = sleep_info["chronotype_coarse"].rename("chronotype")
within_ideal_bed_time = sleep_info["within_ideal_bed_time"]

In [None]:
# Attach each categorical variable to both dataframes. Variables whose name is
# already an index level or column of cort_samples are skipped, so the cell
# can be re-run without joining a variable twice.
for series in (wakeup_source, wakeup_hour, weekend, meq, chronotype, within_ideal_bed_time):
    if series.name in cort_samples.reset_index().columns:
        continue
    cort_samples = cort_samples.join(series)
    cort_features = cort_features.join(series)

### Apply Codebook: Rename Index Codes, Set Index Levels, Reorder Columns

Set desired Index Order

In [None]:
# Index levels shared by the sample and feature dataframes, in the desired order.
index_cols = [
    "subject",
    "condition",
    "chronotype",
    "MEQ",
    "night",
    "within_ideal_bed_time",
    "wakeup_source",
    "weekend",
    "wakeup_hour",
]

In [None]:
# Rebuild the index in the desired level order; for the samples only the
# "cortisol" and "time" columns are kept.
cort_samples = cort_samples.reset_index().set_index([*index_cols, "sample"])[["cortisol", "time"]]

# The features get the same levels, with "saliva_feature" as the innermost one.
cort_features = cort_features.reset_index().set_index([*index_cols, "saliva_feature"])

display(cort_samples.head())
display(cort_features.head())

In [None]:
# Apply the dataset's codebook to both dataframes via biopsykit's apply_codebook.
codebook = dataset.codebook
cort_samples, cort_features = (
    bp.utils.dataframe_handling.apply_codebook(data, codebook) for data in (cort_samples, cort_features)
)

display(cort_samples.head())
display(cort_features.head())

## Export

In [None]:
# Folder that receives the cleaned CSV files; created if it does not exist yet.
export_path = Path("../..") / "exports"
export_path.mkdir(exist_ok=True)

In [None]:
# Write the cleaned samples and features as CSV files into the export folder.
cort_samples.to_csv(export_path / "cortisol_samples_cleaned.csv")
cort_features.to_csv(export_path / "cortisol_features_cleaned.csv")