# 1. Experiments towards reference speech characterization

The following cells import the necessary functions and define the code to run the experiments. Update the paths in the cell below.

In [None]:
# Root folder of the project. The configs in this notebook read features from
# ROOT + "/features/" and metadata from ROOT + "/data_info/".
ROOT = "/path/to/ReferenceSpeech/" # UPDATE THIS!

# output_dir: used to save results and configs.
# Every experiment below writes into a sub-folder of this directory.
MAIN_OUTPUT_DIR = ROOT + "/results/" # UPDATE THIS!

print ("Let's get started!")

In [None]:
import pandas as pd
import numpy as np
import random
import joblib
import json
import pickle
import os 

# Seed both NumPy's global random generator and Python's `random` module
# with the same fixed value.
SEED=741
np.random.seed(seed=SEED)
random.seed(SEED)

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [None]:
from utils.speech_model import SpeechModel             # speech model class
from utils.outlier_removal import OutlierRemoval       # outlier removal
from utils.reference_intervals import RefIntEstimator  # class for defining reference interval
from utils.mannwhitheyu import feats_failling_mwut     # functions for mann-whitney U test (partition reference population)
from utils.io import *                                 # Function for loading data
from utils.scale import *                              # scale/normalization functions
from utils.plots import *                              # function to creatre the radar plot, and other plots
from utils.utils import *

In [None]:
# Run one experiment:

def _print_out_of_ri_stats(per_subject_df, blank_line_before_averages=False):
    """
    Print, for one population, how many subjects have no feature outside the
    RI and the average number of features outside/above/below the RI.

    : per_subject_df (dataframe with one row per subject and the columns
        n_feats_out, n_feats_over_upperlimit and n_feats_below_lowerlimit)
    : blank_line_before_averages (if True, print an empty line before the averages)
    """
    n_subjects = len(per_subject_df)
    n_all_inside = len(per_subject_df[per_subject_df.n_feats_out < 1])
    # An empty population would otherwise raise ZeroDivisionError.
    proportion = n_all_inside / n_subjects if n_subjects else float("nan")

    print(n_all_inside,
          "indiv with 0 feats out ri, out of",
          n_subjects,
          "subjects. i.e, a proportion of",
          proportion,
         )
    if blank_line_before_averages:
        print ()
    print("Average number of features outside RI:",
          np.mean(per_subject_df.n_feats_out.values))
    print("Average number of features larger than RI:",
          np.mean(per_subject_df.n_feats_over_upperlimit.values))
    print("Average number of features smaller than RI:",
          np.mean(per_subject_df.n_feats_below_lowerlimit.values))


def _build_feature_summary(control_df, disease_names, disease_dfs):
    """
    Merge the per-feature proportion of values outside the RI of the controls
    and of every disease population into a single table, joined on "feature".

    : control_df (dataframe with the columns feature and prop_out_ri_over_subgroup)
    : disease_names (used as column suffix, i.e. prop_out_ri_over_subgroup#<name>)
    : disease_dfs (one dataframe per disease, same columns as control_df)
    """
    value_col = "prop_out_ri_over_subgroup"
    # Start from the controls so that a table is returned even without diseases.
    summary_df = control_df[["feature", value_col]].rename(
        columns={value_col: value_col + "#Cont"})
    for disease, d_df in zip(disease_names, disease_dfs):
        tmp_df = d_df[["feature", value_col]].rename(
            columns={value_col: value_col + "#" + disease})
        summary_df = pd.merge(summary_df, tmp_df, on="feature")
    return summary_df


def experiment(
    ref_speech_model, control_config, disease_configs,
    disease_names, features_by_task, features_to_drop=None,
    norm_strat="none", ref_scaler=None):
    """
    Compare a control population and one or more disease populations against
    the reference intervals (RI) of a reference speech model.

    : ref speech model (its ri_df is copied and used for controls and diseases)
    : control config
    : disease_configs (list of configs)
    : disease names (used in the printed summaries, plot legends and summary columns)
    : features_by_task (mapping of features to tasks)
    : features to drop (list of features to exclude, e.g. because they have invalid intervals).
        None (default) means that no feature is dropped.
    : norm_strat (string) It accepts {"none", "control_scaler", "ref_scaler"}.
        <control_scaler> means that controls will be used to train the scaler
        <ref_scaler> means that the scaler from the ref model is used.
    : ref_scaler (required when norm_strat is "ref_scaler")

    Returns (summary_df, control_model, disease_models), where summary_df has
    one row per feature and one prop_out_ri_over_subgroup#<group> column for
    the controls ("Cont") and for each disease.
    """
    # Validate the arguments before loading any data, so that a typo fails fast.
    assert norm_strat in ["none", "control_scaler", "ref_scaler"]
    if norm_strat == "ref_scaler":
        assert ref_scaler is not None, "if norm_strat is set to 'ref_scaler', ref_scaler cannot be None."

    # Avoid a mutable default argument shared between calls.
    if features_to_drop is None:
        features_to_drop = []

    color_lst = ["blue", "magenta", "orange", "red"]
    control_color, disease_colors = color_lst[0], color_lst[1:]

    # create control model
    print ("[INFO]: Getting model for controls ...")
    control_model = SpeechModel(control_config)
    _ = control_model.transform_feats_df_by_task(
        features_by_task, drop_features=features_to_drop)

    # normalize data:
    if norm_strat == "control_scaler":
        control_model.normalize(pretrained_scaler=None, train_scaler=True)
        control_scaler = control_model.scaler
    elif norm_strat == "ref_scaler":
        control_model.normalize(pretrained_scaler=ref_scaler, train_scaler=False)

    # get RI dataframe (a copy, so that the reference model is left untouched)
    ri_df = ref_speech_model.ri_df.copy()
    ri_df["keep_feature"] = True
    control_model.ri_df = ri_df

    # list features to be analysed
    dropped = set(features_to_drop)
    features_to_include = [
        f
        for f in ri_df[ri_df.keep_feature].feature.values
        if f not in dropped]

    # are control values inside RI?
    control_feats_vs_ri_df = control_model.are_individual_feats_in_ri(
        control_model.feats_df,
        feature_names=features_to_include,
        friendly_summary=False)
    control_feats_vs_ri_df = control_feats_vs_ri_df[
            control_feats_vs_ri_df.n_non_nan_samples > 0
        ][["feature", "prop_out_ri_over_subgroup"]]

    # count number of features outside RI per control subject
    feats_in_ri_per_control_subj = control_model.count_feats_out_ri(
        control_model.feats_df,
        feature_names=features_to_include)

    # Summary for control subjects
    print()
    print ("*** SUMMARY FOR CONTROLS ***")
    print ("number of features", len(control_feats_vs_ri_df))

    _print_out_of_ri_stats(
        feats_in_ri_per_control_subj, blank_line_before_averages=True)

    # Make radar plot:
    fig_f = clear_plot(ref_speech_model)
    _ = control_model.add_trace_to_radar_plot(to_plot="all", line_opacity=0.1,
            fig=fig_f,
            original_plot_feature_names=ref_speech_model.feature_names_lst,
            legend_prefix="control", save_fig=False,
            color=control_color)

    # EXPLORE DISEASE POPULATION:
    feats_vs_ri_lst = [control_feats_vs_ri_df]
    feats_in_ri_per_subj_lst = [feats_in_ri_per_control_subj]
    disease_models = []
    for i, c in enumerate(disease_configs):

        # create disease model
        print ("[INFO]: Getting model for disease ", disease_names[i])
        disease_model = SpeechModel(c)
        _ = disease_model.transform_feats_df_by_task(features_by_task, drop_features=features_to_drop)

        # normalize data with the same scaler that was applied to the controls
        if norm_strat == "control_scaler":
            disease_model.normalize(pretrained_scaler=control_scaler, train_scaler=False)
        elif norm_strat == "ref_scaler":
            disease_model.normalize(pretrained_scaler=ref_scaler, train_scaler=False)

        # are features inside RI? (control_model is used because it holds the RI table)
        # NOTE(review): unlike for the controls, mode="inside_ri" is passed here and
        # rows with n_non_nan_samples == 0 are not filtered out - confirm this is intended.
        disease_feats_vs_ri_df = control_model.are_individual_feats_in_ri(
            disease_model.feats_df,
            feature_names=features_to_include,
            friendly_summary=False,
            mode="inside_ri")

        feats_in_ri_per_d_subject = control_model.count_feats_out_ri(
            disease_model.feats_df, feature_names=features_to_include)

        # store relevant info:
        feats_vs_ri_lst.append(disease_feats_vs_ri_df)
        feats_in_ri_per_subj_lst.append(feats_in_ri_per_d_subject)
        disease_models.append(disease_model)

        # Summary for the subjects of this disease
        print()
        print ("*** SUMMARY FOR DISEASE ", disease_names[i] ," ***")
        _print_out_of_ri_stats(feats_in_ri_per_d_subject)

        # Make radar plot (a fresh reference plot per disease):
        fig_f = clear_plot(ref_speech_model)
        _ = disease_model.add_trace_to_radar_plot(to_plot="all", line_opacity=0.1,
                fig=fig_f,
                original_plot_feature_names=ref_speech_model.feature_names_lst,
                # label the traces with the disease name rather than "control"
                legend_prefix=disease_names[i], save_fig=False,
                # cycle through the colors so that more than 3 diseases do not fail
                color=disease_colors[i % len(disease_colors)])

    # build table with feature summary
    summary_df = _build_feature_summary(
        control_feats_vs_ri_df, disease_names, feats_vs_ri_lst[1:])

    print ("[INFO]: Done! :) ")
    return summary_df, control_model, disease_models


def clear_plot(ref_model):
    """
    Build a new radar plot from the reference model and return the figure.

    : ref_model (object exposing build_radar_plot)

    The plot is requested with the mean (in green) and the reference
    intervals, and without saving the figure.
    """
    radar_options = {
        "plot_mean": True,
        "save_fig": False,
        "plot_ri": True,
        "mean_color": "green",
    }
    return ref_model.build_radar_plot(**radar_options)

# 2. Configurations 

Here we organize the feature names, the features that will be excluded because they failed some of the tests, and the mappings between features and tasks. Notice that although we define *excluded_due_to_read_speech_test* and *invalid_intervals* here, they result from the design exploration experiments described below in sections 4 and 5.

In [None]:
# Feature names grouped by feature family, plus the lists of features that are
# excluded from the analysis. The strings are used as identifiers elsewhere in
# the notebook, so they must not be altered.
# (presumably they match the column names of the feature files - TODO confirm)
FEATURE_NAMES = {
    # Features computed from text (only added to the picture description task below).
    "linguistic": [
        "content_density", "idea_density", "honore_statistic", "brunet_index", "type_token_ratio",
        "discourse_marker_rate", 
        "polarity", 
        "repeat_ratio",
        "mean_coher_sentence","variability_coher_sentence","mean_coher_14tokens","variability_coher_14tokens",
        "1st_pronouns_ratio",
        "ambiguous_ref_ratio","ref_chain_ratio"
        ],
    # F0, HNR, jitter, shimmer and formant features, followed by the rate/pause features.
    # NOTE(review): "aqpq5Shimmer" looks like a typo of "apq5Shimmer", but it is
    # left as is because it has to match the name used by the feature data.
    "praat": [
        "meanF0","stdevF0","minF0","maxF0","hnr",
        "localJitter","localabsoluteJitter","rapJitter","ppq5Jitter","ddpJitter",
        "localShimmer","localdbShimmer","apq3Shimmer","aqpq5Shimmer","apq11Shimmer","ddaShimmer",
        "f1_mean","f2_mean","f3_mean","f4_mean","f1_median","f2_median","f3_median","f4_median",

        "speechrate(nsyll / dur)","articulation rate(nsyll / phonationtime)","ASD(speakingtime / nsyll)",
         "mean_pause_dur","mean_speech_dur","silence_rate (silencetime/dur)","silence_speech_ratio","mean_sil_count","lsil_rate","lsil_speech_ratio","mean_lsil_count"
        ],
    # F0, HNR, jitter/shimmer, formant, and loudness/voiced-segment features.
    "egemaps": [
        "F0semitoneFrom27.5Hz_sma3nz_amean", "F0semitoneFrom27.5Hz_sma3nz_stddevNorm", "F0semitoneFrom27.5Hz_sma3nz_percentile20.0", 
        "F0semitoneFrom27.5Hz_sma3nz_percentile50.0", 
        "F0semitoneFrom27.5Hz_sma3nz_percentile80.0", "F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2",
        "F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope", 
        "F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope", "F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope", 
        "F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope",

        "HNRdBACF_sma3nz_amean", "HNRdBACF_sma3nz_stddevNorm",
        "jitterLocal_sma3nz_amean", "jitterLocal_sma3nz_stddevNorm", "shimmerLocaldB_sma3nz_amean", "shimmerLocaldB_sma3nz_stddevNorm", 

        "F1frequency_sma3nz_amean", "F1frequency_sma3nz_stddevNorm", "F1bandwidth_sma3nz_amean", "F1bandwidth_sma3nz_stddevNorm",
        "F2frequency_sma3nz_amean", "F2frequency_sma3nz_stddevNorm", "F2bandwidth_sma3nz_amean","F2bandwidth_sma3nz_stddevNorm",
        "F3frequency_sma3nz_amean", "F3frequency_sma3nz_stddevNorm", "F3bandwidth_sma3nz_amean","F3bandwidth_sma3nz_stddevNorm",

        "loudnessPeaksPerSec", "VoicedSegmentsPerSec", "MeanVoicedSegmentLengthSec", "StddevVoicedSegmentLengthSec", "MeanUnvoicedSegmentLength", "StddevUnvoicedSegmentLength"
        ],
    # Rate/pause names that also appear in "praat" and loudness/segment names that
    # also appear in "egemaps". This list is subtracted for the vowel task below.
    "rythm_features": [
        "speechrate(nsyll / dur)","articulation rate(nsyll / phonationtime)",
        "ASD(speakingtime / nsyll)",
        "mean_pause_dur","mean_speech_dur","silence_rate (silencetime/dur)",
        "silence_speech_ratio","mean_sil_count","lsil_rate",
        "lsil_speech_ratio","mean_lsil_count",
        "loudnessPeaksPerSec", "VoicedSegmentsPerSec", "MeanVoicedSegmentLengthSec", "StddevVoicedSegmentLengthSec", 
        "MeanUnvoicedSegmentLength", "StddevUnvoicedSegmentLength"
        ],
    # The "egemaps" names with the suffix "_foroutlierdetect".
    # (presumably a separate copy of the features used only for outlier detection - TODO confirm)
    "for_outlier_detect": ['F0semitoneFrom27.5Hz_sma3nz_amean_foroutlierdetect', 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm_foroutlierdetect',
         'F0semitoneFrom27.5Hz_sma3nz_percentile20.0_foroutlierdetect', 'F0semitoneFrom27.5Hz_sma3nz_percentile50.0_foroutlierdetect',
         'F0semitoneFrom27.5Hz_sma3nz_percentile80.0_foroutlierdetect', 'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2_foroutlierdetect',
         'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope_foroutlierdetect', 'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope_foroutlierdetect',
         'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope_foroutlierdetect', 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope_foroutlierdetect',
         'HNRdBACF_sma3nz_amean_foroutlierdetect', 'HNRdBACF_sma3nz_stddevNorm_foroutlierdetect',  'jitterLocal_sma3nz_amean_foroutlierdetect',
         'jitterLocal_sma3nz_stddevNorm_foroutlierdetect', 'shimmerLocaldB_sma3nz_amean_foroutlierdetect', 'shimmerLocaldB_sma3nz_stddevNorm_foroutlierdetect',
         'F1frequency_sma3nz_amean_foroutlierdetect', 'F1frequency_sma3nz_stddevNorm_foroutlierdetect', 'F1bandwidth_sma3nz_amean_foroutlierdetect',
         'F1bandwidth_sma3nz_stddevNorm_foroutlierdetect', 'F2frequency_sma3nz_amean_foroutlierdetect', 'F2frequency_sma3nz_stddevNorm_foroutlierdetect',
         'F2bandwidth_sma3nz_amean_foroutlierdetect', 'F2bandwidth_sma3nz_stddevNorm_foroutlierdetect', 'F3frequency_sma3nz_amean_foroutlierdetect', 
         'F3frequency_sma3nz_stddevNorm_foroutlierdetect', 'F3bandwidth_sma3nz_amean_foroutlierdetect', 'F3bandwidth_sma3nz_stddevNorm_foroutlierdetect',
         'loudnessPeaksPerSec_foroutlierdetect', 'VoicedSegmentsPerSec_foroutlierdetect', 'MeanVoicedSegmentLengthSec_foroutlierdetect', 'StddevVoicedSegmentLengthSec_foroutlierdetect',
         'MeanUnvoicedSegmentLength_foroutlierdetect', 'StddevUnvoicedSegmentLength_foroutlierdetect'
        ],
    # Features subtracted from the read speech and spontaneous speech tasks below.
    "excluded_due_to_read_speech_test": ['lsil_rate', 'lsil_speech_ratio', 'maxF0', 'mean_lsil_count', 'minF0'],
                                         
    # Features with invalid reference intervals; names carry an "A#" or "Pic#" prefix.
    # (presumably the prefix identifies the task: vowel /a/ or picture description - TODO confirm)
    "invalid_intervals": ['A#F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
       'A#F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
       'A#F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
       'A#F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
       'A#F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
       'A#HNRdBACF_sma3nz_amean', 'A#HNRdBACF_sma3nz_stddevNorm',
       'A#localabsoluteJitter', 'A#ppq5Jitter',
       'Pic#F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
       'Pic#ambiguous_ref_ratio', 'Pic#discourse_marker_rate'
       ],
}         

In [None]:
# Let's organize the features by task:

# Mapping from speech task to the features analysed for that task.
# Each entry is built with add_del_lst from the feature groups in FEATURE_NAMES.
FEATURES_BY_TASK = {}

# Sustained vowel: egemaps + praat features, minus the rhythm features.
FEATURES_BY_TASK["vowel_a"] = add_del_lst(
    add=[FEATURE_NAMES["egemaps"], FEATURE_NAMES["praat"]],
    subtract=[FEATURE_NAMES["rythm_features"]],
)

# Read speech: egemaps + praat features, minus the ones that failed the read speech test.
FEATURES_BY_TASK["read_speech"] = add_del_lst(
    add=[FEATURE_NAMES["egemaps"], FEATURE_NAMES["praat"]],
    subtract=[FEATURE_NAMES["excluded_due_to_read_speech_test"]],
)

# Spontaneous speech (interview): same selection as read speech.
FEATURES_BY_TASK["spont_speech_interview"] = add_del_lst(
    add=[FEATURE_NAMES["egemaps"], FEATURE_NAMES["praat"]],
    subtract=[FEATURE_NAMES["excluded_due_to_read_speech_test"]],
)

# Spontaneous speech (picture description): additionally includes the linguistic features.
FEATURES_BY_TASK["spont_speech_picture"] = add_del_lst(
    add=[FEATURE_NAMES["egemaps"], FEATURE_NAMES["praat"], FEATURE_NAMES["linguistic"]],
    subtract=[FEATURE_NAMES["excluded_due_to_read_speech_test"]],
)

# 3. Outlier removal

Outlier removal is a very important step prior to the definition of reference intervals. The outlier estimation strategy chosen was the Mahalanobis distance with the MCD (Minimum Covariance Determinant) covariance estimator, and the IQR for threshold definition. The Mahalanobis distance works well for multivariate data.

We opted to use only egemaps-based features for the outlier removal. We excluded praat features because there was too much dependency with egemaps features, and the matrix was not full rank. We excluded linguistic features because they were not meaningful for read speech (no variance for the same texts).

Normalization of the datasets is expected to impact the results of outlier estimation, but it is rarely discussed in the literature, and there are no clear guidelines of which normalization method works best for each outlier removal strategy.
For this reason, and the fact that the mahalanobis distance should be robust to different scales, we decided not to perform normalization prior to the outlier removal. We only normalize it to fit the pca, for better data visualization.

In [None]:
# Determine outliers and FOR SENTENCES

# Outputs of this cell go to a dedicated sub-folder of MAIN_OUTPUT_DIR.
main_output_dir = MAIN_OUTPUT_DIR + "/outlier_removal_sentence/"

# Define config for reference speech model
# Sentence-level tasks of the three reference corpora, with outlier detection
# enabled and normalization disabled.
reference_config = {
    "feature_dir" : ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["clac_healthy_w_vowel", "voxceleb_annotated_usa_concatenated", "timit_concatenated"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir +"/ref_model/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,
    
    # subselect data
    "sex": "both",  # other values: "male", "female"
    "age": {  # allows specifying an age range
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    # sentence-level tasks only (the vowel task is handled in its own cell)
    "subselect_data": {"task_type": ["pic_description", "read_speech", "concat_interview_segm"]},
    
    # outlier removal
    "outlier_removal": True,
    "outlier_removal_conf":{
        # outliers are estimated here, not read from a list of ids
        "from_id_list": False,
        "outlier_id_list": None,
        "method": "mcd_mahalanobis_dist",
        # NOTE(review): the key is spelled "oultier"; it is kept as is because it
        # presumably has to match the key read by SpeechModel - confirm.
        "pca_oultier_limits": None,
        "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
        "save_data_after": False,
    },
    # presumably selects the FEATURE_NAMES["for_outlier_detect"] columns - TODO confirm
    "distinct_feats_for_outlier_detect": True,
            
    # normalization -> we are not yet normalizing.
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": None,  # other values: "min_max", "mean_0_std_1", "mean_1_std_1"
    "train_scaler": False,
    "save_data_after_normalize": False,
}

# Initialize reference model
reference_speech_model = SpeechModel(reference_config)
# Ids (wav_file_id) of the rows whose "outlier" flag is True in the model metadata.
sentence_outliers = reference_speech_model.meta_df[reference_speech_model.meta_df.outlier == True].wav_file_id.values

In [None]:
# Determine outliers and FOR VOWELS

# Outputs of this cell go to a dedicated sub-folder of MAIN_OUTPUT_DIR.
main_output_dir = MAIN_OUTPUT_DIR + "/outlier_removal_vowel/"

# Define config for reference speech model
# Same settings as the sentence-level config, restricted to the "vowel_a" task.
reference_vowels_config = {
    "feature_dir" : ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["clac_healthy_w_vowel", "voxceleb_annotated_usa_concatenated", "timit_concatenated"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir +"/ref_model/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,
    
    # subselect data
    "sex": "both",  # other values: "male", "female"
    "age": {  # allows specifying an age range
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    # vowel task only
    "subselect_data": {"task_type": ["vowel_a"]},
    
    # outlier removal
    "outlier_removal": True,
    "outlier_removal_conf":{
        # outliers are estimated here, not read from a list of ids
        "from_id_list": False,
        "outlier_id_list": None,
        "method": "mcd_mahalanobis_dist",
        # NOTE(review): the key is spelled "oultier"; it is kept as is because it
        # presumably has to match the key read by SpeechModel - confirm.
        "pca_oultier_limits": None,
        "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
        "save_data_after": False,
    },
    # presumably selects the FEATURE_NAMES["for_outlier_detect"] columns - TODO confirm
    "distinct_feats_for_outlier_detect": True,
            
    # normalization -> we are not yet normalizing.
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": None,  # other values: "min_max", "mean_0_std_1", "mean_1_std_1"
    "train_scaler": False,
    "save_data_after_normalize": False,
}

# Initialize reference model
# NOTE: this rebinds reference_speech_model, replacing the sentence-level model
# created in the previous cell.
reference_speech_model = SpeechModel(reference_vowels_config)
# Ids (wav_file_id) of the rows whose "outlier" flag is True in the model metadata.
vowel_outliers = reference_speech_model.meta_df[reference_speech_model.meta_df.outlier == True].wav_file_id.values

In [None]:
# Single list with the ids of all outliers: vowel recordings first, then sentences.
outlier_ids = [*vowel_outliers.tolist(), *sentence_outliers.tolist()]
print(
    "In total, ",
    len(outlier_ids),
    "outliers were detected, and will be excluded from further analysis.",
)

# 4. When  should we partition the reference population, and derive distinct reference intervals?

Because the features used are affected not only by speech affecting diseases but also other factors, such as demographics and speech tasks, here we explore whether we should partition the reference population and derive distinct Reference Intervals for several factors: gender, age range, speech task, corpus, and normalized corpus.

If you are not interested in this section, you may skip directly to section 6.

### Explore gender
- Using Mann-Whitney U test
- Using the corpus CLAC and the speech task picture_description.

In [None]:
# Initialize read speech model:
# Output folder for the gender partitioning experiment.
# The leading "/" (same convention as the outlier removal cells) keeps the path
# valid even when MAIN_OUTPUT_DIR is set without a trailing slash.
main_output_dir = MAIN_OUTPUT_DIR + "/partitioning/gender/"

# Define config for the speech model:
# CLAC corpus, picture description task, both genders.
config = {
    "feature_dir" : ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["clac_healthy_w_vowel"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir +"/ref_model/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,
    
    # subselect data
    "sex": "both",  # other values: "male", "female"
    "age": {  # allows specifying an age range
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    "subselect_data": {"task_type": ["pic_description"]},
    
    # outlier removal: reuse the ids found in section 3 instead of re-estimating them
    "outlier_removal": True,
    "outlier_removal_conf":{
        "from_id_list": True,
        "outlier_id_list": outlier_ids,
        "method": None,  # not needed when reading from an id list (e.g. "mcd_mahalanobis_dist")
        "pca_oultier_limits": None,
        "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
        "save_data_after": False,
    },
    "distinct_feats_for_outlier_detect": True,
            
    # normalization: a single scaler trained on all the selected data
    "scale": True,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": "mean_0_std_1",  # other values: None, "min_max", "mean_1_std_1"
    "train_scaler": True,
    "save_data_after_normalize": False,
}

# Initialize the speech model
speech_model = SpeechModel(config)


# Get features and metadata:
feats_names = speech_model.feature_names_lst
meta = speech_model.meta_df.copy()
feats = speech_model.feats_df.copy()


# Run Mann-Whitney U test (female vs male)
print (" ------------ ANALYSIS USING ALL DATA  ------------")
features_that_should_have_distinct_ris = feats_failling_mwut(
    meta, feats, feats_names, {"gender": ["female"]}, {"gender": ["male"]})

print (
    len(features_that_should_have_distinct_ris), 
    " features should have distinct RIs for each group, out of ",
    len(feats_names) )
print (
    "On the other hand, we may derive the same RI for both groups for",
    len(feats_names) - len(features_that_should_have_distinct_ris),
    "features."
)

### Explore age ranges (<50 vs >=50)
- Using Mann-Whitney U test
- Using the corpus CLAC and the speech task picture_description.
- Two separate analyses for male and female, given that previously the decision was to derive different models for male and female subjects.

In [None]:
# Initialize read speech model:
# Output folder for the age partitioning experiment.
# The leading "/" (same convention as the outlier removal cells) keeps the path
# valid even when MAIN_OUTPUT_DIR is set without a trailing slash.
main_output_dir = MAIN_OUTPUT_DIR + "/partitioning/age/"

# Age ranges that are compared (<50 vs >=50). The same two ranges are used for
# the male and the female analysis; the large upper bound leaves the older
# group open-ended.
YOUNGER_AGE_RANGE = (0, 49)
OLDER_AGE_RANGE = (50, 10000)

# Define config for the speech model:
# CLAC corpus, picture description task, normalized separately per gender.
config = {
    "feature_dir" : ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["clac_healthy_w_vowel"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir +"/ref_model/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,
    
    # subselect data
    "sex": "both",  # other values: "male", "female"
    "age": {  # allows specifying an age range
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    "subselect_data": {"task_type": ["pic_description"]},
    
    # outlier removal: reuse the ids found in section 3 instead of re-estimating them
    "outlier_removal": True,
    "outlier_removal_conf":{
        "from_id_list": True,
        "outlier_id_list": outlier_ids,
        "method": None,  # not needed when reading from an id list (e.g. "mcd_mahalanobis_dist")
        "pca_oultier_limits": None,
        "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
        "save_data_after": False,
    },
    "distinct_feats_for_outlier_detect": True,
            
    # normalization: one scaler trained per gender
    "scale": True,
    "multi_normalize": True,
    "multi_normalize_conditions": [
        {
        "train_scaler":True, 
        "pretrained_scaler":None, 
        "cols_for_data_selection":{
            "gender": ["female"],
         }},
        {
        "train_scaler":True, 
        "pretrained_scaler":None, 
        "cols_for_data_selection":{
            "gender": ["male"]
         }}
    ],
    "scaling_mode": "mean_0_std_1",  # other values: None, "min_max", "mean_1_std_1"
    "train_scaler": True,
    "save_data_after_normalize": False,
}

# Initialize the speech model
speech_model = SpeechModel(config)


# Get features and metadata:
feats_names = speech_model.feature_names_lst
meta = speech_model.meta_df.copy()
feats = speech_model.feats_df.copy()


# Run Mann-Whitney U test (younger vs older), separately for each gender
# male
print (" ------------ ANALYSIS FOR MALE  ------------")
features_that_should_have_distinct_ris_male = feats_failling_mwut(
    meta, feats, feats_names, {"age": YOUNGER_AGE_RANGE}, {"age": OLDER_AGE_RANGE},
    cols_for_data_selection={"gender": ["male"]})

print (
    len(features_that_should_have_distinct_ris_male), 
    " features should have distinct RIs for each group, out of ",
    len(feats_names))
print (
    "On the other hand, we may derive the same RI for both groups for",
    len(feats_names) - len(features_that_should_have_distinct_ris_male),
    "features.")

# female
print (" ------------ ANALYSIS FOR FEMALE  ------------")
features_that_should_have_distinct_ris_female = feats_failling_mwut(
    meta, feats, feats_names, {"age": YOUNGER_AGE_RANGE}, {"age": OLDER_AGE_RANGE},
    cols_for_data_selection={"gender": ["female"]})

print (
    len(features_that_should_have_distinct_ris_female), 
    " features should have distinct RIs for each group, out of ",
    len(feats_names))
print (
    "On the other hand, we may derive the same RI for both groups for",
    len(feats_names) - len(features_that_should_have_distinct_ris_female),
    "features.")

# Union of the features flagged for either gender
print (" ------------ COMBINING BOTH GENDERS ------------ ")
feats_diff_ris_both_gender = np.unique(
    np.concatenate((
        features_that_should_have_distinct_ris_male, 
        features_that_should_have_distinct_ris_female)))
print ("Combining both genders:", len(feats_diff_ris_both_gender))
print ("Number of features with p>= 0.001 (Combining both genders):", len(feats_names) - len(feats_diff_ris_both_gender))

### Explore speech tasks 
- Using Mann-Whitney U test
- Using the corpus CLAC, and comparing the read speech task and the picture_description task.
- Two separate analyses for male and female, given that previously the decision was to derive different models for male and female subjects.

In [None]:
# Initialize read speech model:
# Output folder for the speech task partitioning experiment.
# The leading "/" (same convention as the outlier removal cells) keeps the path
# valid even when MAIN_OUTPUT_DIR is set without a trailing slash.
main_output_dir = MAIN_OUTPUT_DIR + "/partitioning/task/"

# Define config for the speech model:
# CLAC corpus, picture description and read speech tasks, normalized separately per gender.
config = {
    "feature_dir" : ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["clac_healthy_w_vowel"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir +"/ref_model/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,
    
    # subselect data
    "sex": "both",  # other values: "male", "female"
    "age": {  # allows specifying an age range
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    "subselect_data": {"task_type": ["pic_description", "read_speech"]},
    
    # outlier removal: reuse the ids found in section 3 instead of re-estimating them
    "outlier_removal": True,
    "outlier_removal_conf":{
        "from_id_list": True,
        "outlier_id_list": outlier_ids,
        "method": None,  # not needed when reading from an id list (e.g. "mcd_mahalanobis_dist")
        "pca_oultier_limits": None,
        "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
        "save_data_after": False,
    },
    "distinct_feats_for_outlier_detect": True,
            
    # normalization: one scaler trained per gender
    "scale": True,
    "multi_normalize": True,
    "multi_normalize_conditions": [
        {
        "train_scaler":True, 
        "pretrained_scaler":None, 
        "cols_for_data_selection":{
            "gender": ["female"],
         }},
        {
        "train_scaler":True, 
        "pretrained_scaler":None, 
        "cols_for_data_selection":{
            "gender": ["male"]
         }}
    ],
    "scaling_mode": "mean_0_std_1",  # other values: None, "min_max", "mean_1_std_1"
    "train_scaler": True,
    "save_data_after_normalize": False,
}

# Initialize the speech model
speech_model = SpeechModel(config)


# Get features and metadata:
# only egemaps and praat features are compared (linguistic features are left out)
feats_names = FEATURE_NAMES["egemaps"] + FEATURE_NAMES["praat"]
meta = speech_model.meta_df.copy()
feats = speech_model.feats_df.copy()


# Run Mann-Whitney U test (picture description vs read speech), separately for each gender
# male
print (" ------------ ANALYSIS FOR MALE  ------------")
features_that_should_have_distinct_ris_male = feats_failling_mwut(
    meta, feats, feats_names, {"task_type": ["pic_description"]}, {"task_type": ["read_speech"]},
    cols_for_data_selection={"gender": ["male"]})

print (
    len(features_that_should_have_distinct_ris_male), 
    " features should have distinct RIs for each group, out of ",
    len(feats_names))
print (
    "On the other hand, we may derive the same RI for both groups for",
    len(feats_names) - len(features_that_should_have_distinct_ris_male),
    "features.")

# female
print (" ------------ ANALYSIS FOR FEMALE  ------------")
features_that_should_have_distinct_ris_female = feats_failling_mwut(
    meta, feats, feats_names, {"task_type": ["pic_description"]}, {"task_type": ["read_speech"]},
    cols_for_data_selection={"gender": ["female"]})

print (
    len(features_that_should_have_distinct_ris_female), 
    " features should have distinct RIs for each group, out of ",
    len(feats_names))
print (
    "On the other hand, we may derive the same RI for both groups for",
    len(feats_names) - len(features_that_should_have_distinct_ris_female),
    "features.")

# Union of the features flagged for either gender
print (" ------------ COMBINING BOTH GENDERS ------------ ")
feats_diff_ris_both_gender = np.unique(
    np.concatenate((
        features_that_should_have_distinct_ris_male, 
        features_that_should_have_distinct_ris_female)))
print ("Combining both genders:", len(feats_diff_ris_both_gender))
print ("Number of features with p>= 0.001 (Combining both genders):", len(feats_names) - len(feats_diff_ris_both_gender))

### Explore different corpora
- Using Mann-Whitney U test
- Using the speech task read speech, and comparing the corpus CLAC and TIMIT.
- Two separate analyses for male and female, given that previously the decision was to derive different models for male and female subjects.

In [None]:
# Partitioning by corpus, read speech only: use the Mann-Whitney U test to
# check which features differ between TIMIT and CLAC, separately per gender.
main_output_dir = MAIN_OUTPUT_DIR + "partitioning/corpus/"

# Normalization conditions: one scaler is trained per gender (female, male).
_gender_scaler_conditions = [
    {
        "train_scaler": True,
        "pretrained_scaler": None,
        "cols_for_data_selection": {"gender": [_gender_name]},
    }
    for _gender_name in ("female", "male")
]

# Outlier removal settings: the ids come from the `outlier_ids` list.
_outlier_removal_conf = {
    "from_id_list": True,
    "outlier_id_list": outlier_ids,
    "method": None,  # e.g. "mcd_mahalanobis_dist"
    "pca_oultier_limits": None,
    "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
    "save_data_after": False,
}

# Config for the reference speech model
config = {
    "feature_dir": ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["clac_healthy_w_vowel", "timit_concatenated"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "/ref_model/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "both",  # or "male", "female"
    "age": {"min": None, "max": None},  # age range; each bound is None or an int
    "subselect_data": {"task_type": ["read_speech"]},

    # outlier removal
    "outlier_removal": True,
    "outlier_removal_conf": _outlier_removal_conf,
    "distinct_feats_for_outlier_detect": True,

    # normalization
    "scale": True,
    "multi_normalize": True,
    "multi_normalize_conditions": _gender_scaler_conditions,
    "scaling_mode": "mean_0_std_1",  # options: None, "min_max", "mean_0_std_1", mean_1_std_1
    "train_scaler": True,
    "save_data_after_normalize": False,
}

# Build the reference model and take working copies of its tables
speech_model = SpeechModel(config)
feats_names = FEATURE_NAMES["egemaps"] + FEATURE_NAMES["praat"]
meta = speech_model.meta_df.copy()
feats = speech_model.feats_df.copy()

# Mann-Whitney U test, TIMIT vs CLAC, first for male and then for female speakers
_distinct_ris_by_gender = {}
for _gender in ("male", "female"):
    print(" ------------ ANALYSIS FOR " + _gender.upper() + "  ------------")
    _distinct_ris_by_gender[_gender] = feats_failling_mwut(
        meta,
        feats,
        feats_names,
        {"origin_dataset": ["timit_concatenated"]},
        {"origin_dataset": ["clac_healthy_w_vowel"]},
        cols_for_data_selection={"gender": [_gender]},
    )
    _n_distinct = len(_distinct_ris_by_gender[_gender])

    print(
        _n_distinct,
        " features should have distinct RIs for each group, out of ",
        len(feats_names),
    )
    print(
        "On the other hand, we may derive the same RI for both groups for",
        len(feats_names) - _n_distinct,
        "features.",
    )

features_that_should_have_distinct_ris_male = _distinct_ris_by_gender["male"]
features_that_should_have_distinct_ris_female = _distinct_ris_by_gender["female"]

# Union of the features flagged for either gender
print(" ------------ COMBINING BOTH GENDERS ------------ ")
_flagged_male_and_female = np.concatenate(
    (
        features_that_should_have_distinct_ris_male,
        features_that_should_have_distinct_ris_female,
    )
)
feats_diff_ris_both_gender = np.unique(_flagged_male_and_female)
print("Combining both genders:", len(feats_diff_ris_both_gender))
print(
    "Number of features with p>= 0.001 (Combining both genders):",
    len(feats_names) - len(feats_diff_ris_both_gender),
)

**What if we normalized each corpus separately? In that case, can we combine different corpora under the same Reference Interval?**

In [None]:
# In this experiment the Speech Model is created WITHOUT any normalization
# ("scale" and "multi_normalize" are both False). Normalization is requested
# only inside feats_failling_mwut, via separate_scale=True.
main_output_dir = MAIN_OUTPUT_DIR + "partitioning/corpus_normalized/"

# Outlier removal settings: the ids come from the `outlier_ids` list.
_outlier_removal_conf = {
    "from_id_list": True,
    "outlier_id_list": outlier_ids,
    "method": None,  # e.g. "mcd_mahalanobis_dist"
    "pca_oultier_limits": None,
    "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
    "save_data_after": False,
}

# Config for the reference speech model
config = {
    "feature_dir": ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["clac_healthy_w_vowel", "timit_concatenated"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "/ref_model/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "both",  # or "male", "female"
    "age": {"min": None, "max": None},  # age range; each bound is None or an int
    "subselect_data": {"task_type": ["read_speech"]},

    # outlier removal
    "outlier_removal": True,
    "outlier_removal_conf": _outlier_removal_conf,
    "distinct_feats_for_outlier_detect": True,

    # normalization (disabled at model level for this experiment)
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": "mean_0_std_1",  # options: None, "min_max", "mean_0_std_1", mean_1_std_1
    "train_scaler": True,
    "save_data_after_normalize": False,
}

# Build the reference model and take working copies of its tables
speech_model = SpeechModel(config)
feats_names = FEATURE_NAMES["egemaps"] + FEATURE_NAMES["praat"]
meta = speech_model.meta_df.copy()
feats = speech_model.feats_df.copy()

# Mann-Whitney U test, TIMIT vs CLAC, first for male and then for female speakers
_distinct_ris_by_gender = {}
for _gender in ("male", "female"):
    print(" ------------ ANALYSIS FOR " + _gender.upper() + "  ------------")
    _distinct_ris_by_gender[_gender] = feats_failling_mwut(
        meta,
        feats,
        feats_names,
        {"origin_dataset": ["timit_concatenated"]},
        {"origin_dataset": ["clac_healthy_w_vowel"]},
        cols_for_data_selection={"gender": [_gender]},
        separate_scale=True,
    )
    _n_distinct = len(_distinct_ris_by_gender[_gender])

    print(
        _n_distinct,
        " features should have distinct RIs for each group, out of ",
        len(feats_names),
    )
    print(
        "On the other hand, we may derive the same RI for both groups for",
        len(feats_names) - _n_distinct,
        "features.",
    )

features_that_should_have_distinct_ris_male = _distinct_ris_by_gender["male"]
features_that_should_have_distinct_ris_female = _distinct_ris_by_gender["female"]

# Union of the features flagged for either gender
print(" ------------ COMBINING BOTH GENDERS ------------ ")
_flagged_male_and_female = np.concatenate(
    (
        features_that_should_have_distinct_ris_male,
        features_that_should_have_distinct_ris_female,
    )
)
feats_diff_ris_both_gender = np.unique(_flagged_male_and_female)
print("Combining both genders:", len(feats_diff_ris_both_gender))
print(
    "Number of features with p>= 0.001 (Combining both genders):",
    len(feats_names) - len(feats_diff_ris_both_gender),
)

# 5. Deriving valid reference intervals

We derive reference intervals via the non-parametric approach, using the 2.5 and 97.5 percentiles.
To provide a confidence measure on the estimated RI, we derive 99% confidence intervals (CIs) for both the lower and upper limits of the RI via bootstrapping.
Data was resampled 1000 times to estimate the CIs. If the CI for any of the reference limits is larger than 20% of the RI, then the RI is not considered valid. 
After the bootstrapping, we fixed each RI as the outer bounds of the CI.

In [None]:
# Find the features whose non-parametric reference interval is not valid,
# for male and female reference speakers separately.
main_output_dir = MAIN_OUTPUT_DIR + "/valid_intervals/"

# Corpora that make up the reference population
_reference_datasets = [
    "clac_healthy_w_vowel",
    "voxceleb_annotated_usa_concatenated",
    "timit_concatenated",
]

# Config for the (male) reference speech model
reference_config_male = {
    "feature_dir": ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": list(_reference_datasets),
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "/ref_model/male/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "male",  # or "both", "female"
    "age": {"min": None, "max": None},  # age range; each bound is None or an int
    "subselect_data": None,  # e.g. {"task_type": ["pic_description"]}

    # outlier removal: ids come from the `outlier_ids` list
    "outlier_removal": True,
    "outlier_removal_conf": {
        "from_id_list": True,
        "outlier_id_list": outlier_ids,
        "method": None,
        "pca_oultier_limits": None,
        "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
        "save_data_after": False,
    },
    "distinct_feats_for_outlier_detect": True,

    # RIs:
    "use_50p_for_ri": True,
    "use_outer_bound_of_CI": False,

    # normalization -> the model is not normalized on construction; that is
    #                  done below, after the features are separated by speech
    #                  task, using one trained scaler per corpus.
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": [
        {
            "train_scaler": True,
            "pretrained_scaler": None,
            "cols_for_data_selection": {"origin_dataset": [_dataset_name]},
        }
        for _dataset_name in _reference_datasets
    ],
    "scaling_mode": "mean_0_std_1",  # options: None, "min_max", "mean_0_std_1", mean_1_std_1
    "train_scaler": True,
    "save_data_after_normalize": False,
}


def _fit_and_flag_invalid_ris(banner, model_config):
    """Build a reference model and report its invalid reference intervals.

    Prints `banner`, builds a SpeechModel from `model_config`, separates the
    features by speech task, normalizes with the config's
    "multi_normalize_conditions" and derives the RIs with method "np_ri".
    A "valid_interval" column (both CI limits flagged valid) is added to the
    model's `ri_df` table, and the invalid features are printed.

    Returns the tuple (model, ri table, array of invalid feature names).
    """
    print(banner)
    model = SpeechModel(model_config)
    model.transform_feats_df_by_task(FEATURES_BY_TASK)
    model.multi_normalize(model_config["multi_normalize_conditions"])
    model.get_reference_intervals(method="np_ri", return_friendly=False)

    ri_table = model.ri_df
    ri_table["valid_interval"] = (
        ri_table["valid_CI_lower_limit"] & ri_table["valid_CI_upper_limit"]
    )
    is_invalid = ri_table["valid_interval"].eq(False)
    invalid_features = ri_table[is_invalid].feature.values

    print("Invalid reference intervals via the non parametric approach:")
    print(len(ri_table[is_invalid]))
    print(invalid_features)
    return model, ri_table, invalid_features


# MALES
male_reference_speech_model, ri_m, invalid_ri_m = _fit_and_flag_invalid_ris(
    "-------------------------- MALES -------------------------------",
    reference_config_male,
)

# FEMALES: same settings, only the sex and the output directory change
reference_config_female = {
    **reference_config_male,
    "sex": "female",
    "output_dir": main_output_dir + "/ref_model/female/",
}
female_reference_speech_model, ri_f, invalid_ri_f = _fit_and_flag_invalid_ris(
    "-------------------------- FEMALES -------------------------------",
    reference_config_female,
)

# Features that are invalid for at least one of the two genders
print("------------- Union of the ivalid features for both genders -------------")
_invalid_male_and_female = np.concatenate([invalid_ri_m, invalid_ri_f])
invalid_ris = np.unique(_invalid_male_and_female)
print(invalid_ris)

**Invalid Intervals:**

'A#F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
'A#F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
'A#F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
'A#F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
'A#F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
'A#HNRdBACF_sma3nz_amean', 
'A#HNRdBACF_sma3nz_stddevNorm',
'A#localabsoluteJitter', 
'A#ppq5Jitter',
'Pic#F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
'Pic#ambiguous_ref_ratio', 
'Pic#discourse_marker_rate'

# 6. Finally, the reference models 
We build one model for male and one for female speakers,
applying all the decisions we have made so far.

In [None]:
# Build the final reference models (male and female) and save their RIs.
main_output_dir = MAIN_OUTPUT_DIR + "experiments/"
# Features excluded from the analysis because they do not have valid RIs
drop_feats = FEATURE_NAMES["invalid_intervals"]

# Corpora that make up the reference population
_reference_datasets = [
    "clac_healthy_w_vowel",
    "voxceleb_annotated_usa_concatenated",
    "timit_concatenated",
]

# Config for the (male) reference speech model
reference_config_male = {
    "feature_dir": ROOT + "/features/",
    "metadata_dir": ROOT + "/data_info/",
    "datasets": list(_reference_datasets),
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "/ref_model/male/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "male",  # or "both", "female"
    "age": {"min": None, "max": None},  # age range; each bound is None or an int
    "subselect_data": None,

    # outlier removal: ids come from the `outlier_ids` list
    "outlier_removal": True,
    "outlier_removal_conf": {
        "from_id_list": True,
        "outlier_id_list": outlier_ids,
        "method": None,  # e.g. "mcd_mahalanobis_dist"
        "pca_oultier_limits": None,
        "attribute_to_color_in_pca": None,  # e.g. "origin_dataset", "task_type"
        "save_data_after": False,
    },
    "distinct_feats_for_outlier_detect": True,

    # RIs:
    "use_50p_for_ri": False,
    "use_outer_bound_of_CI": True,

    # normalization -> not applied on construction; it is done below, after
    #                  the features are separated by speech task.
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": [
        {
            "train_scaler": True,
            "pretrained_scaler": None,
            "cols_for_data_selection": {"origin_dataset": [_dataset_name]},
        }
        for _dataset_name in _reference_datasets
    ],
    "scaling_mode": "mean_0_std_1",  # options: None, "min_max", "mean_0_std_1", mean_1_std_1
    "train_scaler": True,
    "save_data_after_normalize": True,
}


def _fit_reference_model(model_config):
    """Build, normalize and save one reference model.

    Builds a SpeechModel from `model_config`, separates the features by
    speech task (dropping `drop_feats`), normalizes with the config's
    "multi_normalize_conditions", derives the RIs with method "np_ri" and
    writes the model's `ri_df` to "<output_dir>/ref_intervals.csv".

    Returns the tuple (model, value returned by get_reference_intervals).
    """
    model = SpeechModel(model_config)
    model.transform_feats_df_by_task(FEATURES_BY_TASK, drop_features=drop_feats)
    model.multi_normalize(model_config["multi_normalize_conditions"])
    ri_result = model.get_reference_intervals(method="np_ri", return_friendly=False)

    ri_csv_path = model_config["output_dir"] + "/ref_intervals.csv"
    model.ri_df.to_csv(ri_csv_path, index=False)
    return model, ri_result


######### MALE MODEL:
ref_model_male, RI_male_df = _fit_reference_model(reference_config_male)

######### FEMALE MODEL:
# Same settings, only the sex and the output directory change
reference_config_female = {
    **reference_config_male,
    "sex": "female",
    "output_dir": main_output_dir + "/ref_model/female/",
}
ref_model_female, RI_female_df = _fit_reference_model(reference_config_female)


In [None]:
# Radar plots of the two reference models (male first, then female).
# Both plots use the same options: mean drawn in green, RI drawn, not saved.
_radar_options = {
    "plot_mean": True,
    "save_fig": False,
    "plot_ri": True,
    "mean_color": "green",
}

# male speakers
print("[INFO]: Gernerating radar plot...")
fig_m = ref_model_male.build_radar_plot(**_radar_options)

# female speakers
print("[INFO]: Gernerating radar plot...")
fig_f = ref_model_female.build_radar_plot(**_radar_options)

# 7. Comparison of the reference models with the datasets for disease detection. 
We compare the reference intervals defined for the reference population 
with pc-gita (controls and patients of Parkinson's disease) and 
DementiaBank (controls and patients of Alzheimer's disease and depression).
We compare male and female separately.

####  Experiment in pc-gita vowel A, female 

In [None]:
# Experiment: PC-GITA vowel data, female speakers.
# The control group (pd_label 0) and the PD group (pd_label 1) are compared
# against the female reference model.

# Config for the control group
pcg_controls_config = {
    "feature_dir": ROOT + "/features/",
    # Leading "/" added for consistency with "feature_dir" and the reference
    # configs: without it the path is wrong when ROOT has no trailing slash.
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["pcgita_vowel"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "pcgita_vowel/female/controls/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "female",
    "age": {  # allows specifying an age range
        "exact": None,
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    "subselect_data": {
        "pd_label": [0],
    },

    # outlier removal: disabled for this dataset
    "outlier_removal": False,
    "outlier_removal_conf": {},
    "distinct_feats_for_outlier_detect": True,

    # RIs:
    "use_50p_for_ri": False,
    "use_outer_bound_of_CI": True,

    # normalization: not done here; norm_strat below selects the strategy
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": "mean_0_std_1",
    "train_scaler": False,
    "save_data_after_normalize": False,
}

# Config for the PD group: same settings, different label and output directory
pcg_pd_config = {
    **pcg_controls_config,
    "subselect_data": {"pd_label": [1]},
    "output_dir": main_output_dir + "pcgita_vowel/female/pd/",
}

# Run the experiment; "control_scaler" means the controls train the scaler
summary_df, cm, disease_models = experiment(
    ref_speech_model=ref_model_female,
    control_config=pcg_controls_config,
    disease_configs=[pcg_pd_config],
    disease_names=["PD"],
    features_to_drop=drop_feats,
    features_by_task=FEATURES_BY_TASK,
    norm_strat="control_scaler",  # alternatives: "ref_scaler", "none"
    ref_scaler=None,
)

# Sort the summary by "prop_out_ri_over_subgroup#PD" (descending), round to
# 3 decimals and save it as CSV.
summary_df = summary_df.sort_values(
    by=["prop_out_ri_over_subgroup#PD"], ascending=False
).round(3)
summary_df.to_csv(
    main_output_dir + "pcgita_vowel/female/summary_results.csv",
    index=False,
)

# Preview
summary_df

####  New experiment in pc-gita vowel A, male 

In [None]:
# Experiment: PC-GITA vowel data, male speakers.
# The control group (pd_label 0) and the PD group (pd_label 1) are compared
# against the male reference model.

# Config for the control group
pcg_controls_config = {
    "feature_dir": ROOT + "/features/",
    # Leading "/" added for consistency with "feature_dir" and the reference
    # configs: without it the path is wrong when ROOT has no trailing slash.
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["pcgita_vowel"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "pcgita_vowel/male/controls/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "male",  # or "female", "both"
    "age": {  # allows specifying an age range
        "exact": None,
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    "subselect_data": {
        "pd_label": [0],
    },

    # outlier removal: disabled for this dataset
    "outlier_removal": False,
    "outlier_removal_conf": {},
    "distinct_feats_for_outlier_detect": True,

    # RIs:
    "use_50p_for_ri": False,
    "use_outer_bound_of_CI": True,

    # normalization: not done here; norm_strat below selects the strategy
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": "mean_0_std_1",
    "train_scaler": False,
    "save_data_after_normalize": False,
}

# Config for the PD group: same settings, different label and output directory
pcg_pd_config = {
    **pcg_controls_config,
    "subselect_data": {"pd_label": [1]},
    "output_dir": main_output_dir + "pcgita_vowel/male/pd/",
}

# Run the experiment; "control_scaler" means the controls train the scaler
summary_df, cm, disease_models = experiment(
    ref_speech_model=ref_model_male,
    control_config=pcg_controls_config,
    disease_configs=[pcg_pd_config],
    disease_names=["PD"],
    features_to_drop=drop_feats,
    features_by_task=FEATURES_BY_TASK,
    norm_strat="control_scaler",  # alternatives: "ref_scaler", "none"
    ref_scaler=None,
)

# Sort the summary by "prop_out_ri_over_subgroup#PD" (descending), round to
# 3 decimals and save it as CSV.
summary_df = summary_df.sort_values(
    by=["prop_out_ri_over_subgroup#PD"], ascending=False
).round(3)
summary_df.to_csv(
    main_output_dir + "pcgita_vowel/male/summary_results.csv",
    index=False,
)

# Preview
summary_df

####  New experiment in dementia bank, female 

In [None]:
# Experiment: DementiaBank, female speakers.
# Controls and three disease groups (AD, depression, AD + depression) are
# compared against the female reference model. Groups are selected through
# "transcript_diag" and "hamilton".
# NOTE(review): the "hamilton" tuples look like (min, max) score ranges, with
# 10000 / 100000 acting as "no upper limit" - confirm against SpeechModel.

# Config for the control group
db_controls_config = {
    "feature_dir": ROOT + "/features/",
    # Leading "/" added for consistency with "feature_dir" and the reference
    # configs: without it the path is wrong when ROOT has no trailing slash.
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["dementiabank"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "db/female/controls/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "female",  # or "male", "both"
    "age": {  # allows specifying an age range
        "exact": None,
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    "subselect_data": {
        "transcript_diag": ["Control"],
        "hamilton": (0, 7),
    },

    # outlier removal: disabled for this dataset
    "outlier_removal": False,
    "outlier_removal_conf": {},
    "distinct_feats_for_outlier_detect": True,

    # RIs:
    "use_50p_for_ri": False,
    "use_outer_bound_of_CI": True,

    # normalization: not done here; norm_strat below selects the strategy
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": "mean_0_std_1",
    "train_scaler": False,
    "save_data_after_normalize": False,
}

# AD config
ad_config = {
    **db_controls_config,
    "subselect_data": {
        "transcript_diag": ["ProbableAD", "Probable"],
        "hamilton": (0, 7),
    },
    "output_dir": main_output_dir + "db/female/ad/",
}

# depression config
dep_config = {
    **db_controls_config,
    "subselect_data": {
        "transcript_diag": ["Control"],
        "hamilton": (8, 10000),
    },
    "output_dir": main_output_dir + "db/female/dep/",
}

# depression + AD config
dep_ad_config = {
    **db_controls_config,
    "subselect_data": {
        "transcript_diag": ["ProbableAD", "Probable"],
        "hamilton": (8, 100000),
    },
    "output_dir": main_output_dir + "db/female/dep_ad/",
}

# Run the experiment; "control_scaler" means the controls train the scaler
summary_df, cm, disease_models = experiment(
    ref_speech_model=ref_model_female,
    control_config=db_controls_config,
    disease_configs=[ad_config, dep_config, dep_ad_config],
    disease_names=["AD", "Dep", "AD+Dep"],
    features_to_drop=drop_feats,
    features_by_task=FEATURES_BY_TASK,
    norm_strat="control_scaler",  # alternatives: "ref_scaler", "none"
    ref_scaler=None,
)

# Sort the summary by "prop_out_ri_over_subgroup#AD" (descending), round to
# 3 decimals and save it as CSV.
summary_df = summary_df.sort_values(
    by=["prop_out_ri_over_subgroup#AD"], ascending=False
).round(3)
summary_df.to_csv(
    main_output_dir + "db/female/summary_results.csv",
    index=False,
)

# Preview
summary_df

####  New experiment in dementia bank, male 

In [None]:
# Experiment: DementiaBank, male speakers.
# Controls and three disease groups (AD, depression, AD + depression) are
# compared against the male reference model. Groups are selected through
# "transcript_diag" and "hamilton".
# NOTE(review): the "hamilton" tuples look like (min, max) score ranges, with
# 10000 / 100000 acting as "no upper limit" - confirm against SpeechModel.

# Config for the control group
db_controls_config = {
    "feature_dir": ROOT + "/features/",
    # Leading "/" added for consistency with "feature_dir" and the reference
    # configs: without it the path is wrong when ROOT has no trailing slash.
    "metadata_dir": ROOT + "/data_info/",
    "datasets": ["dementiabank"],
    "features": "large_set",
    "feature_columns_to_drop": [],
    "output_dir": main_output_dir + "db/male/controls/",
    "metadata_report": False,  # TODO
    "use_feat_subset": None,

    # data sub-selection
    "sex": "male",  # or "female", "both"
    "age": {  # allows specifying an age range
        "exact": None,
        "min": None,  # either None or an int value
        "max": None,  # either None or an int value
    },
    "subselect_data": {
        "transcript_diag": ["Control"],
        "hamilton": (0, 7),
    },

    # outlier removal: disabled for this dataset
    "outlier_removal": False,
    "outlier_removal_conf": {},
    "distinct_feats_for_outlier_detect": True,

    # RIs:
    "use_50p_for_ri": False,
    "use_outer_bound_of_CI": True,

    # normalization: not done here; norm_strat below selects the strategy
    "scale": False,
    "multi_normalize": False,
    "multi_normalize_conditions": None,
    "scaling_mode": "mean_0_std_1",
    "train_scaler": False,
    "save_data_after_normalize": False,
}

# AD config
ad_config = {
    **db_controls_config,
    "subselect_data": {
        "transcript_diag": ["ProbableAD", "Probable"],
        "hamilton": (0, 7),
    },
    "output_dir": main_output_dir + "db/male/ad/",
}

# depression config
dep_config = {
    **db_controls_config,
    "subselect_data": {
        "transcript_diag": ["Control"],
        "hamilton": (8, 10000),
    },
    "output_dir": main_output_dir + "db/male/dep/",
}

# depression + AD config
dep_ad_config = {
    **db_controls_config,
    "subselect_data": {
        "transcript_diag": ["ProbableAD", "Probable"],
        "hamilton": (8, 100000),
    },
    "output_dir": main_output_dir + "db/male/dep_ad/",
}

# Run the experiment; "control_scaler" means the controls train the scaler
summary_df, cm, disease_models = experiment(
    ref_speech_model=ref_model_male,
    control_config=db_controls_config,
    disease_configs=[ad_config, dep_config, dep_ad_config],
    disease_names=["AD", "Dep", "AD+Dep"],
    features_to_drop=drop_feats,
    features_by_task=FEATURES_BY_TASK,
    norm_strat="control_scaler",  # alternatives: "ref_scaler", "none"
    ref_scaler=None,
)

# Sort the summary by "prop_out_ri_over_subgroup#AD" (descending), round to
# 3 decimals and save it as CSV.
summary_df = summary_df.sort_values(
    by=["prop_out_ri_over_subgroup#AD"], ascending=False
).round(3)
summary_df.to_csv(
    main_output_dir + "db/male/summary_results.csv",
    index=False,
)

# Preview
summary_df