# Check for outliers
Plot kernel density estimates (smoothed histograms) of the formant measurements to identify outliers on a by-participant and by-age-group basis, in order to determine whether we should retrack formants with a different LPC order.

In [None]:
from espspy.readers import EspsFormantReader
import os, sys, fnmatch
from audiolabel import read_label
import re
import pandas as pd
from sys import argv
from phonlab.utils import dir2df
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the formant measurement tables used by the plotting cells below.
# Both CSV paths are relative, so they are resolved against the current
# working directory.

# F1 & F2 where lpc order = 8 for 4-6 y/os
lpc8 = pd.read_csv('med_formants_lpcorder_8.csv')

# all formants from all speakers
allages = pd.read_csv('med_formants_2018.csv')

### Plot individual adults

In [None]:
# Adult speakers: one figure per speaker with the F1-F4 density curves
# overlaid on a single set of axes (this is the only cell that plots F4).
# Author's observation: lots of mis-classified F2.

adult_spkrs = allages.loc[allages.age_yrs == 'adult']

for adult in adult_spkrs.spkr.unique():

    print(adult)

    # keep only the rows that belong to this speaker
    sub_df = adult_spkrs.loc[adult_spkrs.spkr == adult]

    # a new figure identified by the speaker ID keeps each speaker's plot separate
    plt.figure(adult)

    # The first call receives ax=None (distplot's default), so it draws on the
    # current axes of the figure created above; the axes it returns are then
    # passed back in so the remaining formants are overlaid on the same plot.
    ax2 = None
    for formant_col in ("f1_midpt_med", "f2_midpt_med",
                        "f3_midpt_med", "f4_midpt_med"):
        ax2 = sns.distplot(
            sub_df[formant_col],
            hist=False,
            kde=True,
            kde_kws={'shade': True, 'linewidth': 3},
            ax=ax2,
        )

    ax2.set(xlabel='Formant frequency (Hz)', ylabel='Probability density')

    # hand-placed curve labels; positions are (x, y) in the axes' data coordinates
    for x_pos, y_pos, formant_name in (
        (350, 0.001, "F1"),
        (1200, 0.0003, "F2"),
        (2300, 0.0004, "F3"),
        (3500, 0.00075, "F4"),
    ):
        ax2.text(x_pos, y_pos, formant_name)

    # large speaker ID so the figure can be identified at a glance
    ax2.text(3800, 0.0028, adult, fontsize=22)



### Plot individual children

In [None]:
# Child speakers: one figure per speaker with the F1-F3 density curves
# overlaid on a single set of axes (no F4 for children).
# lpc order = 10

child_spkrs = allages.loc[allages.age_yrs != 'adult']

for child in child_spkrs.spkr.unique():

    print(child)

    # keep only the rows that belong to this speaker
    sub_df = child_spkrs.loc[child_spkrs.spkr == child]

    # a new figure identified by the speaker ID keeps each speaker's plot separate
    plt.figure(child)

    # The first call receives ax=None (distplot's default), so it draws on the
    # current axes of the figure created above; the axes it returns are then
    # passed back in so the remaining formants are overlaid on the same plot.
    ax2 = None
    for formant_col in ("f1_midpt_med", "f2_midpt_med", "f3_midpt_med"):
        ax2 = sns.distplot(
            sub_df[formant_col],
            hist=False,
            kde=True,
            kde_kws={'shade': True, 'linewidth': 3},
            ax=ax2,
        )

    ax2.set(xlabel='Formant frequency (Hz)', ylabel='Probability density')

    # hand-placed curve labels; positions are (x, y) in the axes' data coordinates
    ax2.text(350, 0.001, "F1")
    ax2.text(1800, 0.0003, "F2")
    ax2.text(3300, 0.0004, "F3")

    # The age is taken from the speaker's first row.
    # NOTE(review): presumably every row of a speaker carries the same age_yrs - confirm.
    c_age = sub_df['age_yrs'].iloc[0]

    # str.format converts its arguments to text, so the label is still built
    # when the speaker ID or the age was parsed from the CSV as a number;
    # ''.join() would raise TypeError on any non-string item.
    label = '{} {}'.format(child, c_age)
    ax2.text(3000, 0.001, label, fontsize=22)


### Plot by age group

In [None]:
# All age groups: one figure per distinct age_yrs value with the F1-F3
# density curves overlaid on a single set of axes.
# lpc order = 10
for a in allages.age_yrs.unique():

    print(a)

    # keep only the rows that belong to this age group
    sub_df = allages.loc[allages.age_yrs == a]

    # to separate the plots (remove if want overlaid)
    plt.figure(a)

    # The first call receives ax=None (distplot's default), so it draws on the
    # current axes; the axes it returns are then passed back in so the
    # remaining formants are overlaid on the same plot.
    ax2 = None
    for formant_col in ("f1_midpt_med", "f2_midpt_med", "f3_midpt_med"):
        ax2 = sns.distplot(
            sub_df[formant_col],
            hist=False,
            kde=True,
            kde_kws={'shade': True, 'linewidth': 3},
            ax=ax2,
        )

    ax2.set(xlabel='Formant frequency (Hz)', ylabel='Probability density')

    # hand-placed curve labels; positions are (x, y) in the axes' data coordinates
    for x_pos, y_pos, formant_name in (
        (350, 0.001, "F1"),
        (1800, 0.0003, "F2"),
        (3300, 0.0004, "F3"),
    ):
        ax2.text(x_pos, y_pos, formant_name)

    # large age-group label so the figure can be identified at a glance
    ax2.text(3000, 0.001, a, fontsize=22)

### Plot by age group when LPC order = 8 (4-6 y/os)

In [None]:
# Age-group plots for the measurements tracked with lpc order = 8:
# one figure per distinct age_yrs value with the F1 and F2 density curves
# overlaid on a single set of axes.
# NOTE(review): this cell was originally commented "only 5-6 y/os", while the
# notes elsewhere in the notebook describe this table as 4-6 y/os - confirm.
# NOTE(review): this cell reads the *_qrtr_med columns, whereas the other
# plotting cells read *_midpt_med - confirm that this is intended.
# Author's observation: seems to misclassify F3 as F2 because when lpc=8 it
# only looks for 2 formants.
for a in lpc8.age_yrs.unique():

    print(a)

    # keep only the rows that belong to this age group
    sub_df = lpc8.loc[lpc8.age_yrs == a]

    # To separate the plots (remove if want overlaid). The figure identifier
    # carries an lpc-8 suffix because plt.figure() re-activates an existing
    # figure when it is given an identifier that is already in use; a bare
    # age value could match a figure that is still open from an earlier cell,
    # and these curves would then be drawn on top of that figure.
    plt.figure('{} (lpc order 8)'.format(a))

    # The first call receives ax=None (distplot's default), so it draws on the
    # current axes of the figure created above; the axes it returns are then
    # passed back in so F2 is overlaid on the same plot.
    ax2 = None
    for formant_col in ("f1_qrtr_med", "f2_qrtr_med"):
        ax2 = sns.distplot(
            sub_df[formant_col],
            hist=False,
            kde=True,
            kde_kws={'shade': True, 'linewidth': 3},
            ax=ax2,
        )

    ax2.set(xlabel='Formant frequency (Hz)', ylabel='Probability density')

    # hand-placed curve labels; positions are (x, y) in the axes' data coordinates
    ax2.text(350, 0.001, "F1")
    ax2.text(1800, 0.0003, "F2")

    # large age-group label so the figure can be identified at a glance
    ax2.text(3000, 0.001, a, fontsize=22)