# Appendix A Tables

This notebook creates the formatted descriptive statistics from appendix a. To create the data for these tables, all of the child data is consolodated into one dataframe, and then basic statistics are calculated. 


### Imports and Load Data

In [7]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
current_dir = %pwd
if not current_dir == '/home/melissa/Dropbox/experiments/python/':
    %cd '/home/melissa/Dropbox/experiments/python/'

/home/melissa/Dropbox/experiments/python


In [8]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sci
import seaborn as sns
from statannot import add_stat_annotation

In [9]:
homedir = "/home/melissa/Dropbox/experiments/python/current_projects/jcl_multisyllabic_neighborhoods_2021/"
datadir = "data/combined/"
data = Path(homedir+datadir)

In [10]:
convert = {'PACT':'float64','Pct Child':'float64','Pct Adult':'float64',
           'len_syllables':'int32','ND':'int32', 'SOND':'int32'}
data3 = pd.read_pickle(data/'data3.pickle')
data3 = data3.astype(convert)
data4 = pd.read_pickle(data/'data4.pickle')
data4 = data4.astype(convert)
data6 = pd.read_pickle(data/'data6.pickle')
data6 = data6.astype(convert)

In [11]:
data3.columns

Index(['PACT', 'Pct Child', 'Pct Adult', 'syllables', 'len_syllables',
       'len_phonemes', 'stress_syl_pos', 'orthographic', 'ND', 'SOND', 'PLD20',
       'PFEAT20', 'iscvc', 'ismulti'],
      dtype='object')

In [12]:
data3.dtypes

PACT              float64
Pct Child         float64
Pct Adult         float64
syllables          object
len_syllables       int32
len_phonemes       object
stress_syl_pos     object
orthographic       object
ND                  int32
SOND                int32
PLD20             float64
PFEAT20           float64
iscvc                bool
ismulti              bool
dtype: object

## Creating Dataframes

In [13]:
i3 = set(list(data3.index.values))
i4 = set(list(data4.index.values))
i6 = set(list(data6.index.values))
alli = i3 | i4 | i6

In [14]:
sim = 'ND'            
phon_cvc = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
phon_cvc['Three'] = data3[sim][data3.iscvc]
phon_cvc['Four'] = data4[sim][data4.iscvc]
phon_cvc['Six'] = data6[sim][data6.iscvc]

In [15]:
sim = 'ND'            
phon_multi = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
phon_multi['Three'] = data3[sim][data3.ismulti]
phon_multi['Four'] = data4[sim][data4.ismulti]
phon_multi['Six'] = data6[sim][data6.ismulti]

In [16]:
sim = 'SOND'            
son = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
son['Three'] = data3[sim][data3.ismulti]
son['Four'] = data4[sim][data4.ismulti]
son['Six'] = data6[sim][data6.ismulti]

In [17]:
sim = 'PLD20'            
pld20 = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
pld20['Three'] = data3[sim][data3.ismulti]
pld20['Four'] = data4[sim][data4.ismulti]
pld20['Six'] = data6[sim][data6.ismulti]

In [18]:
sim = 'PFEAT20'            
pfeat = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
pfeat['Three'] = data3[sim][data3.ismulti]
pfeat['Four'] = data4[sim][data4.ismulti]
pfeat['Six'] = data6[sim][data6.ismulti]

In [19]:
# pcts - all words spoken by children
pcts = pd.DataFrame(columns=['c3','a3','c4','a4','c6','a6'],index=alli)
pcts['c3'] = np.exp(data3['Pct Child'][np.isfinite(data3['Pct Adult'])])
pcts['a3'] = np.exp(data3['Pct Adult'][np.isfinite(data3['Pct Adult'])])

pcts['c4'] = np.exp(data4['Pct Child'][np.isfinite(data4['Pct Adult'])])
pcts['a4'] = np.exp(data4['Pct Adult'][np.isfinite(data4['Pct Adult'])])

pcts['c6'] = np.exp(data6['Pct Child'][np.isfinite(data6['Pct Adult'])])
pcts['a6'] = np.exp(data6['Pct Adult'][np.isfinite(data6['Pct Adult'])])

In [20]:
# all PACT
pact_all = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
pact_all['Three'] = data3['PACT']
pact_all['Four'] = data4['PACT']
pact_all['Six'] = data6['PACT']

In [21]:
# multisyllabic PACT
pact_multi = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
pact_multi['Three'] = data3['PACT'][data3.ismulti]
pact_multi['Four'] = data4['PACT'][data4.ismulti]
pact_multi['Six'] = data6['PACT'][data6.ismulti]

In [22]:
# CVC PACT
pact_cvc = pd.DataFrame(columns=['Three','Four','Six'],index=alli)
pact_cvc['Three'] = data3['PACT'][data3.iscvc]
pact_cvc['Four'] = data4['PACT'][data4.iscvc]
pact_cvc['Six'] = data6['PACT'][data6.iscvc]

# Descriptive Tables

In [23]:
def create_tables(data):
    upper = np.max(data)
    lower = np.min(data)
    mean = np.mean(data)
    std = np.std(data)
    median = np.median(data[np.isfinite(data)])
    mode,count = sci.mode(data)
    return lower, upper, mean, std, median, mode[0], count[0]

In [24]:
desc = pd.DataFrame(index=['min', 'max', 'mean', 'std', 'median', 'mode', 'mode count'])

In [25]:
# CVC ND
desc = desc.assign(sim_3_nd_cvc=create_tables(phon_cvc.Three))
desc = desc.assign(sim_4_nd_cvc=create_tables(phon_cvc.Four))
desc = desc.assign(sim_6_nd_cvc=create_tables(phon_cvc.Six))


In [26]:
# multisyllabic ND
desc = desc.assign(sim_3_nd_multi=create_tables(phon_multi.Three))
desc = desc.assign(sim_4_nd_multi=create_tables(phon_multi.Four))
desc = desc.assign(sim_6_nd_multi=create_tables(phon_multi.Six))

# multisyllabic levenshtein distance
desc = desc.assign(sim_3_lev_multi=create_tables(pld20.Three))
desc = desc.assign(sim_4_lev_multi=create_tables(pld20.Four))
desc = desc.assign(sim_6_lev_multi=create_tables(pld20.Six))

# multisyllabic p-feat20
desc = desc.assign(sim_3_pfeat_multi=create_tables(pfeat.Three))
desc = desc.assign(sim_4_pfeat_multi=create_tables(pfeat.Four))
desc = desc.assign(sim_6_pfeat_multi=create_tables(pfeat.Six))

# multisyllabic SON
desc = desc.assign(sim_3_son_multi=create_tables(son.Three))
desc = desc.assign(sim_4_son_multi=create_tables(son.Four))
desc = desc.assign(sim_6_son_multi=create_tables(son.Six))

In [27]:
# percent transcripts by children and adults - all words
desc = desc.assign(pct_3_child=create_tables(pcts.c3))
desc = desc.assign(pct_3_adult=create_tables(pcts.a3))
desc = desc.assign(pct_4_child=create_tables(pcts.c4))
desc = desc.assign(pct_4_adult=create_tables(pcts.a4))
desc = desc.assign(pct_6_child=create_tables(pcts.c6))
desc = desc.assign(pct_6_adult=create_tables(pcts.a6))

In [28]:
# CVC PACT
desc = desc.assign(pact_3_cvc=create_tables(pact_cvc.Three))
desc = desc.assign(pact_4_cvc=create_tables(pact_cvc.Four))
desc = desc.assign(pact_6_cvc=create_tables(pact_cvc.Six))

In [29]:
# multisyllabic PACT
desc = desc.assign(pact_3_multi=create_tables(pact_multi.Three))
desc = desc.assign(pact_4_multi=create_tables(pact_multi.Four))
desc = desc.assign(pact_6_multi=create_tables(pact_multi.Six))

In [30]:
# all PACT
desc = desc.assign(pact_3_all=create_tables(pact_all.Three))
desc = desc.assign(pact_4_all=create_tables(pact_all.Four))
desc = desc.assign(pact_6_all=create_tables(pact_all.Six))

In [31]:
desc = desc.T

In [32]:
desc

Unnamed: 0,min,max,mean,std,median,mode,mode count
sim_3_nd_cvc,0.0,22.0,7.65285,3.734547,7.0,6.0,51.0
sim_4_nd_cvc,0.0,22.0,7.997455,3.864432,8.0,7.0,44.0
sim_6_nd_cvc,1.0,22.0,9.501035,4.413125,9.0,6.0,49.0
sim_3_nd_multi,0.0,7.0,0.5590062,1.068142,0.0,0.0,445.0
sim_4_nd_multi,0.0,8.0,0.553696,1.070189,0.0,0.0,494.0
sim_6_nd_multi,0.0,10.0,0.6531027,1.251127,0.0,0.0,663.0
sim_3_lev_multi,1.65,6.9,3.055901,0.961509,2.85,2.8,30.0
sim_4_lev_multi,1.6,6.7,3.045467,0.973701,2.85,1.95,43.0
sim_6_lev_multi,1.5,8.3,3.010631,0.974273,2.85,2.9,48.0
sim_3_pfeat_multi,0.761667,2.923158,1.56515,0.389755,1.538333,1.25,4.0


In [33]:
fmt = desc.round(2)
fmt;

In [34]:
# change if integer
def cii(x):
    if float(x).is_integer():
        return int(x)
    return x

In [35]:
for idx in fmt.index:
    fmt.loc[idx,'range'] = f'{cii(fmt.loc[idx,"min"])} to {cii(fmt.loc[idx,"max"])}'
    fmt.loc[idx,'median'] = f'{cii(fmt.loc[idx,"median"])}'
    fmt.loc[idx,'mean std'] = f'{cii(fmt.loc[idx,"mean"])} ({cii(fmt.loc[idx,"std"])})'
    fmt.loc[idx,'mode count'] = f'{cii(fmt.loc[idx,"mode"])} ({cii(fmt.loc[idx,"mode count"])})'

In [36]:
fmt = fmt[['range','mean std','median','mode count']]
fmt

Unnamed: 0,range,mean std,median,mode count
sim_3_nd_cvc,0 to 22,7.65 (3.73),7.0,6 (51)
sim_4_nd_cvc,0 to 22,8 (3.86),8.0,7 (44)
sim_6_nd_cvc,1 to 22,9.5 (4.41),9.0,6 (49)
sim_3_nd_multi,0 to 7,0.56 (1.07),0.0,0 (445)
sim_4_nd_multi,0 to 8,0.55 (1.07),0.0,0 (494)
sim_6_nd_multi,0 to 10,0.65 (1.25),0.0,0 (663)
sim_3_lev_multi,1.65 to 6.9,3.06 (0.96),2.85,2.8 (30)
sim_4_lev_multi,1.6 to 6.7,3.05 (0.97),2.85,1.95 (43)
sim_6_lev_multi,1.5 to 8.3,3.01 (0.97),2.85,2.9 (48)
sim_3_pfeat_multi,0.76 to 2.92,1.57 (0.39),1.54,1.25 (4)


In [39]:
fmt.to_csv(homedir+'data/table_data/descriptive_apndxa.csv')