# Appendix D: univariate statistics

Contains the univariate analysis for the CVC and multisyllabic words in Appendix D.

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
current_dir = %pwd
if not current_dir == '/home/melissa/Dropbox/experiments/python/':
    %cd '/home/melissa/Dropbox/experiments/python/'

/home/melissa/Dropbox/experiments/python


In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm

In [16]:
# Input (combined data) and output (table data) directories, both resolved
# against the current working directory.
data = Path.cwd().joinpath('current_projects/jcl_multisyllabic_neighborhoods_2021/data/combined/')
outdir = Path.cwd().joinpath('current_projects/jcl_multisyllabic_neighborhoods_2021/data/table_data/')

In [6]:
# Load the three pickled datasets from the combined-data directory.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
data3, data4, data6 = (
    pd.read_pickle(data / filename)
    for filename in ('data3.pickle', 'data4.pickle', 'data6.pickle')
)

In [7]:
# Target dtypes: the three percentage/score columns become floats, the count
# columns become 32-bit integers.
convert = {
    **dict.fromkeys(('PACT', 'Pct Child', 'Pct Adult'), 'float64'),
    **dict.fromkeys(('len_syllables', 'ND', 'SOND'), 'int32'),
}
# Apply the same conversion to every dataset.
data3, data4, data6 = (frame.astype(convert) for frame in (data3, data4, data6))

In [8]:
# 'iscvc' column of each dataset.
c3, c4, c6 = (frame['iscvc'] for frame in (data3, data4, data6))

In [9]:
# 'ismulti' column of each dataset.
m3, m4, m6 = (frame['ismulti'] for frame in (data3, data4, data6))

In [10]:
# Column names of data3, followed by the sums of its 'iscvc' and 'ismulti' flags.
(data3.columns, data3['iscvc'].sum(), data3['ismulti'].sum())

(Index(['PACT', 'Pct Child', 'Pct Adult', 'syllables', 'len_syllables',
        'len_phonemes', 'stress_syl_pos', 'orthographic', 'ND', 'SOND', 'PLD20',
        'PFEAT20', 'iscvc', 'ismulti'],
       dtype='object'),
 386,
 644)

## Bootstrapping and Serializing

In [11]:
# Number of bootstrap resamples drawn for every regression.
bootstrap_num = 1_000
alpha = 0.95  # confidence level of the percentile interval (not the significance level)

Note that the following function works for both simple and multiple regression. Currently, I am running a series of simple regressions for the analyses.

In [12]:
def ols_and_boot(child, independent, dependent, random_state=None):
    """Fit an OLS regression and bootstrap a percentile confidence interval for its R².

    The model regresses ``child[dependent]`` on ``child[independent]`` plus an
    intercept. The rows of ``child`` are then resampled with replacement
    ``bootstrap_num`` times, the model is refitted on every resample, and the
    confidence interval is read off the resulting R² distribution. The number
    of resamples (``bootstrap_num``) and the confidence level (``alpha``) are
    taken from the module-level variables of the same names.

    Parameters
    ----------
    child : pandas.DataFrame
        Data containing the predictor and outcome columns.
    independent : str or list of str
        Predictor column(s); a list gives a multiple regression.
    dependent : str
        Outcome column.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed or generator for the resampling. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state, so the
        interval bounds change from run to run. Pass an int for reproducible
        bounds.

    Returns
    -------
    tuple
        ``(observations, df numerator, df denominator, F value, p-value of F,
        R², lower R² bound, upper R² bound)``.
    """

    def find_CI(dist):
        """Return the [lower, upper] percentile bounds of ``dist``, clipped to [0, 1]."""
        # np.percentile does not need sorted input, so no explicit sort is done.
        p_low = ((1.0 - alpha) / 2.0) * 100
        lower = max(0.0, np.percentile(dist, p_low))
        p_up = (alpha + ((1.0 - alpha) / 2.0)) * 100
        upper = min(1.0, np.percentile(dist, p_up))
        return [lower, upper]

    # One stateful generator must be shared by all resamples: passing the same
    # seed to every .sample() call would reproduce the same resample each time.
    if random_state is None or isinstance(
            random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Model fitted on the full data; it supplies every statistic except the CI.
    original = sm.OLS(child[dependent], sm.add_constant(child[independent])).fit()

    # Bootstrap distribution of R²: same-size resamples drawn with replacement.
    rsquareds = list()
    for _ in range(bootstrap_num):
        resample = child.sample(n=len(child), replace=True, random_state=rng)
        resample_fit = sm.OLS(resample[dependent],
                              sm.add_constant(resample[independent])).fit()
        rsquareds.append(resample_fit.rsquared)
    rci = find_CI(rsquareds)

    # observations, df numerator, df denominator, fvalue, pvalue,
    # r2, lower r2 CI, upper r2 CI
    return original.nobs, original.df_model, original.df_resid, original.fvalue, \
            original.f_pvalue, original.rsquared, rci[0], rci[1]

In [13]:
# Empty results frame: one row per statistic, in the order ols_and_boot returns them.
df = pd.DataFrame(
    index=[
        'observations',
        'df numerator',
        'df denominator',
        'fvalue',
        'pvalue',
        'r2',
        'lower r2 CI',
        'upper r2 CI',
    ]
)

Available columns: ['PACT', 'Pct Child', 'Pct Adult', 'syllables', 'len_syllables',
        'len_phonemes', 'stress_syl_pos', 'orthographic', 'ND', 'SOND', 'PLD20',
        'PFEAT20', 'iscvc', 'ismulti']

In [14]:
# Every regression predicts PACT from a single predictor column; each call adds
# one result column named <word type>_<age>_<predictor>.
samples_by_age = ((3, data3), (4, data4), (6, data6))

# CVC: ND is the only predictor
for age, words in samples_by_age:
    df = df.assign(**{f'cvc_{age}_ND': ols_and_boot(words[words.iscvc], 'ND', 'PACT')})

# multisyllabic: ages 3, 4 and 6 in turn, one regression per predictor
for age, words in samples_by_age:
    multi_words = words[words.ismulti]
    for predictor in ('PFEAT20', 'PLD20', 'SOND'):
        df = df.assign(**{f'multi_{age}_{predictor}':
                          ols_and_boot(multi_words, predictor, 'PACT')})



In [18]:
# Flip the table so each regression is a row and each statistic a column.
# Re-running this cell on its own flips the table back.
df = df.transpose()

In [19]:
# Write the results table to the table-data directory.
df.to_csv(outdir.joinpath('inferential.csv'))

In [20]:
# Display the results table (one row per regression after the transpose above).
df

Unnamed: 0,observations,df numerator,df denominator,fvalue,pvalue,r2,lower r2 CI,upper r2 CI
cvc_3_ND,386.0,1.0,384.0,8.290264,0.004209,0.021133,0.001081,0.066036
cvc_4_ND,393.0,1.0,391.0,5.925372,0.015371,0.014928,0.000474,0.049859
cvc_6_ND,483.0,1.0,481.0,6.10419,0.013832,0.012532,0.000448,0.038484
multi_3_PFEAT20,644.0,1.0,642.0,5.134484,0.023787,0.007934,0.000214,0.025578
multi_3_PLD20,644.0,1.0,642.0,2.986293,0.084453,0.00463,2.3e-05,0.020803
multi_3_SOND,644.0,1.0,642.0,0.257646,0.611916,0.000401,1e-06,0.009699
multi_4_PFEAT20,717.0,1.0,715.0,0.557241,0.455619,0.000779,2e-06,0.009406
multi_4_PLD20,717.0,1.0,715.0,1.419805,0.233831,0.001982,6e-06,0.014715
multi_4_SOND,717.0,1.0,715.0,0.218374,0.640424,0.000305,2e-06,0.007631
multi_6_PFEAT20,983.0,1.0,981.0,5.558187,0.01859,0.005634,0.000231,0.019467


Please note that the upper and lower bounds of the confidence intervals will vary due to random row selection in the bootstrapping process.