# Appendix D: univariate statistics

Contains the univariate analysis for the CVC and multisyllabic words in Appendix D.

In [1]:
# Reload edited project modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Later cells build their paths from Path.cwd(), so make sure the kernel is
# running from the experiments root. The absolute path is machine-specific.
current_dir = %pwd
# NOTE(review): the literal below ends in '/', while %pwd appears to report the
# directory without a trailing slash (compare the %cd output), so this test is
# probably always true and the %cd always runs -- harmless but redundant.
# TODO confirm.
if not current_dir == '/home/melissa/Dropbox/experiments/python/':
    %cd '/home/melissa/Dropbox/experiments/python/'

/home/melissa/Dropbox/experiments/python


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Input (combined data) and output (result tables) directories, both resolved
# against the current working directory.
data = Path.cwd().joinpath(
    'current_projects/jcl_multisyllabic_neighborhoods_2021/data/combined/'
)
outdir = Path.cwd().joinpath(
    'current_projects/jcl_multisyllabic_neighborhoods_2021/data/data_tables/'
)

In [None]:
# Load the three pickled data frames from the combined-data directory.
# Unpickling can execute arbitrary code, so only load files you trust.
data3, data4, data6 = map(
    pd.read_pickle,
    (data/'data3.pickle', data/'data4.pickle', data/'data6.pickle'),
)

In [None]:
# Target dtypes: floating point for the three score columns, 32-bit integers
# for the three count columns.
convert = dict.fromkeys(('PACT', 'Pct Child', 'Pct Adult'), 'float64')
convert.update(dict.fromkeys(('len_syllables', 'ND', 'SOND'), 'int32'))

# Apply the same conversion to every data frame.
data3, data4, data6 = (
    frame.astype(convert) for frame in (data3, data4, data6)
)

In [None]:
# 'iscvc' column of each data frame, kept as stand-alone Series.
c3, c4, c6 = (frame['iscvc'] for frame in (data3, data4, data6))

In [None]:
# 'ismulti' column of each data frame, kept as stand-alone Series.
m3, m4, m6 = (frame['ismulti'] for frame in (data3, data4, data6))

In [None]:
# Quick look at data3: its column labels and the sums of the 'iscvc' and
# 'ismulti' flag columns.
(data3.columns, data3['iscvc'].sum(), data3['ismulti'].sum())

## Bootstrapping and Serializing

In [None]:
# Number of bootstrap resamples drawn per regression by ols_and_boot.
bootstrap_num = 1000
# Confidence level of the bootstrap interval (0.95 -> 2.5th and 97.5th
# percentiles). Despite the name, ols_and_boot uses this as the coverage of
# the interval, not as a significance level.
alpha = 0.95

Note that the following function works for both simple and multiple regression. Currently, I am running a series of simple regressions for the analyses.

In [None]:
def ols_and_boot(child, independent, dependent, random_state=None):
    """Fit an OLS regression and bootstrap a percentile CI for its R-squared.

    The model is fitted once on ``child`` and then refitted on
    ``bootstrap_num`` resamples (same size as ``child``, drawn with
    replacement). The R-squared values of the refits give a percentile
    confidence interval whose coverage is the module-level ``alpha``.

    Parameters
    ----------
    child : pandas.DataFrame
        Rows to analyse; must contain the ``independent`` and ``dependent``
        columns.
    independent : label or list of labels
        Predictor column(s); an intercept is added with ``sm.add_constant``.
    dependent : label
        Response column.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) for the resampling, making the interval
        reproducible. ``None`` (the default) keeps the previous behaviour of
        unseeded ``DataFrame.sample`` calls.

    Returns
    -------
    tuple
        ``(observations, df numerator, df denominator, fvalue, pvalue, r2,
        lower r2 CI, upper r2 CI)``. The first six values come from the fit
        on ``child``; the last two are the bootstrap interval bounds.
    """

    def find_CI(dist):
        # Percentile interval; np.percentile does not need pre-sorted input.
        p_low = ((1.0 - alpha) / 2.0) * 100
        p_up = (alpha + ((1.0 - alpha) / 2.0)) * 100
        lower, upper = np.percentile(dist, [p_low, p_up])
        # Keep the reported bounds inside [0, 1].
        return [max(0.0, lower), min(1.0, upper)]

    # Build ONE generator up front: passing the same integer seed to every
    # sample() call would make all resamples identical.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    original = sm.OLS(child[dependent], sm.add_constant(child[independent])).fit()

    rsquareds = []
    for _ in range(bootstrap_num):
        resample = child.sample(n=len(child), replace=True, random_state=rng)
        boot_fit = sm.OLS(resample[dependent],
                          sm.add_constant(resample[independent])).fit()
        rsquareds.append(boot_fit.rsquared)
    rci = find_CI(rsquareds)

    # observations, df numerator, df denominator, fvalue, pvalue,
    # r2, lower r2 CI, upper r2 CI
    return (original.nobs, original.df_model, original.df_resid,
            original.fvalue, original.f_pvalue, original.rsquared,
            rci[0], rci[1])

In [None]:
# Empty results table: one row per statistic returned by ols_and_boot,
# columns are added later (one per fitted model).
df = pd.DataFrame(
    index=pd.Index([
        'observations',
        'df numerator',
        'df denominator',
        'fvalue',
        'pvalue',
        'r2',
        'lower r2 CI',
        'upper r2 CI',
    ])
)

Column names, for reference:

['PACT', 'Pct Child', 'Pct Adult', 'syllables', 'len_syllables',
 'len_phonemes', 'stress_syl_pos', 'orthographic', 'ND', 'SOND', 'PLD20',
 'PFEAT20', 'iscvc', 'ismulti']

In [None]:
# CVC words: regress PACT on ND, one model per age group (3, 4, 6).
for _age, _frame in ((3, data3), (4, data4), (6, data6)):
    _column = 'cvc_{}_ND'.format(_age)
    df = df.assign(**{_column: ols_and_boot(_frame[_frame.iscvc], 'ND', 'PACT')})

# Multisyllabic words: for each age group, regress PACT on each of the three
# predictors in turn (age-major order, so the columns come out grouped by age).
for _age, _frame in ((3, data3), (4, data4), (6, data6)):
    for _predictor in ('PFEAT20', 'PLD20', 'SOND'):
        _column = 'multi_{}_{}'.format(_age, _predictor)
        df = df.assign(
            **{_column: ols_and_boot(_frame[_frame.ismulti], _predictor, 'PACT')}
        )



In [None]:
# Flip the table so each model is a row and each statistic a column, then
# write it to the output directory.
# NOTE: this cell rebinds df, so running it a second time transposes the table
# back before saving.
df = df.transpose()
df.to_csv(outdir.joinpath('inferential.csv'))

In [None]:
# Display the results table (rich DataFrame repr as the cell output).
df