In [1]:
import IPython
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rcParams.update({'figure.figsize': [15, 10]})



In [2]:
from rpy2.robjects.packages import importr

# Handle to R's "utils" package, imported into Python through rpy2.
utils = importr("utils")
# Select entry 1 of R's CRAN mirror list (presumably only needed for the
# optional package installation below).
utils.chooseCRANmirror(ind=1)
# One-time setup: uncomment to install the R package lme4, which the %%R cells
# in this notebook load via library(lme4).
#utils.install_packages('lme4')

# Register the rpy2 IPython extension so that the %%R cell magic is available.
%load_ext rpy2.ipython

In [3]:
# Load data
conversations = pd.read_csv("results/intelligibility/conversations.csv")
conversations_melted = pd.read_csv("results/intelligibility/conversations_melted.csv")

# Convert True/False to 0/1 so the models below receive numeric indicators.
# Reassign instead of using inplace=True: the result is the same, but the cell
# no longer mutates frames behind the reader's back.
bool_to_int = {False: 0, True: 1}
conversations = conversations.replace(bool_to_int)
conversations_melted = conversations_melted.replace(bool_to_int)

# Min-max normalize age to [0, 1]. The bounds are taken from `conversations`
# and applied to both frames so that they share one age scale.
min_age, max_age = conversations["age"].min(), conversations["age"].max()
age_range = max_age - min_age
# Written as `not (... > 0)` so that a NaN range (e.g. no valid ages) is caught
# too; otherwise every age would silently become NaN/inf.
if not age_range > 0:
    raise ValueError(
        f"Cannot normalize age: degenerate age range (min={min_age}, max={max_age})"
    )
conversations["age"] = (conversations["age"] - min_age) / age_range
conversations_melted["age"] = (conversations_melted["age"] - min_age) / age_range

conversations.head()

Unnamed: 0,utterance_id,speaker_code,tokens,pos,age,corpus,transcript_file,child_name,speaker_code_next,start_time_next,...,follow_up_start_time,follow_up_end_time,follow_up_is_speech_related,follow_up_is_intelligible,follow_up_speech_act,response_latency,response_latency_follow_up,has_response,response_is_clarification_request,pos_feedback
0,300,CHI,"['sit', '.']",['v'],0.166667,Bernstein,/home/mitja/data/CHILDES/Bernstein/Children/Al...,Bernstein_Alice,MOT,29650.0,...,32175.0,39984.0,1.0,1,ST,0.0,2525.0,1,0,1
1,333,CHI,"['a', 'mommy', '.']","['det:art', 'n']",0.166667,Bernstein,/home/mitja/data/CHILDES/Bernstein/Children/Al...,Bernstein_Alice,MOT,138685.0,...,140585.0,142428.0,1.0,0,YY,0.0,1900.0,1,0,1
2,336,CHI,['.'],['none'],0.166667,Bernstein,/home/mitja/data/CHILDES/Bernstein/Children/Al...,Bernstein_Alice,MOT,142428.0,...,152302.0,153962.0,1.0,0,YY,0.0,9874.0,1,0,1
3,343,CHI,['.'],['none'],0.166667,Bernstein,/home/mitja/data/CHILDES/Bernstein/Children/Al...,Bernstein_Alice,MOT,153962.0,...,173964.0,176230.0,1.0,0,YY,0.0,20002.0,1,0,1
4,350,CHI,['.'],['none'],0.166667,Bernstein,/home/mitja/data/CHILDES/Bernstein/Children/Al...,Bernstein_Alice,MOT,176230.0,...,200269.0,208049.0,1.0,0,YY,0.0,24039.0,1,0,1


## Quality of communicative feedback / Caregiver contingency


### Timing:

In [4]:
%%R -i conversations
# Binomial (logit) mixed-effects model: has_response as a function of
# utt_is_intelligible and age, with a random intercept per child_name.
library(lme4)

m_caregiver_contingency <- glmer(
    'has_response ~ utt_is_intelligible + age + (1 | child_name)',
    data = conversations,
    family = binomial
)
print(summary(m_caregiver_contingency))


R[write to console]: Loading required package: Matrix



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: has_response ~ utt_is_intelligible + age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 406540.4  406584.3 -203266.2  406532.4    431746 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-13.2691   0.1579   0.4849   0.5605   1.3335 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 2.114    1.454   
Number of obs: 431750, groups:  child_name, 332

Fixed effects:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          2.70550    0.07404   36.54   <2e-16 ***
utt_is_intelligible  0.77371    0.01045   74.04   <2e-16 ***
age                  0.84248    0.02413   34.92   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) utt_s_
utt_s_ntllg -0.010       
age        

### Clarification requests

In [5]:
%%R -i conversations
library(lme4)

# Restrict the data to rows where has_response equals 1.
conversations_with_response <- subset(conversations, has_response == 1)

# Binomial (logit) mixed-effects model on that subset:
# response_is_clarification_request as a function of utt_is_intelligible and
# age, with a random intercept per child_name.
m_caregiver_contingency <- glmer(
    'response_is_clarification_request ~ utt_is_intelligible + age + (1 | child_name)',
    data = conversations_with_response,
    family = binomial
)
print(summary(m_caregiver_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: response_is_clarification_request ~ utt_is_intelligible + age +  
    (1 | child_name)
   Data: conversations_with_response

     AIC      BIC   logLik deviance df.resid 
 23844.0  23886.9 -11918.0  23836.0   341347 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-0.279 -0.086 -0.070 -0.033 34.566 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 0.7597   0.8716  
Number of obs: 341351, groups:  child_name, 332

Fixed effects:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -4.09721    0.11031 -37.143  < 2e-16 ***
utt_is_intelligible -1.10295    0.05052 -21.833  < 2e-16 ***
age                 -0.40060    0.11989  -3.342 0.000833 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) utt_s_
utt_s_ntllg -0.111       


### Combined (Positive Feedback = No pause, no clarification request)

In [6]:
%%R -i conversations
# Binomial (logit) mixed-effects model: pos_feedback as a function of
# utt_is_intelligible and age, with a random intercept per child_name.
library(lme4)

m_caregiver_contingency <- glmer(
    'pos_feedback ~ utt_is_intelligible + age + (1 | child_name)',
    data = conversations,
    family = binomial
)
print(summary(m_caregiver_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: pos_feedback ~ utt_is_intelligible + age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 413275.2  413319.1 -206633.6  413267.2    431746 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-10.5871   0.1760   0.5016   0.5665   1.3571 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.474    1.214   
Number of obs: 431750, groups:  child_name, 332

Fixed effects:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)          2.29067    0.07056   32.46   <2e-16 ***
utt_is_intelligible  0.79255    0.01029   77.04   <2e-16 ***
age                  0.83980    0.02397   35.04   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) utt_s_
utt_s_ntllg -0.041       
age        

## Effect of communicative feedback
### Positive Feedback: Timing

In [None]:
%%R -i conversations
# Binomial (logit) mixed-effects model fitted on all rows of `conversations`:
# follow_up_is_intelligible as a function of has_response and age, with a
# random intercept per child_name.
# NOTE(review): this cell has no recorded execution count or output in the
# saved notebook -- confirm whether it is still needed.
library(lme4)

m_child_contingency <- glmer(
    'follow_up_is_intelligible ~ has_response + age + (1 | child_name)',
    data = conversations,
    family = binomial
)
print(summary(m_child_contingency))

In [7]:
%%R -i conversations
library(lme4)

# Restrict the data to rows where utt_is_intelligible equals 1.
conversations_child_intelligible <- subset(conversations, utt_is_intelligible == 1)

# Binomial (logit) mixed-effects model on that subset:
# follow_up_is_intelligible as a function of has_response and age, with a
# random intercept per child_name.
m_child_contingency <- glmer(
    'follow_up_is_intelligible ~ has_response + age + (1 | child_name)',
    data = conversations_child_intelligible,
    family = binomial
)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response + age + (1 | child_name)
   Data: conversations_child_intelligible

      AIC       BIC    logLik  deviance  df.resid 
 249290.1  249333.2 -124641.1  249282.1    355003 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-6.1738  0.2825  0.3077  0.3625  2.1996 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.805    1.343   
Number of obs: 355007, groups:  child_name, 255

Fixed effects:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.44039    0.07305   6.029 1.65e-09 ***
has_response  0.36107    0.01296  27.869  < 2e-16 ***
age           0.82617    0.03099  26.660  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) hs_rsp
has_respons -0.080       
age         -0.056 -0.04

In [7]:
%%R -i conversations
library(lme4)

# Restrict the data to rows where utt_is_intelligible equals 1.
conversations_child_intelligible <- subset(conversations, utt_is_intelligible == 1)

# Same outcome and random intercept as the additive model, but with
# `has_response * age`, i.e. both main effects plus their interaction.
m_child_contingency <- glmer(
    'follow_up_is_intelligible ~ has_response * age + (1 | child_name)',
    data = conversations_child_intelligible,
    family = binomial
)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response * age + (1 | child_name)
   Data: conversations_child_intelligible

      AIC       BIC    logLik  deviance  df.resid 
 249287.8  249341.7 -124638.9  249277.8    355002 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-6.0843  0.2818  0.3078  0.3624  2.1445 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.785    1.336   
Number of obs: 355007, groups:  child_name, 255

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       0.50660    0.06998   7.239 4.51e-13 ***
has_response      0.28223    0.03027   9.325  < 2e-16 ***
age               0.70039    0.05017  13.960  < 2e-16 ***
has_response:age  0.15578    0.05441   2.863   0.0042 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
  

### Negative Feedback: Clarification requests

In [9]:
%%R -i conversations_melted
library(lme4)

# Two-step filter: first rows with has_response == 1, then among those the
# rows with response_is_clarification_request == 1.
conversations_with_response <- subset(conversations_melted, has_response == 1)
conversations_cr <- subset(conversations_with_response, response_is_clarification_request == 1)

# TODO: conversation_id or is_follow_up in random effects?
# Binomial (logit) mixed-effects model: is_intelligible as a function of
# is_follow_up and age, with random intercepts per child_name and per
# conversation_id. Fitted with the bobyqa optimizer and maxfun = 1e5.
m_child_contingency <- glmer(
    'is_intelligible ~ is_follow_up + age + (1 | child_name) + (1 | conversation_id)',
    data = conversations_cr,
    family = binomial,
    control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e5))
)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ is_follow_up + age + (1 | child_name) + (1 |  
    conversation_id)
   Data: conversations_cr
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))

     AIC      BIC   logLik deviance df.resid 
  4559.0   4590.6  -2274.5   4549.0     4165 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.4902 -0.5526  0.3731  0.5227  2.1619 

Random effects:
 Groups          Name        Variance Std.Dev.
 conversation_id (Intercept) 1.464    1.210   
 child_name      (Intercept) 1.450    1.204   
Number of obs: 4170, groups:  conversation_id, 2085; child_name, 87

Fixed effects:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.39240    0.23303  -5.975 2.30e-09 ***
is_follow_up  0.40852    0.08339   4.899 9.63e-07 ***
age           2.88964    0.31870   9.067  < 2e-16 ***
---
Signif. codes:  0 ‘***’

In [10]:
%%R -i conversations_melted
library(lme4)

# Two-step filter: first rows with has_response == 1, then among those the
# rows with response_is_clarification_request == 1.
conversations_with_response <- subset(conversations_melted, has_response == 1)
conversations_cr <- subset(conversations_with_response, response_is_clarification_request == 1)

# TODO: conversation_id or is_follow_up in random effects?
# Binomial (logit) mixed-effects model: is_intelligible as a function of
# is_follow_up and age, with a random intercept per child_name only.
# Fitted with the bobyqa optimizer and maxfun = 1e5.
m_child_contingency <- glmer(
    'is_intelligible ~ is_follow_up + age + (1 | child_name)',
    data = conversations_cr,
    family = binomial,
    control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e5))
)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ is_follow_up + age + (1 | child_name)
   Data: conversations_cr
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))

     AIC      BIC   logLik deviance df.resid 
  4650.1   4675.4  -2321.0   4642.1     4166 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.6924 -0.6857  0.4215  0.6039  2.7332 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.119    1.058   
Number of obs: 4170, groups:  child_name, 87

Fixed effects:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.09224    0.18289  -5.972 2.34e-09 ***
is_follow_up  0.31907    0.07272   4.388 1.15e-05 ***
age           2.15757    0.22123   9.752  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) is_fl_
is_follow_p -

In [11]:
%%R -i conversations_melted
library(lme4)

# Restrict the data to rows where has_response equals 1.
conversations_with_response <- subset(conversations_melted, has_response == 1)

# Binomial (logit) mixed-effects model: is_intelligible as a function of
# response_is_clarification_request, is_follow_up and their interaction, with
# random intercepts per age, per child_name and per conversation_id.
# NOTE(review): age is a random-intercept grouping factor here, whereas the
# other models in this notebook use it as a fixed effect, and maxfun is 1e4
# rather than 1e5 -- confirm that both differences are intentional.
m_child_contingency <- glmer(
    'is_intelligible ~ response_is_clarification_request * is_follow_up + (1 | age) + (1 | child_name) + (1 | conversation_id)',
    data = conversations_with_response,
    family = binomial,
    control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e4))
)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ response_is_clarification_request * is_follow_up +  
    (1 | age) + (1 | child_name) + (1 | conversation_id)
   Data: conversations_with_response
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 10000))

      AIC       BIC    logLik  deviance  df.resid 
 507258.9  507339.0 -253622.5  507244.9    682695 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.9155  0.2631  0.2906  0.3503  4.9846 

Random effects:
 Groups          Name        Variance Std.Dev.
 conversation_id (Intercept) 0.5358   0.7320  
 child_name      (Intercept) 4.0746   2.0186  
 age             (Intercept) 0.4693   0.6851  
Number of obs: 682702, groups:  
conversation_id, 341351; child_name, 332; age, 7

Fixed effects:
                                                Estimate Std. Error z value
(Intercept)                                