In [1]:
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide default figure size (width, height; matplotlib measures these in inches).
plt.rcParams.update({'figure.figsize': [15, 10]})

In [2]:
from rpy2.robjects.packages import importr

# Handle to R's `utils` package, loaded through rpy2's importr.
utils = importr("utils")
# Select entry 1 of R's CRAN mirror list (presumably so that package installs do not
# prompt for a mirror — TODO confirm).
utils.chooseCRANmirror(ind=1)
# One-time setup: uncomment to install the lme4 R package, which the %%R cells load via library(lme4).
#utils.install_packages('lme4')

# Load the rpy2 IPython extension, which provides the %%R cell magic used by the model-fitting cells.
%load_ext rpy2.ipython

In [13]:
# Read the conversation table and its melted counterpart from disk.
conversations = pd.read_csv("results/intelligibility/conversations.csv")
conversations_melted = pd.read_csv("results/intelligibility/conversations_melted.csv")

# Recode boolean values as 0/1 in both frames.
bool_to_int = {False: 0, True: 1}
conversations = conversations.replace(bool_to_int)
conversations_melted = conversations_melted.replace(bool_to_int)

# Min-max scale age. The bounds are taken from `conversations` and applied to
# both frames so that they share a single age scale.
min_age, max_age = conversations["age"].min(), conversations["age"].max()
age_range = max_age - min_age
conversations["age"] = (conversations["age"] - min_age) / age_range
conversations_melted["age"] = (conversations_melted["age"] - min_age) / age_range

conversations.head()

Unnamed: 0,utterance_id,speaker_code,tokens,pos,age,corpus,transcript_file,child_name,speaker_code_next,start_time_next,...,follow_up_start_time,follow_up_end_time,follow_up_is_speech_related,follow_up_is_intelligible,follow_up_speech_act,response_latency,response_latency_follow_up,has_response,response_is_clarification_request,pos_feedback
0,28,CHI,"[""let's"", 'get', 'some', 'glue', 'too', '.']","['v', 'pro:obj', 'v', 'qn', 'n', 'adv']",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,96687.0,...,97154.0,100026.0,1.0,1,PF,0.0,467.0,1,0,1
1,50,CHI,"['i', 'get', 'some', 'glue', 'too', '.']","['pro:sub', 'v', 'qn', 'n', 'adv']",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,148262.0,...,150114.0,151434.0,1.0,0,AA,0.0,1852.0,1,0,1
2,52,CHI,"['broken', '.']",['part'],0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,151434.0,...,155422.0,161449.0,1.0,0,ST,0.0,3988.0,1,0,1
3,58,CHI,"['huh', 'right', 'there', 'see', 'his', 'legs'...","['co', 'adv', 'adv', 'v', 'det:poss', 'n', 'v'...",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,169064.0,...,172202.0,173521.0,1.0,1,AD,0.0,3138.0,1,0,1
4,78,CHI,"['hi', 'jenny', 'how', 'you', 'fine', 'hi', 'h...","['co', 'n:prop', 'pro:rel', 'pro:per', 'adv', ...",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,243831.0,...,249125.0,249641.0,1.0,1,AA,0.0,5294.0,1,0,1


## Quality of communicative feedback / Caregiver contingency


### Timing:

In [4]:
%%R -i conversations
library(lme4)

# Binomial mixed model: presence of a response (has_response) as a function of
# utterance intelligibility, age and their interaction, with a random intercept per child.
m_caregiver_contingency <- glmer(
  formula = 'has_response ~ utt_is_intelligible * age + (1 | child_name)',
  data = conversations,
  family = binomial
)
print(summary(m_caregiver_contingency))


R[write to console]: Loading required package: Matrix



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: has_response ~ utt_is_intelligible * age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 316140.1  316193.8 -158065.0  316130.1    340110 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-10.9342   0.1442   0.4944   0.5457   1.3183 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.223    1.106   
Number of obs: 340115, groups:  child_name, 45

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              1.73717    0.09017  19.266  < 2e-16 ***
utt_is_intelligible      0.65315    0.02679  24.381  < 2e-16 ***
age                      0.67847    0.04166  16.287  < 2e-16 ***
utt_is_intelligible:age  0.20033    0.04731   4.234  2.3e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation o

### Clarification requests

In [5]:
# Inverted indicator: 1 where the response is not a clarification request, 0 where it is.
flip_binary = {0: 1, 1: 0}
conversations["response_is_no_clarification_request"] = conversations["response_is_clarification_request"].replace(flip_binary)


In [6]:
%%R -i conversations
library(lme4)

# Binomial mixed model: whether the response is a clarification request, as a function of
# utterance intelligibility, age and their interaction, with a random intercept per child.
# TODO: filter out cases without responses?
m_caregiver_contingency <- glmer(
  formula = 'response_is_clarification_request ~ utt_is_intelligible * age + (1 | child_name)',
  data = conversations,
  family = binomial
)
print(summary(m_caregiver_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: response_is_clarification_request ~ utt_is_intelligible * age +  
    (1 | child_name)
   Data: conversations

     AIC      BIC   logLik deviance df.resid 
 20564.5  20618.2 -10277.2  20554.5   340110 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-0.252 -0.078 -0.065 -0.033 33.277 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 0.5583   0.7472  
Number of obs: 340115, groups:  child_name, 45

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -4.1062     0.1416 -29.000   <2e-16 ***
utt_is_intelligible      -0.9750     0.1182  -8.250   <2e-16 ***
age                      -0.3729     0.1901  -1.961   0.0498 *  
utt_is_intelligible:age   0.1002     0.2077   0.482   0.6296    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of F

### Combined (Positive Feedback = No pause, no clarification request)

In [7]:
%%R -i conversations
library(lme4)

# Binomial mixed model on the combined positive-feedback indicator (pos_feedback),
# with intelligibility, age and their interaction as fixed effects and a random intercept per child.
m_caregiver_contingency <- glmer(
  formula = 'pos_feedback ~ utt_is_intelligible * age + (1 | child_name)',
  data = conversations,
  family = binomial
)
print(summary(m_caregiver_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: pos_feedback ~ utt_is_intelligible * age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 321532.6  321586.3 -160761.3  321522.6    340110 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-8.2163  0.1658  0.4953  0.5518  1.3401 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 0.9521   0.9757  
Number of obs: 340115, groups:  child_name, 45

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              1.50958    0.08451  17.862  < 2e-16 ***
utt_is_intelligible      0.68197    0.02664  25.601  < 2e-16 ***
age                      0.69011    0.04362  15.821  < 2e-16 ***
utt_is_intelligible:age  0.18229    0.04766   3.825 0.000131 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Ef

## Effect of communicative feedback
### Positive Feedback: Timing

In [8]:
%%R -i conversations
library(lme4)

# Binomial mixed model: intelligibility of the follow-up utterance as a function of
# whether a response was given, age and their interaction, with a random intercept per child.
m_child_contingency <- glmer(
  formula = 'follow_up_is_intelligible ~ has_response * age + (1 | child_name)',
  data = conversations,
  family = binomial
)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response * age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 258919.6  258973.3 -129454.8  258909.6    340110 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.7015  0.2778  0.3203  0.4008  9.5605 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 2.701    1.643   
Number of obs: 340115, groups:  child_name, 45

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       0.05172    0.11421   0.453  0.65067    
has_response      0.41832    0.03138  13.332  < 2e-16 ***
age               1.20313    0.05326  22.589  < 2e-16 ***
has_response:age  0.17131    0.05795   2.956  0.00312 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) hs_

In [9]:
%%R -i conversations
library(lme4)

# Keep only rows where the initial utterance is intelligible.
conversations_child_intelligible <- subset(conversations, utt_is_intelligible == 1)

# Same follow-up intelligibility model as for the full data, fitted on that subset.
m_child_contingency <- glmer(
  formula = 'follow_up_is_intelligible ~ has_response * age + (1 | child_name)',
  data = conversations_child_intelligible,
  family = binomial
)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response * age + (1 | child_name)
   Data: conversations_child_intelligible

     AIC      BIC   logLik deviance df.resid 
194555.6 194608.5 -97272.8 194545.6   287147 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.6721  0.2771  0.3036  0.3514  1.9946 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 0.9542   0.9768  
Number of obs: 287152, groups:  child_name, 44

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       0.92997    0.09535   9.753   <2e-16 ***
has_response      0.32684    0.03523   9.276   <2e-16 ***
age               0.68331    0.05703  11.981   <2e-16 ***
has_response:age  0.15137    0.06248   2.423   0.0154 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (

### Negative Feedback: Clarification requests

In [16]:
%%R -i conversations_melted
library(lme4)

# Narrow the melted data in two steps: rows with a response, then rows whose
# response is a clarification request.
conversations_with_response <- subset(conversations_melted, has_response == 1)
conversations_cr <- subset(conversations_with_response, response_is_clarification_request == 1)

# Binomial mixed model: intelligibility as a function of is_follow_up, age and their
# interaction, with random intercepts per child and per conversation.
# Fitted with the bobyqa optimizer and a raised function-evaluation limit.
# TODO: conversation_id or is_follow_up in random effects?
m_child_contingency <- glmer(
  formula = 'is_intelligible ~ is_follow_up * age + (1 | child_name) + (1 | conversation_id)',
  data = conversations_cr,
  family = binomial,
  control = glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e5))
)
print(summary(m_child_contingency))

R[write to console]: boundary (singular) fit: see ?isSingular



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ is_follow_up * age + (1 | child_name) + (1 |  
    is_follow_up)
   Data: conversations_cr
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))

     AIC      BIC   logLik deviance df.resid 
  3483.0   3519.3  -1735.5   3471.0     3142 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.5998 -0.6840  0.4588  0.6031  2.8500 

Random effects:
 Groups       Name        Variance Std.Dev.
 child_name   (Intercept) 0.897    0.9471  
 is_follow_up (Intercept) 0.000    0.0000  
Number of obs: 3148, groups:  child_name, 42; is_follow_up, 2

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -0.6198     0.2542  -2.438   0.0148 *  
is_follow_up       0.4913     0.2475   1.985   0.0472 *  
age                2.1303     0.3501   6.085 1.17e-09 ***
is_follow_up:age  -0.4493    

In [11]:
%%R -i conversations_melted
library(lme4)

# Restrict the melted data to rows that received a response.
conversations_with_response <- subset(conversations_melted, has_response == 1)

# Binomial mixed model: intelligibility as a function of whether the response was a
# clarification request, is_follow_up and their interaction, with random intercepts
# for age, child_name and is_follow_up.
# NOTE(review): is_follow_up appears both as a fixed effect and as a random-intercept
# grouping factor; this is a likely source of a singular-fit message — confirm whether
# the (1 | is_follow_up) term is intended.
m_child_contingency <- glmer(
  formula = 'is_intelligible ~ response_is_clarification_request * is_follow_up + (1 | age) + (1 | child_name) + (1 | is_follow_up)',
  data = conversations_with_response,
  family = binomial,
  control = glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e4))
)
print(summary(m_child_contingency))

R[write to console]: boundary (singular) fit: see ?isSingular



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ response_is_clarification_request * is_follow_up +  
    (1 | age) + (1 | child_name) + (1 | is_follow_up)
   Data: conversations_with_response
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 10000))

      AIC       BIC    logLik  deviance  df.resid 
 388958.6  389037.1 -194472.3  388944.6    542779 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.4251  0.2767  0.3041  0.3481  8.7127 

Random effects:
 Groups       Name        Variance Std.Dev.
 child_name   (Intercept) 2.2565   1.5022  
 age          (Intercept) 0.3076   0.5546  
 is_follow_up (Intercept) 0.0000   0.0000  
Number of obs: 542786, groups:  child_name, 45; age, 7; is_follow_up, 2

Fixed effects:
                                                Estimate Std. Error z value
(Intercept)                                     1.147648   0.066659 