In [19]:
import IPython
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]



In [2]:
from rpy2.robjects.packages import importr

utils = importr("utils")
utils.chooseCRANmirror(ind=1)
#utils.install_packages('lme4')

%load_ext rpy2.ipython

In [3]:
# Load data
conversations = pd.read_csv("results/intelligibility/conversations.csv")
conversations_melted = pd.read_csv("results/intelligibility/conversations_melted.csv")

# convert True/False to 0/1:
conversations.replace({False: 0, True: 1}, inplace=True)
conversations_melted.replace({False: 0, True: 1}, inplace=True)

# normalize age
min_age, max_age = conversations["age"].min(), conversations["age"].max()
conversations["age"] = (conversations["age"] - min_age) / (max_age - min_age) * (1 - 0)
conversations_melted["age"] = (conversations_melted["age"] - min_age) / (max_age - min_age) * (1 - 0)

conversations.head()

Unnamed: 0,utterance_id,speaker_code,tokens,pos,age,corpus,transcript_file,child_name,speaker_code_next,start_time_next,...,follow_up_start_time,follow_up_end_time,follow_up_is_speech_related,follow_up_is_intelligible,follow_up_speech_act,response_latency,response_latency_follow_up,has_response,response_is_clarification_request,pos_feedback
0,1941,CHI,['.'],['none'],0.333333,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/010908.cha,Bloom_Peter,MOT,3146391.0,...,3160166.0,3166476.0,1.0,0,ST,0.0,13775.0,1,0,1
1,2004,CHI,['.'],['none'],0.333333,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/010908.cha,Bloom_Peter,MOT,3534418.0,...,3550229.0,3552151.0,1.0,1,SA,0.0,15811.0,1,0,1
2,2009,CHI,"['frisbee', '.']",['n'],0.333333,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/010908.cha,Bloom_Peter,MOT,3552151.0,...,3585822.0,3604379.0,,0,YY,0.0,33671.0,1,0,1
3,2025,CHI,"['train', 'train', 'penny', '.']","['n', 'v', 'n']",0.333333,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/010908.cha,Bloom_Peter,MOT,3625437.0,...,3630262.0,3632581.0,1.0,1,DC,0.0,4825.0,1,0,1
4,2043,CHI,"['car_car', '.']",['chi'],0.333333,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/010908.cha,Bloom_Peter,MOT,3710723.0,...,3712392.0,3714645.0,1.0,1,AN,0.0,1669.0,1,0,1


## Quality of communicative feedback/ Caregiver contingency


### Timing:

In [4]:
%%R -i conversations
library(lme4)

m_caregiver_contingency<-glmer('has_response ~ utt_is_intelligible * age + (1 | child_name)', data=conversations, family=binomial)
print(summary(m_caregiver_contingency))


R[write to console]: Loading required package: Matrix



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: has_response ~ utt_is_intelligible * age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 405787.6  405842.4 -202888.8  405777.6    430638 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-12.9356   0.1578   0.4862   0.5605   1.3347 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.95     1.397   
Number of obs: 430643, groups:  child_name, 189

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              2.60413    0.07489  34.774  < 2e-16 ***
utt_is_intelligible      0.65228    0.02211  29.507  < 2e-16 ***
age                      0.64960    0.03773  17.217  < 2e-16 ***
utt_is_intelligible:age  0.25084    0.04030   6.225 4.82e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation 

### Clarification requests

In [5]:
conversations["response_is_no_clarification_request"] = conversations.response_is_clarification_request.replace({0:1, 1:0})


In [18]:
%%R -i conversations
library(lme4)

conversations_with_response = subset(conversations, has_response==1)

m_caregiver_contingency<-glmer('response_is_clarification_request ~ utt_is_intelligible * age + (1 | child_name)', data=conversations_with_response, family=binomial)
print(summary(m_caregiver_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: response_is_clarification_request ~ utt_is_intelligible * age +  
    (1 | child_name)
   Data: conversations_with_response

     AIC      BIC   logLik deviance df.resid 
 23723.2  23776.9 -11856.6  23713.2   340371 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-0.279 -0.085 -0.070 -0.033 34.195 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 0.7456   0.8635  
Number of obs: 340376, groups:  child_name, 189

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -4.05184    0.11802 -34.332  < 2e-16 ***
utt_is_intelligible     -1.14697    0.09800 -11.703  < 2e-16 ***
age                     -0.44901    0.15167  -2.960  0.00307 ** 
utt_is_intelligible:age  0.09353    0.17610   0.531  0.59532    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

C

### Combined (Positive Feedback = No pause, no clarification request)

In [7]:
%%R -i conversations
library(lme4)

m_caregiver_contingency<-glmer('pos_feedback ~ utt_is_intelligible * age + (1 | child_name)', data=conversations, family=binomial)
print(summary(m_caregiver_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: pos_feedback ~ utt_is_intelligible * age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 412469.8  412524.6 -206229.9  412459.8    430638 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-10.4584   0.1761   0.5034   0.5664   1.3584 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.36     1.166   
Number of obs: 430643, groups:  child_name, 189

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              2.20415    0.05347  41.221  < 2e-16 ***
utt_is_intelligible      0.68356    0.02189  31.229  < 2e-16 ***
age                      0.66740    0.03751  17.791  < 2e-16 ***
utt_is_intelligible:age  0.22541    0.04085   5.518 3.43e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation 

## Effect of communicative feedback
### Positive Feedback: Timing

In [8]:
%%R -i conversations
library(lme4)

m_child_contingency<-glmer('follow_up_is_intelligible ~ has_response * age + (1 | child_name)', data=conversations, family=binomial)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response * age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 340043.8  340098.6 -170016.9  340033.8    430638 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.8845  0.2476  0.3230  0.4086 12.1736 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 3.207    1.791   
Number of obs: 430643, groups:  child_name, 189

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -0.47305    0.07022  -6.736 1.62e-11 ***
has_response      0.40228    0.02434  16.531  < 2e-16 ***
age               1.31050    0.04384  29.890  < 2e-16 ***
has_response:age  0.13527    0.04634   2.919  0.00351 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) hs

In [9]:
%%R -i conversations
library(lme4)

conversations_child_intelligible = subset(conversations, utt_is_intelligible==1)

m_child_contingency<-glmer('follow_up_is_intelligible ~ has_response * age + (1 | child_name)', data=conversations_child_intelligible, family=binomial)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response * age + (1 | child_name)
   Data: conversations_child_intelligible

      AIC       BIC    logLik  deviance  df.resid 
 248756.5  248810.4 -124373.3  248746.5    354438 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.8631  0.2818  0.3078  0.3625  2.1229 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.828    1.352   
Number of obs: 354443, groups:  child_name, 178

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       0.50781    0.08542   5.945 2.77e-09 ***
has_response      0.28359    0.02862   9.910  < 2e-16 ***
age               0.70087    0.04726  14.829  < 2e-16 ***
has_response:age  0.15431    0.05153   2.995  0.00275 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
  

### Negative Feedback: Clarification requests

In [10]:
%%R -i conversations_melted
library(lme4)

conversations_with_response = subset(conversations_melted, has_response == 1)
conversations_cr = subset(conversations_with_response, response_is_clarification_request == 1)

# TODO: conversation_id or is_follow_up in random effects?
m_child_contingency<-glmer('is_intelligible ~ is_follow_up * age + (1 | child_name) + (1 | conversation_id)', data=conversations_cr, family=binomial, control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e5)))
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ is_follow_up * age + (1 | child_name) + (1 |  
    conversation_id)
   Data: conversations_cr
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))

     AIC      BIC   logLik deviance df.resid 
  4538.1   4576.0  -2263.0   4526.1     4140 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.5431 -0.5561  0.3747  0.5258  2.2593 

Random effects:
 Groups          Name        Variance Std.Dev.
 conversation_id (Intercept) 1.462    1.209   
 child_name      (Intercept) 1.382    1.175   
Number of obs: 4146, groups:  conversation_id, 2073; child_name, 79

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -1.3939     0.2573  -5.418 6.03e-08 ***
is_follow_up       0.6010     0.2199   2.734  0.00627 ** 
age                3.0589     0.3849   7.947 1.91e-15 ***
is_follow_u

In [11]:
%%R -i conversations_melted
library(lme4)

conversations_with_response = subset(conversations_melted, has_response == 1)


m_child_contingency<-glmer('is_intelligible ~ response_is_clarification_request * is_follow_up + (1 | age) + (1 | child_name) + (1 | is_follow_up)', data=conversations_with_response, family=binomial, control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e4)))
print(summary(m_child_contingency))

R[write to console]: boundary (singular) fit: see ?isSingular



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ response_is_clarification_request * is_follow_up +  
    (1 | age) + (1 | child_name) + (1 | is_follow_up)
   Data: conversations_with_response
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 10000))

      AIC       BIC    logLik  deviance  df.resid 
 507498.9  507578.9 -253742.4  507484.9    680745 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.8686  0.2756  0.3064  0.3561 14.4815 

Random effects:
 Groups       Name        Variance Std.Dev.
 child_name   (Intercept) 2.9612   1.7208  
 age          (Intercept) 0.3725   0.6103  
 is_follow_up (Intercept) 0.0000   0.0000  
Number of obs: 680752, groups:  child_name, 189; age, 7; is_follow_up, 2

Fixed effects:
                                                Estimate Std. Error z value
(Intercept)                                     0.704272   0.068233