In [2]:
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

In [3]:
from rpy2.robjects.packages import importr

utils = importr("utils")
utils.chooseCRANmirror(ind=1)
#utils.install_packages('lme4')

%load_ext rpy2.ipython

In [32]:
# Load data
conversations = pd.read_csv("results/intelligibility/conversations.csv")
conversations_melted = pd.read_csv("results/intelligibility/conversations_melted.csv")

# convert True/False to 0/1:
conversations.replace({False: 0, True: 1}, inplace=True)
conversations_melted.replace({False: 0, True: 1}, inplace=True)

# normalize age
min_age, max_age = conversations["age"].min(), conversations["age"].max()
conversations["age"] = (conversations["age"] - min_age) / (max_age - min_age) * (1 - 0)
conversations_melted["age"] = (conversations_melted["age"] - min_age) / (max_age - min_age) * (1 - 0)

conversations.head()

Unnamed: 0,utterance_id,speaker_code,tokens,pos,age,corpus,transcript_file,child_name,speaker_code_next,start_time_next,...,follow_up_start_time,follow_up_end_time,follow_up_is_speech_related,follow_up_is_intelligible,follow_up_speech_act,response_latency,response_latency_follow_up,has_response,response_is_clarification_request,pos_feedback
0,28,CHI,"[""let's"", 'get', 'some', 'glue', 'too', '.']","['v', 'pro:obj', 'v', 'qn', 'n', 'adv']",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,96687.0,...,97154.0,100026.0,1.0,1,PF,0.0,467.0,1,0,1
1,50,CHI,"['i', 'get', 'some', 'glue', 'too', '.']","['pro:sub', 'v', 'qn', 'n', 'adv']",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,148262.0,...,150114.0,151434.0,1.0,0,AA,0.0,1852.0,0,0,0
2,52,CHI,"['broken', '.']",['part'],0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,151434.0,...,155422.0,161449.0,1.0,0,ST,0.0,3988.0,1,0,1
3,58,CHI,"['huh', 'right', 'there', 'see', 'his', 'legs'...","['co', 'adv', 'adv', 'v', 'det:poss', 'n', 'v'...",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,169064.0,...,172202.0,173521.0,1.0,1,AD,0.0,3138.0,1,0,1
4,78,CHI,"['hi', 'jenny', 'how', 'you', 'fine', 'hi', 'h...","['co', 'n:prop', 'pro:rel', 'pro:per', 'adv', ...",0.5,Bloom,/home/mitja/data/CHILDES/Bloom/Peter/020324.cha,Bloom_Peter,MOT,243831.0,...,249125.0,249641.0,1.0,1,AA,0.0,5294.0,1,0,1


## Quality of communicative feedback/ Caregiver contingency


### Timing:

In [5]:
%%R -i conversations
library(lme4)

m_caregiver_contingency<-glmer('has_response ~ utt_is_intelligible * age + (1 | child_name)', data=conversations, family=binomial)
print(summary(m_caregiver_contingency))


R[write to console]: Loading required package: Matrix



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: has_response ~ utt_is_intelligible * age + (1 | child_name)
   Data: conversations

      AIC       BIC    logLik  deviance  df.resid 
 288574.9  288628.6 -144282.4  288564.9    341501 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-24.3482   0.0985   0.4977   0.5367   3.9629 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 3.644    1.909   
Number of obs: 341506, groups:  child_name, 45

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              2.36294    0.06879  34.349   <2e-16 ***
utt_is_intelligible      0.80023    0.02886  27.728   <2e-16 ***
age                      0.79530    0.04718  16.858   <2e-16 ***
utt_is_intelligible:age  0.10948    0.05063   2.162   0.0306 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation o

### Clarification requests

In [6]:
conversations["response_is_no_clarification_request"] = conversations.response_is_clarification_request.replace({0:1, 1:0})


In [7]:
%%R -i conversations
library(lme4)

# TODO: filter out cases without responses?
m_caregiver_contingency<-glmer('response_is_clarification_request ~ utt_is_intelligible * age + (1 | child_name)', data=conversations, family=binomial)
print(summary(m_caregiver_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: response_is_clarification_request ~ utt_is_intelligible * age +  
    (1 | child_name)
   Data: conversations

     AIC      BIC   logLik deviance df.resid 
 20855.6  20909.3 -10422.8  20845.6   341501 

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-0.355 -0.072 -0.070 -0.028 36.763 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 0.7563   0.8696  
Number of obs: 341506, groups:  child_name, 45

Fixed effects:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -3.6178     0.1485 -24.367   <2e-16 ***
utt_is_intelligible      -1.9172     0.0971 -19.744   <2e-16 ***
age                      -0.1762     0.1643  -1.073    0.283    
utt_is_intelligible:age   0.0634     0.1718   0.369    0.712    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of F

### Combined (Positive Feedback = No pause, no clarification request)

In [None]:
%%R -i conversations
library(lme4)

m_caregiver_contingency<-glmer('pos_feedback ~ utt_is_intelligible * age + (1 | child_name)', data=conversations, family=binomial)
print(summary(m_caregiver_contingency))

## Effect of communicative feedback
### Positive Feedback: Timing

In [12]:
%%R -i conversations
library(lme4)

m_child_contingency<-glmer('follow_up_is_intelligible ~ has_response * age + (1 | child_name)', data=conversations, family=binomial)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response * age + (1 | child_name)
   Data: conversations

     AIC      BIC   logLik deviance df.resid 
191092.7 191146.4 -95541.4 191082.7   341501 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-10.3677   0.1985   0.2490   0.3245  10.8294 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 4.06     2.015   
Number of obs: 341506, groups:  child_name, 45

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -0.15185    0.10862  -1.398    0.162    
has_response      0.57887    0.03441  16.821  < 2e-16 ***
age               2.40294    0.05918  40.602  < 2e-16 ***
has_response:age  0.31376    0.06552   4.789 1.68e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) hs_

In [18]:
%%R -i conversations
library(lme4)

conversations_child_intelligible = subset(conversations, utt_is_intelligible==1)

m_child_contingency<-glmer('follow_up_is_intelligible ~ has_response * age + (1 | child_name)', data=conversations_child_intelligible, family=binomial)
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: follow_up_is_intelligible ~ has_response * age + (1 | child_name)
   Data: conversations_child_intelligible

      AIC       BIC    logLik  deviance  df.resid 
 235203.7  235256.5 -117596.9  235193.7    280874 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.8729  0.3423  0.3731  0.4486  2.1207 

Random effects:
 Groups     Name        Variance Std.Dev.
 child_name (Intercept) 1.403    1.185   
Number of obs: 280879, groups:  child_name, 49

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       0.65018    0.10077   6.452  1.1e-10 ***
has_response      0.40953    0.03242  12.630  < 2e-16 ***
age               0.42758    0.04782   8.941  < 2e-16 ***
has_response:age -0.08326    0.05716  -1.457    0.145    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
   

### Negative Feedback: Clarification requests

In [33]:
%%R -i conversations_melted
library(lme4)

conversations_with_response = subset(conversations_melted, has_response == 1)
conversations_cr = subset(conversations_with_response, response_is_clarification_request == 1)

# TODO: conversation_id or is_follow_up in random effects?
m_child_contingency<-glmer('is_intelligible ~ is_follow_up * age + (1 | child_name) + (1 | conversation_id)', data=conversations_cr, family=binomial, control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e5)))
print(summary(m_child_contingency))

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ is_follow_up * age + (1 | child_name) + (1 |  
    conversation_id)
   Data: conversations_cr
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))

     AIC      BIC   logLik deviance df.resid 
  3087.8   3123.3  -1537.9   3075.8     2728 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.3505 -0.6435  0.4090  0.5519  2.3932 

Random effects:
 Groups          Name        Variance Std.Dev.
 conversation_id (Intercept) 1.028    1.014   
 child_name      (Intercept) 1.357    1.165   
Number of obs: 2734, groups:  conversation_id, 1367; child_name, 40

Fixed effects:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -0.7037     0.3186  -2.209   0.0272 *  
is_follow_up       0.5595     0.2809   1.992   0.0464 *  
age                2.3239     0.4352   5.340 9.29e-08 ***
is_follow_u

In [34]:
%%R -i conversations_melted
library(lme4)

conversations_with_response = subset(conversations_melted, has_response == 1)


m_child_contingency<-glmer('is_intelligible ~ response_is_clarification_request * is_follow_up + (1 | age) + (1 | child_name) + (1 | is_follow_up)', data=conversations_with_response, family=binomial, control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=1e4)))
print(summary(m_child_contingency))

R[write to console]: boundary (singular) fit: see ?isSingular



Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: is_intelligible ~ response_is_clarification_request * is_follow_up +  
    (1 | age) + (1 | child_name) + (1 | is_follow_up)
   Data: conversations_with_response
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 10000))

      AIC       BIC    logLik  deviance  df.resid 
 389811.9  389889.6 -194898.9  389797.9    488613 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.1061  0.3220  0.3396  0.3824  8.4581 

Random effects:
 Groups       Name        Variance Std.Dev.
 child_name   (Intercept) 2.1763   1.4752  
 age          (Intercept) 0.2402   0.4902  
 is_follow_up (Intercept) 0.0000   0.0000  
Number of obs: 488620, groups:  child_name, 45; age, 7; is_follow_up, 2

Fixed effects:
                                                Estimate Std. Error z value
(Intercept)                                     0.886203   0.076206 