In [3]:
# Keep strings as character vectors (not factors) when data frames are built.
options(stringsAsFactors = FALSE)

In [2]:
#install.packages(c('quanteda', 'tm', 'ggplot2', 'koRpus', 'RMySQL', 'plyr', 'dplyr'), repos = "http://cran.rstudio.com/")

In [41]:
library('grid')
library('gridExtra')

In [42]:
source('config.R')
library('lattice')
library('tm')
library('quanteda')
library('koRpus')
library('RMySQL')
library('ggplot2')
library('parallel')
library('plyr')
library('dplyr')
library('grid')
library('gridExtra')


In [43]:
# Open the MySQL connection used for the CHILDES queries below.
# Credentials and host come from the `config` list (loaded by config.R).
childes_db <- dbConnect(
  MySQL(),
  user = config[["username"]],
  password = config[["password"]],
  dbname = config[["dbasename"]],
  host = config[["host"]]
)

In [38]:
#dbListTables(childes_db)

In [44]:
# Pull every row of the `words` table whose speaker is 'CHI'.
# NOTE(review): `age != 'NA'` compares against the literal string 'NA'; whether
# this drops missing ages depends on how `age` is stored in the table -- confirm.
# NOTE: the result is stored as `all`, which shadows base::all as a variable;
# calls such as all(x) still resolve to the base function.
all = dbGetQuery(childes_db, "select * from words where speaker = 'CHI' AND age != 'NA' ")

In [45]:
# Number of CPU cores reported by parallel::detectCores().
n.cores = detectCores()

In [None]:
# Show the column names returned by the query.
names(all)

In [12]:
# Assign a child name to every row.
# The name is the second space-separated token of the row's `participants`
# string; rows whose `participants` is NA keep the placeholder 0.
all$child <- 0
getchi <- split(all, all$participants)
participant_keys <- names(getchi)

# seq_along() (rather than 1:length()) keeps the loop a no-op when there are
# no participant groups.
for (elem in seq_along(participant_keys)) {
  chi_name <- unlist(strsplit(participant_keys[elem], " "))[2]
  all$child[all$participants == participant_keys[elem]] <- chi_name
}

# Clean up child names before adding tokens: for these corpora the second
# token is a role label rather than the child's name.
# NOTE(review): the label-to-child mapping is taken from the original script --
# confirm against the participants strings in the database.
all$child[all$child == "Target"] <- "Eric"
all$child[all$child == "Target_Child"] <- "Gia"
all$child[all$child == "Mother"] <- "Nina"

# Remove Gia because CHI only says 902 tokens, which is less than the 1024
# tokens needed for a single lexical diversity metric.
# %in% never yields NA, so a row whose child name is NA is kept as-is instead
# of being turned into an all-NA row by `all$child != "Gia"`.
all <- all[!(all$child %in% "Gia"), ]

In [10]:
# Show the column names again, now including the added `child` column.
names(all)

In [13]:
# Assign a running token index within each child (1, 2, ... in row order);
# get_tok_set() uses it to cut each child's tokens into fixed-size chunks.
# ave() numbers every group in one pass, replacing the per-child loop that
# rescanned the whole column and relied on 1:length().
# NOTE: rows whose `child` is NA are not part of any group and keep their
# overall row number.
all$chi_index <- as.numeric(
  ave(seq_len(nrow(all)), all$child, FUN = seq_along)
)

In [14]:
# One data frame per child; the age summaries for the graph are computed
# from these.
splitchi <- split(all, all$child)
# Dimensions (rows, columns) of each child's data frame, as a sanity check.
dimchi <- lapply(splitchi, dim)

In [15]:
# Per-child age summaries (NA ages ignored), each kept as a list named by
# child, then gathered into one data frame with a row per child.
max_age <- lapply(splitchi, function(chi) max(chi$age, na.rm = TRUE))
median_age <- lapply(splitchi, function(chi) median(chi$age, na.rm = TRUE))
min_age <- lapply(splitchi, function(chi) min(chi$age, na.rm = TRUE))
mean_age <- lapply(splitchi, function(chi) mean(chi$age, na.rm = TRUE))

allchigraph <- data.frame(
  child = names(mean_age),
  max_age = unlist(max_age),
  min_age = unlist(min_age),
  median_age = unlist(median_age),
  mean_age = unlist(mean_age)
)

In [16]:
# Distinct child names, taken from the per-child summary table.
childnames = unique(allchigraph$child)

In [17]:
##############Compute lexical diversity of Uber Index, Yule's I, and MTLD by child#################
#returns a list of dataframes

# Split one child's tokens into overlapping windows for lexical-diversity
# scoring.
#
# The child's rows (read from the global data frame `all`) are trimmed to a
# whole number of `half_window`-token chunks; every pair of consecutive chunks
# is then bound into one window of 2 * half_window tokens, so neighbouring
# windows overlap by half_window tokens.
#
# Args:
#   child_name:  value of all$child selecting the child.
#   half_window: tokens per chunk; the default 512 gives 1024-token windows.
#
# Returns:
#   A list of data frames, one per window, named by the median age of the
#   window's rows. The list is empty when the child has fewer than
#   2 * half_window tokens (the original code failed in that case because
#   1:0 and 1:(n - 1) count downwards).
get_tok_set <- function(child_name, half_window = 512){
    stopifnot(length(half_window) == 1, half_window >= 1)
    chi_rows <- all[all$child == child_name,]
    usable_tok <- nrow(chi_rows) - nrow(chi_rows) %% half_window
    # Keep only the tokens that fill complete chunks.
    chi_rows <- chi_rows[chi_rows$chi_index %in% seq_len(usable_tok),]
    n_chunks <- nrow(chi_rows) %/% half_window
    if (n_chunks < 2){
        return(list())
    }
    # Chunk id per row: 1 repeated half_window times, then 2, and so on.
    chi_rows$tok_grp <- rep(seq_len(n_chunks), each = half_window)
    tok_chunks <- split(chi_rows, factor(chi_rows$tok_grp))
    tok_windows <- lapply(
        seq_len(n_chunks - 1),
        function(i) rbind(tok_chunks[[i]], tok_chunks[[i + 1]])
    )
    names(tok_windows) <- vapply(
        tok_windows,
        function(win) as.numeric(median(win$age, na.rm = TRUE)),
        numeric(1)
    )
    tok_windows
}



In [19]:
# Turn one window of tokens into a koRpus tokenized-text object.
#
# Args:
#   child_rows: data frame with a `gloss` column, one token per row.
#
# Returns:
#   The object produced by koRpus's tokenize() for the space-joined glosses.
tagging_text <- function(child_rows){
    tokens <- paste(as.character(child_rows$gloss), collapse = " ")
    # Namespace-qualified on purpose: several text packages are attached in
    # this script, and an unqualified tokenize() would resolve to whichever
    # attached package defines the name last. format/lang are koRpus arguments.
    koRpus::tokenize(tokens, format = "obj", lang = "en")
}


In [20]:
# Inverse of the K.ld score for a tagged text.
#
# Args:
#   tagged_text: tokenized text object as returned by tagging_text().
#
# Returns 1 divided by the K.ld slot of the K.ld() result.
compute_inv_K <- function(tagged_text){
    k_result <- K.ld(tagged_text, detailed = FALSE, char = c(), quiet = TRUE)
    1 / k_result@K.ld
}
                      

In [21]:
# Lexical diversity measure "U" for one window of tokens.
#
# Args:
#   child_rows: data frame with a `gloss` column, one token per row.
#
# Returns the unnamed result of lexdiv(measure = "U") on a document-feature
# matrix built from the space-joined glosses.
compute_U <- function(child_rows){
    joined_gloss <- paste(as.character(child_rows$gloss), collapse = " ")
    chi_dfm <- dfm(joined_gloss)
    unname(lexdiv(x = chi_dfm, measure = "U"))
}

In [22]:
# MTLD score for a tagged text.
#
# Args:
#   tagged_text: tokenized text object as returned by tagging_text().
#
# Returns the `MTLD` element of the MTLD slot of the MTLD() result.
compute_MTLD <- function(tagged_text){
    mtld_result <- MTLD(tagged_text)
    mtld_result@MTLD$MTLD
}


In [23]:
# Re-derive the child names directly from the token table (overwrites the
# value taken earlier from allchigraph).
childnames = unique(all$child)
# Compute all three lexical-diversity metrics for one token window.
#
# Args:
#   tok_1024: data frame holding one window of tokens (1024 rows), with
#             `gloss` and `age` columns.
#
# Returns a one-row data frame with columns med_age, U_LD, inv_K_LD, MTLD_LD.
ld_metric_1024 <- function(tok_1024){
    U_LD <- compute_U(tok_1024)
    # The tagged text is built once and shared by the two koRpus metrics.
    tagged <- tagging_text(tok_1024)
    inv_K_LD <- compute_inv_K(tagged)
    MTLD_LD <- compute_MTLD(tagged)
    med_age <- median(tok_1024$age, na.rm = TRUE)
    data.frame(med_age, U_LD, inv_K_LD, MTLD_LD)
}
                    

In [24]:
# Score every token window of one child.
#
# Args:
#   child_name: child whose windows are produced by get_tok_set().
#
# Returns the per-window results of ld_metric_1024() row-bound together.
process_one_child <- function(child_name){
    windows <- get_tok_set(child_name = child_name)
    per_window <- lapply(windows, ld_metric_1024)
    do.call("rbind", per_window)
}


In [None]:
# Run the full lexical-diversity pipeline for every child, sequentially.
# The commented-out line is the multi-core variant of the same call.
#allchiLD = mclapply(as.character(childnames), process_one_child, mc.cores=detectCores())
allchiLD = lapply(as.character(childnames), process_one_child)


In [None]:
# Label each result with its child (same order as the lapply input).
names(allchiLD) = childnames

In [None]:
# Inspect the first child's results.
allchiLD[[1]]

In [None]:
# Build the smoothed age-vs-lexical-diversity plot for one child.
#
# Args:
#   child_name: name of an element of the global list `allchiLD`; also used
#               as the plot title.
#   LD:         name of the column to plot on the y axis
#               ("U_LD", "inv_K_LD" or "MTLD_LD").
#
# Returns the ggplot object.
LDplotfunction <- function(child_name, LD){
    # Axis-label prefix for the metric. Any other column name is used as its
    # own label, instead of failing on an undefined y.text as the separate
    # if-blocks did.
    y.text <- switch(LD,
        U_LD = "Uber",
        inv_K_LD = "Inverse K",
        MTLD_LD = "MTLD",
        LD
    )
    y_label <- paste(c(y.text, "Lexical Diversity"), collapse = " ")
    ggplot(data = allchiLD[[child_name]], aes_string(x = "med_age", y = LD)) +
        geom_smooth(fill = NA, size = 1) +
        labs(x = "Age (days)", y = y_label, title = child_name)
}

In [None]:
# Eric does not have enough observations for a loess curve, so drop him.
allchiLD[["Eric"]] <- NULL
# Keep only the Naima windows that have a median age.
naima_ld <- allchiLD[["Naima"]]
allchiLD[["Naima"]] <- naima_ld[!is.na(naima_ld$med_age), ]

In [None]:
# One plot per child for each metric, in the order of names(allchiLD).
Uplotlist <- lapply(
  names(allchiLD),
  function(child) LDplotfunction(child, "U_LD")
)
Kplotlist <- lapply(
  names(allchiLD),
  function(child) LDplotfunction(child, "inv_K_LD")
)
MTLDplotlist <- lapply(
  names(allchiLD),
  function(child) LDplotfunction(child, "MTLD_LD")
)

In [None]:
# Plot all graphs of the same type together, four per row.
# The commented-out pdf()/dev.off() pairs write each grid to a file.

# Uber index
#pdf(file = "UberLexDivDevChi.pdf", width = 28, height = 18)
Uplotall <- do.call(grid.arrange, c(Uplotlist, list(ncol = 4)))
#grid.arrange(Uplotall)
#dev.off()

# Inverse K
#pdf(file = "YulesKLexDivDevChi.pdf", width = 28, height = 18)
Kplotall <- do.call(grid.arrange, c(Kplotlist, list(ncol = 4)))
#grid.arrange(Kplotall)
#dev.off()

# MTLD
#pdf(file = "MTLDLexDivDevChi.pdf", width = 28, height = 18)
MTLDplotall <- do.call(grid.arrange, c(MTLDplotlist, list(ncol = 4)))
#grid.arrange(MTLDplotall)
#dev.off()

In [None]:
# Child label for every row of the stacked results: each child's name is
# repeated once per row of that child's data frame, in list order.
# A single rep() replaces the loop that grew the vector with c() and
# recomputed sapply(allchiLD, dim) on every iteration.
namevec <- rep(names(allchiLD), times = vapply(allchiLD, nrow, integer(1)))

In [None]:
# Stack the per-child result data frames into one data frame.
all_chiLD_df = do.call("rbind", allchiLD)

In [None]:
# Attach the child label to each stacked row, then derive gender from it.
# Children in neither list keep the placeholder 0.
all_chiLD_df$child <- namevec
all_chiLD_df$gender <- 0
all_chiLD_df$gender[
  all_chiLD_df$child %in%
    c("Nina", "Lily", "Naima", "Violet", "Eve", "Sarah", "Naomi")
] <- "female"
all_chiLD_df$gender[
  all_chiLD_df$child %in%
    c("Eric", "Peter", "Alex", "Ethan", "William", "Adam", "Abe")
] <- "male"

In [None]:
# Plot generated in meeting on 3/9/16: Uber LD over age, coloured by gender,
# with a cubic-polynomial smooth per gender.
ggplot(all_chiLD_df, aes(x = med_age, y = U_LD)) +
  geom_point(aes(colour = gender)) +
  stat_smooth(
    aes(colour = gender),
    method = "gam",
    formula = y ~ poly(x, 3),
    span = 0.1
  )

In [None]:
# Plot generated in meeting on 3/9/16: inverse K LD over age, coloured by
# gender, with a cubic-polynomial smooth per gender.
ggplot(all_chiLD_df, aes(x = med_age, y = inv_K_LD)) +
  geom_point(aes(colour = gender)) +
  stat_smooth(
    aes(colour = gender),
    method = "gam",
    formula = y ~ poly(x, 3),
    span = 0.1
  )

In [None]:
# Plot generated in meeting on 3/9/16: MTLD LD over age, coloured by gender,
# with a cubic-polynomial smooth per gender.
ggplot(all_chiLD_df, aes(x = med_age, y = MTLD_LD)) +
  geom_point(aes(colour = gender)) +
  stat_smooth(
    aes(colour = gender),
    method = "gam",
    formula = y ~ poly(x, 3),
    span = 0.1
  )