Extract tokens from longitudinal corpora, including:
<ul>
<li>Bloom 1970</li>
<li>Brown</li>
<li>Suppes</li>
<li>Providence</li>
<li>Sachs</li>
</ul>

For further consideration:
<ul>
<li>Higginson (Naturalistic short-term longitudinal observations of mother–child interactions in unstructured play sessions)</li>
<li>Post</li>
<li>Bates</li>
<li>Demetras</li>
<li>Braunwald</li>
<li>Clark</li>
<li>Davis</li>
<li>Feldman</li>
<li>Inkelas (no morphology?)</li>
<li>MacWhinney</li>
<li>Weist</li>
</ul>

In [24]:
# Session setup: global options, project helpers, packages and the constants
# shared by the token-extraction functions below.
options(stringsAsFactors = FALSE)
source("CLANtoR.R")
source("config.R")
# library() attaches ONE package per call; its 2nd-4th positional arguments are
# `help`, `pos` and `lib.loc`, so the packages have to be loaded one at a time.
library('parallel')
library('plyr')
library('tools')
library('stringr')
# Gloss tokens that sentenceHandler labels 'EXC' instead of pairing with a mor code
wordsToExclude <- c("hmm", "hm", "mm", "uh", "uhh", "ah", "um","uhhuh","eh","xxx","yyy", "xx", 'yy','aw', 'www','er','ka')
# Gloss tokens that sentenceHandler labels 'NO_UTT'
noUtt <- c("0")
# Columns of the utterance row that are copied onto every token row
metadataRows <- c('sentGloss','sentMor','Speaker','xgr','pho','act','gpx','sit','com', 'par','Filename','Participants',
                 'Date',"Language","Corpus","Age","Gender","Utt.Number","index","add","alt","int","spa","err","eng")
# Regex alternation matching the bracketed markers [//], [/], [/?], [///] and [/-]
reformulations <- paste(c("\\[\\/\\/\\]","\\[\\/\\]", "\\[\\/\\?\\]", "\\[\\/\\/\\/\\]","\\[\\/-\\]"), collapse='|')

In [23]:
sentenceHandler <- function(row){
    # Expand one utterance row into one row per word.
    #
    # Args:
    #   row: a single-row data frame (processClanFile passes df[i,]); its
    #        `Gloss` and `mor` columns are read here.
    # Returns:
    #   A data frame with columns Gloss, mor, atTags plus every column of `row`
    #   named in `metadataRows`, or NULL when the cleaned mor line is longer
    #   than the cleaned gloss. Stops when no words remain.
    #
    # Known issue (from the original author): many sentences lack some of the
    # word-level metadata, which makes the gloss and mor vectors differ in length.

    rawGloss <- row$Gloss
    cleanedMor <- cleanMOR(row$mor)

    if (length(grep(reformulations, rawGloss)) > 0){
        # Reformulation markers present: realign the mor codes against the gloss
        # (tag-free, space-joined), then drop the entries that could not be filled.
        taglessGloss <- paste(removeAudioTags(strsplit(rawGloss, ' ')[[1]]), collapse=' ')
        mors <- process_backslash_mor(original_gloss = taglessGloss, cleanedMor, reformulations)
        mors <- mors[!is.na(mors)]
    } else {
        mors <- cleanedMor
    }

    glosses <- cleanGloss(rawGloss)

    # More mor codes than words: there is no recovery for this case, skip the row.
    if (length(glosses) < length(mors)){
        return(NULL)
    }

    if (length(glosses) > length(mors)){
        # More words than mor codes. Walk the words in order; words that never
        # receive a mor code get a placeholder label, every other word takes the
        # next unused mor code. Indexing mors by the gloss itself is not an
        # option because of compound codes such as don't == aux|do~neg|not.
        newMor <- numeric(length(mors))
        skipped <- 0 # number of placeholder words seen so far (gloss index minus mor index)

        for (i in seq_along(glosses)){
            word <- glosses[i]
            placeholder <- NULL
            if (length(grep('&=', word)) > 0){
                placeholder <- 'NON-LINGUISTIC'
            } else if ((length(grep('^&', word)) > 0) | word %in% wordsToExclude){
                placeholder <- 'EXC'
            } else if (word %in% noUtt){
                placeholder <- 'NO_UTT'
            }

            if (is.null(placeholder)){
                newMor[i] <- mors[i - skipped]
            } else {
                newMor[i] <- placeholder
                skipped <- skipped + 1
            }
        }

        if (length(glosses) != length(newMor)){
            print('Glosses: ')
            print(glosses)
            print('Original Mors:')
            print(mors)
            print('Corrected Mors:')
            print(newMor)
            stop('Recovery process for longer gloss failed')
        }
        mors <- newMor
    }

    # Separate each word from its '@' suffix; words without one get the string 'NA'.
    glossParts <- strsplit(glosses, '@')
    glosses <- sapply(glossParts, function(parts){ parts[1] })
    atTags <- sapply(glossParts, function(parts){
        if (length(parts) > 1) parts[2] else 'NA'
    })

    rd <- data.frame(Gloss = glosses, mor = mors, atTags)
    if (nrow(rd) == 0){
        print(row)
        stop('Zero-length return data')
    }

    # Keep the whole cleaned sentence on every token row.
    row$sentGloss <- paste(glosses, collapse = ' ')
    row$sentMor <- paste(mors, collapse= ' ')
    if ('xgr' %in% names(row)){
        row$xgr <- gsub('\\t','',row$xgr)
    }

    # Only the desired metadata columns that are actually present in this row.
    keptColumns <- names(row)[names(row) %in% metadataRows]
    cbind(rd, row[,keptColumns], row.names = NULL) # number of words x columns
}

process_backslash_mor <- function(original_gloss, original_mor, reformulations){
    # Align mor codes with a gloss that contains reformulation markers.
    #
    # Args:
    #   original_gloss: a single string; the gloss with space-separated tokens.
    #   original_mor:   character vector of mor codes, in utterance order.
    #   reformulations: regex that matches the reformulation markers.
    # Returns:
    #   A character vector in utterance order with one entry per gloss token
    #   that is not a marker or angle bracket: 'BRK' for tokens belonging to a
    #   reformulated stretch, otherwise the next mor code (NA once the mor
    #   codes are exhausted). Zero-length when the gloss has no such tokens.

    original_gloss <- gsub('[()]', '', original_gloss)
    og_sep <- gsub('<', '< ', gsub('>', ' >', original_gloss))

    # Reverse both sequences and parse from the back.
    og <- rev(strsplit(og_sep, ' ')[[1]])
    # Doubled spaces produce empty tokens; they are never words, so they must
    # not consume a mor code or a 'BRK' slot.
    og <- og[!is.na(og) & nzchar(og)]
    # Drop the utterance-final punctuation, if it exists. The length/NA guards
    # keep `if` from seeing a missing or zero-length condition on empty input.
    if (length(og) > 0 && nchar(gsub('[[:punct:]]', '', og[1])) == 0){
        og <- og[-1]
    }
    om <- rev(as.character(original_mor))
    if (length(om) > 0 && !is.na(om[1]) && nchar(gsub('[[:punct:]]', '', om[1])) == 0){
        om <- om[-1]
    }

    # Because the tokens are read back to front, the marker is seen first, then
    # '>' opens the bracketed stretch and '<' closes it.
    inReformulation <- FALSE
    markerPrevious <- TRUE
    new_mor <- character(length(og))
    new_mor_index <- 0
    old_mor_index <- 0

    for (i in seq_along(og)){
        token <- og[i]
        if (grepl(reformulations, token)){
            # reformulation marker found
            inReformulation <- TRUE
            markerPrevious <- TRUE
        } else if (inReformulation && token == '>'){
            # bracket next to the marker: a multi-word stretch follows
            markerPrevious <- FALSE
        } else if (inReformulation && token == '<'){
            # far end of the bracketed stretch: leave reformulation mode
            inReformulation <- FALSE
        } else if (inReformulation && markerPrevious){
            # no bracket after the marker: a single reformulated word
            new_mor_index <- new_mor_index + 1
            new_mor[new_mor_index] <- 'BRK'
            inReformulation <- FALSE
            markerPrevious <- FALSE
        } else if (inReformulation){
            # word inside the bracketed stretch
            new_mor_index <- new_mor_index + 1
            new_mor[new_mor_index] <- 'BRK'
        } else {
            # ordinary word: consume the next mor code
            new_mor_index <- new_mor_index + 1
            old_mor_index <- old_mor_index + 1
            new_mor[new_mor_index] <- om[old_mor_index]
        }
    }

    rev(new_mor[seq_len(new_mor_index)])
}

removeAudioTags <- function(unlistedwords){
    # Remove tag-like tokens from a vector of words.
    #
    # A token counts as a tag when it is longer than five characters and at
    # most two characters remain after deleting everything matched by the
    # pattern '[[:digit:]_\\^+U] *'.
    #
    # Args:
    #   unlistedwords: vector of tokens.
    # Returns:
    #   The same vector without the tag-like tokens.
    isLong <- nchar(unlistedwords) > 5
    leftover <- gsub('[[:digit:]_\\^+U] *', '', unlistedwords)
    isTag <- isLong & (nchar(leftover) <= 2)

    if (!any(isTag)){
        return(unlistedwords)
    }
    unlistedwords[-which(isTag)]
}

cleanGloss <- function(gloss){
    # Split a gloss string into cleaned word tokens.
    #
    # Args:
    #   gloss: the utterance gloss as a single string.
    # Returns:
    #   The remaining words after removeAudioTags() has been applied.
    originalGloss <- gloss
    # remove punctuation other than apostrophe and the characters _ @ + & =
    gloss <- gsub("[^[:alnum:][:space:]'_@+&=]", '', gloss)
    gloss <- gsub('\342\200\234','\342\200\234 ', gloss) #preceding quotes
    gloss <- gsub('\342\200\235',' \342\200\235', gloss) #following quotes
    # Line breaks and tabs separate words just like spaces do. They have to be
    # converted BEFORE splitting; otherwise two words joined only by a line
    # break stay glued together as one token (cleanMOR re-splits for the same reason).
    gloss <- gsub("\\n|\\t", " ", gloss)
    unlistedwords <- unlist(strsplit(gloss, split = " "))
    unlistedwords <- unlistedwords[!(unlistedwords == '')]
    # remove punctuation-only words
    unlistedwords <- unlistedwords[nchar(gsub('[[:punct:]]', '', unlistedwords)) > 0]
    if (length(unlistedwords) == 0){
        # diagnostic output for utterances that end up with no words
        print(originalGloss)
        print(unlistedwords)
    }
    # tag-like tokens are not words; don't return them
    return(removeAudioTags(unlistedwords))
}


cleanMOR <- function(mor){
    # Split a mor line into its individual codes.
    #
    # Args:
    #   mor: the mor line as a single string.
    # Returns:
    #   Character vector of the codes that contain '|', minus the comma and
    #   quote-bracket codes (cm|cm, none|cm, bq|bq, eq|eq), with the
    #   characters ! , ? / . stripped out.
    tokens <- unlist(strsplit(mor, " "))
    tokens <- gsub("cm\\|cm|none\\|cm", "", tokens)
    tokens <- tokens[grepl("\\|", tokens)]

    # codes joined by a line break or tab are separated here
    tokens <- unlist(strsplit(gsub("\\n|\\t", " ", tokens), split = " "))

    # !!! this punctuation information is probably worth keeping around
    tokens <- gsub("[!,?//.]", "", tokens)

    ignorable <- c("","bq|bq","eq|eq")
    tokens <- tokens[!(tokens %in% ignorable)]

    hasContent <- nchar(gsub('[[:punct:]]','', tokens)) > 0
    tokens[hasContent]
}

processClanFile <- function(filename){
    # Read one transcript and return its tokens, one row per word.
    #
    # Args:
    #   filename: path of the transcript, handed to read.CLAN.file().
    # Returns:
    #   The per-sentence data frames from sentenceHandler() combined with
    #   rbind.fill. NOTE(review): a transcript with no rows yields an empty
    #   list here, for which plyr's rbind.fill presumably returns NULL - confirm.
    # Stops when the transcript data frame has more than 35 columns.
    library('stringr')
    print(paste('Processing file:', filename))
    df <- read.CLAN.file(filename)
    if (ncol(df) > 35){ #!!! lower this number if possible
        print(names(df))
        stop(paste(filename, 'has an invalid structure: too many columns found'))
    }
    print(paste('CLANtoR produced dataframe with dimensions:',dim(df)[1], 'by', dim(df)[2]))

    # seq_len() instead of 1:nrow(df): for a transcript without rows, 1:0
    # would visit the non-existent rows 1 and 0.
    processedSentenceList <- lapply(seq_len(nrow(df)), function(i){sentenceHandler(df[i,])})
    print('Processed sentences')

    allTokens <- do.call('rbind.fill', processedSentenceList)
    return(allTokens)
}

processDirectory <- function(dirname){
    # Process every .cha transcript below a directory (recursively).
    #
    # Args:
    #   dirname: directory to search.
    # Returns:
    #   One data frame with the tokens of all transcripts, column names
    #   lower-cased and `age` converted with ageToDays(); NULL (with a
    #   warning) when there is nothing to return.

    # file.path() yields a zero-length result when no file matches, whereas
    # paste(dirname, character(0), sep='/') would invent the filename "dirname/".
    fnames <- file.path(dirname, list.files(dirname, recursive = TRUE, pattern = "\\.cha$"))
    print(paste('Processing', length(fnames), 'filenames'))
    if (length(fnames) == 0){
        warning(paste('No .cha files found in', dirname))
        return(NULL)
    }

    #!!! multicore this
    allFiles <- do.call('rbind.fill', lapply(fnames, processClanFile))
    #allFiles = do.call('rbind.fill', mclapply(fnames, processClanFile, mc.cores=detectCores()))
    if (is.null(allFiles)){
        # nothing was produced by any file; names<- cannot be applied to NULL
        warning(paste('No tokens extracted from', dirname))
        return(NULL)
    }
    names(allFiles) <- tolower(names(allFiles))
    allFiles$age <- sapply(allFiles$age, ageToDays)

    return(allFiles)
}

ageToDays <- function(age){
    # Convert an age string of the form "years;months" or "years;months.days"
    # into an approximate age in days (a month is counted as 30.5 days).
    #
    # Args:
    #   age: a single age string, e.g. "2;06.15".
    # Returns:
    #   Age in days, rounded up; NA when years or months are missing.
    ageParts <- strsplit(age, ';')[[1]]
    years <- as.numeric(ageParts[1])
    # The part after ';' is months.days; the days are whole days, not a
    # decimal fraction of a month, so they must be parsed separately.
    monthDay <- strsplit(ageParts[2], '.', fixed = TRUE)[[1]]
    months <- as.numeric(monthDay[1])
    days <- if (length(monthDay) > 1) as.numeric(monthDay[2]) else 0
    return(ceiling((12*30.5*years) + months*30.5 + days))
}

In [7]:
# Smoke test: process a single Bloom70 transcript
test <- processClanFile('/shared_hd0/corpora/childes_new/Bloom70/Peter/01.cha')

[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Peter/01.cha"
[1] "CLANtoR produced dataframe with dimensions: 2465 by 31"
[1] "Processed sentences"


In [9]:
# Extract the tokens of the whole Bloom70 corpus
bloom70 <- processDirectory('/shared_hd0/corpora/childes_new/Bloom70')

[1] "Processing 28 filenames"


In [None]:
# Count token rows whose sentence-level mor string ends in NA.
# processDirectory() lower-cases every column name, so the column is `sentmor`;
# `bloom70$sentMor` is NULL and would always give a count of 0.
length(grep('.* NA$', bloom70$sentmor))

In [None]:
# Look at 1000 randomly chosen token rows of bloom70 (gloss and mor only)
#bloom70$word[sample(1:nrow(bloom70),1000)]
bloom70[sample(seq_len(nrow(bloom70)), 1000), c('gloss', 'mor')]

In [3]:
# Open the MySQL connection; the credentials are read from `config`
# (presumably defined by config.R - confirm)
library('RMySQL')
childes_db <- dbConnect(
    MySQL(),
    user = config[['username']],
    password = config[['password']],
    dbname = config[['dbasename']],
    host = config[['host']]
)

Loading required package: DBI


In [16]:
# Write the Bloom70 tokens to the remote "words" table, overwriting any
# existing table of that name

dbWriteTable(childes_db, name = "words", value = bloom70, row.names = FALSE, overwrite = TRUE)

In [10]:
# To start processing, point processDirectory at a specific corpus directory
suppes <- processDirectory('/shared_hd0/corpora/childes_new/Suppes')

[1] "Processing 52 filenames"


In [17]:
# Append the Suppes tokens to the "words" table
dbWriteTable(childes_db, name = "words", value = suppes, row.names = FALSE, append = TRUE)

In [11]:
# Extract the tokens of the Providence corpus
providence <- processDirectory('/shared_hd0/corpora/childes_new/Providence')

[1] "Processing 364 filenames"


In [18]:
# Append the Providence tokens to the "words" table
dbWriteTable(childes_db, name = "words", value = providence, row.names = FALSE, append = TRUE)

In [12]:
# Extract the tokens of the Brown corpus
brown <- processDirectory('/shared_hd0/corpora/childes_new/Brown')

[1] "Processing 214 filenames"


In [19]:
# Append the Brown tokens to the "words" table
dbWriteTable(childes_db, name = "words", value = brown, row.names = FALSE, append = TRUE)

In [13]:
# Extract the tokens of the Kuczaj corpus
kuczaj <- processDirectory('/shared_hd0/corpora/childes_new/Kuczaj')

[1] "Processing 210 filenames"


In [20]:
# Append the Kuczaj tokens to the "words" table
dbWriteTable(childes_db, name = "words", value = kuczaj, row.names = FALSE, append = TRUE)

In [None]:
# Process a single Sachs transcript on its own; the result is not assigned
processClanFile('/shared_hd0/corpora/childes_new/Sachs/n30na.cha')

In [14]:
# Extract the tokens of the Sachs corpus
sachs <- processDirectory('/shared_hd0/corpora/childes_new/Sachs')

[1] "Processing 93 filenames"


In [21]:
# Append the Sachs tokens to the "words" table
dbWriteTable(childes_db, name = "words", value = sachs, row.names = FALSE, append = TRUE)

In [None]:
#Need some reliable designation of the child
#break apart the corpus schema

In [None]:
<li>Post</li>
<li>Bates</li>
<li>Demetras</li>
<li>Braunwald</li>
<li>Clark</li>
<li>Davis</li>
<li>Feldman</li>
<li>Inkelas (no morphology?)</li>
<li>MacWhinney</li>
<li>Weist</li>

In [6]:
# Extract the tokens of the Higginson corpus
higginson <- processDirectory('/shared_hd0/corpora/childes_new/Higginson')

[1] "Processing 21 filenames"


In [9]:
# Extract the tokens of the Post corpus
post <- processDirectory('/shared_hd0/corpora/childes_new/Post')

[1] "Processing 30 filenames"


In [13]:
# Extract the tokens of the Bates corpus and show the size of the result
bates <- processDirectory('/shared_hd0/corpora/childes_new/Bates')
dim(bates)

[1] "Processing 117 filenames"


In [14]:
# Extract the tokens of the Demetras1 corpus and show the size of the result
Demetras1 <- processDirectory('/shared_hd0/corpora/childes_new/Demetras1')
dim(Demetras1)

[1] "Processing 26 filenames"


In [15]:
# Extract the tokens of the Demetras2 corpus and show the size of the result
Demetras2 <- processDirectory('/shared_hd0/corpora/childes_new/Demetras2')
dim(Demetras2)

[1] "Processing 26 filenames"


In [26]:
# Extract the tokens of the Braunwald corpus and show the size of the result
Braunwald <- processDirectory('/shared_hd0/corpora/childes_new/Braunwald')
dim(Braunwald)

[1] "Processing 900 filenames"
[1] "Processing file: /shared_hd0/corpora/childes_new/Braunwald/0diary/010301.cha"
[1] "CLANtoR produced dataframe with dimensions: 6 by 18"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Braunwald/0diary/010302.cha"
[1] "CLANtoR produced dataframe with dimensions: 1 by 18"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Braunwald/0diary/010303.cha"
[1] "CLANtoR produced dataframe with dimensions: 13 by 18"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Braunwald/0diary/010304.cha"
[1] "CLANtoR produced dataframe with dimensions: 3 by 18"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Braunwald/0diary/010305.cha"
[1] "CLANtoR produced dataframe with dimensions: 3 by 18"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Braunwald/0diary/010306.cha"
[1] "CLANtoR produced dataframe with dimensions: 3

ERROR: Error in w:closebracket: NA/NaN argument


ERROR: Error in eval(expr, envir, enclos): object 'Braunwald' not found


In [25]:
# Process a single Braunwald transcript on its own
test <- processClanFile('/shared_hd0/corpora/childes_new/Braunwald/2-06-15.cha')

[1] "Processing file: /shared_hd0/corpora/childes_new/Braunwald/2-06-15.cha"
[1] "CLANtoR produced dataframe with dimensions: 176 by 20"
[1] "Processed sentences"
