In [16]:
options(stringsAsFactors = FALSE)
source("CLANtoR.R")
source("config.R")
library('parallel','plyr','tools')
wordsToExclude = c("hmm", "hm", "mm", "uh", "uhh", "ah", "um","=laughs", "uhhuh","eh","xxx","yyy", "xx", 'yy','aw', 'www','er','ka')
noUtt = c("0")
metadataRows = c('sentGloss','sentMor','Speaker','xgr','pho','act','gpx','sit','com', 'par','Filename','Participants',
                 'Date',"Language","Corpus","Age","Gender","Utt.Number","index","add","alt","int","spa","err","eng")

In [18]:
sentenceHandler = function(row){
    #!!! the problem is that there are many sentences where the word nodes do not have all of the metadata, 
    #leading to mismatches in the length of the arrays    
    
    glosses = cleanGloss(row$Gloss)    
    mors = cleanMOR(row$mor)       
    
    #handle any mismatches in length:
    if (length(glosses) > length(mors)){
        # handle a mismatch in the length of the two cleaned vectors    
        
        #Why not use the gloss to index into the mors, and find the corresponding term for each?
        #b/c there's aproblem with indexing in from the gloss is that there are complex terms 
        #like don't == aux|do~neg|not        
        
        newMor = mat.or.vec(length(mors),1)
        offset = 0 #this is the difference between the index in glosses and the index in mor
        
        for (i in 1:length(glosses)){
            if(glosses[i] %in% wordsToExclude){
                newMor[i] = 'EXC'       
                offset = offset + 1
            } else if (glosses[i] %in% noUtt){
                newMor[i] = 'NO_UTT'
                offset = offset + 1
            } else {
                newMor[i] = mors[i-offset]
            }            
        }
        
        if(length(glosses) != length(newMor)){
            print('Glosses: ')
            print(glosses)
            print('Original Mors:')
            print(mors)
            print('Corrected Mors:')
            print(newMor)
            stop('Recovery process for longer gloss failed')
        } else{
            mors = newMor            
        }
    } else if (length(glosses) < length(mors)){         
        print('Glosses: ')
        print(glosses)
        print('Mors:')
        print(mors)
        print('Row')
        print(row)
        stop('No recovery process for longer MOR line')
    }
    
    splitGlosses = strsplit(glosses,'@') 
    glosses = sapply(splitGlosses, function(x){x[1]})
    atTags = sapply(splitGlosses, function(x){ ifelse(length(x) > 1,x[2],'NA')})    
    
    rd = data.frame(Gloss = glosses, mor = mors, atTags)
    if (nrow(rd) > 0){                        
        row$sentGloss = paste(glosses, collapse = ' ')
        row$sentMor = paste(mors, collapse= ' ')
        if('xgr' %in% names(row)){
            row$xgr = gsub('\\t','',row$xgr)            
        }
        selectRows = names(row)[names(row) %in% metadataRows] #only select those from the desired columns that are present
        return(cbind(rd, row[,selectRows], row.names = NULL)) #returns df, number of words * columns
    } else {
        print(row)
        stop('Zero-length return data')
    }  
}

cleanGloss = function(gloss){
    #print('cleaning gloss')
    originalGloss = gloss
    gloss = gsub('<.*?>','',gloss) #remove reformulations from the gloss
    gloss = gsub("[^[:alnum:][:space:]'_@+]", '', gloss)  #remove non-apostrophe punctuation  
    gloss = gsub('\342\200\234','\342\200\234 ', gloss)#preceding quotes
    gloss = gsub('\342\200\235',' \342\200\235', gloss)#following quotes
    unlistedwords = unlist(strsplit(gloss, split = " "))
    unlistedwords = unlistedwords[!(unlistedwords ==  '')]
    unlistedwords = gsub("\\n|\\t", " ", unlistedwords)           
    unlistedwords = unlistedwords[sapply(gsub('[[:punct:]]','', unlistedwords), nchar) > 0] #remove puntucation-only words
    #is the last items a number with > 7 digits? this is a tag, don't return it
    if (length(unlistedwords) == 0){
        print(originalGloss)
    }
    
    if(nchar(tail(unlistedwords,1)) >= 5 & nchar(gsub('[[:digit:]_+] *','',tail(unlistedwords,1))) == 0){
        return(unlistedwords[1:length(unlistedwords)-1]) 
    } else {
        return(unlistedwords)
    }
}

cleanMOR = function(mor){
    #print('cleaning mor')
    onesplit = gsub("cm\\|cm|none\\|cm", "", unlist(strsplit(mor, " ")))
    onesplit = onesplit[grepl("\\|", onesplit)]
    onesplit = gsub("\\n|\\t", " ", onesplit)
    onesplit = unlist(strsplit(onesplit, split = " "))
    onesplit = gsub("[!,?//.]", "", onesplit) #!!! think we probably want to keep this information around
    onesplit = onesplit[!(onesplit %in%  c("","bq|bq","eq|eq"))]        
    return(onesplit[sapply(gsub('[[:punct:]]','', onesplit), nchar) > 0])
}

processClanFile = function(filename){
    print(paste('Processing file:', filename))
    df = read.CLAN.file(filename)
    if (ncol(df) > 35){ #!!! lower this number if possible
        print(names(df))
        stop(paste(filename, 'has an invalid structure: too many columns found'))
    }
    print(paste('CLANtoR produced dataframe with dimensions:',dim(df)[1], 'by', dim(df)[2]))
    
    processedSentenceList = lapply(1:nrow(df), function(i){sentenceHandler(df[i,])})
    print('Processed sentences')
    
    allTokens = do.call('rbind.fill', processedSentenceList)            
    return(allTokens)
}


processDirectory = function(dirname){    
    fnames = paste(dirname, list.files(dirname, recursive=T, pattern = "\\.cha$"), sep='/')
    print(paste('Processing', length(fnames), 'filenames'))
    
    #!!! multicore this 
    allFiles = do.call('rbind.fill', lapply(fnames, processClanFile))
    #allFiles = do.call('rbind.fill', mclapply(fnames, processClanFile, mc.cores=detectCores()))
    names(allFiles) = tolower(names(allFiles))
    allFiles$age = sapply(allFiles$age, ageToDays)
    
    return(allFiles)
}

ageToDays = function(age){
    ageParts = strsplit(age, ';')[[1]]
    return(ceiling((12*30.5*as.numeric(ageParts[1])) + as.numeric(ageParts[2])*30.5))	
}

In [19]:
#to start it, point it at a specific dictionary
bloom70 = processDirectory('/shared_hd0/corpora/childes_new/Bloom70') 

[1] "Processing 28 filenames"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Eric/eric1.cha"
[1] "CLANtoR produced dataframe with dimensions: 1889 by 27"
[1] "<what is that> "


ERROR: Error in if (nchar(tail(unlistedwords, 1)) >= 5 & nchar(gsub("[[:digit:]_+] *", : argument is of length zero


In [14]:
#connect R to mysql
library('RMySQL')
childes_db = dbConnect(MySQL(), user=config[['username']], password=config[['password']], dbname=config[['dbasename']], host=config[['host']])

Loading required package: DBI


In [None]:
#write the dataframe to the remote

dbWriteTable(childes_db, bloom70, name = "words", row.names = F, overwrite=T)  

In [11]:
#to start it, point it at a specific dictionary
suppes = processDirectory('/shared_hd0/corpora/childes_new/Suppes') 

[1] "Processing 52 filenames"


In [None]:
dbWriteTable(childes_db, suppes, name = "words", row.names = F, append=T)  

In [12]:
providence = processDirectory('/shared_hd0/corpora/childes_new/Providence') 

[1] "Processing 364 filenames"


In [15]:
dbWriteTable(childes_db, providence, name = "words", row.names = F, append=T)  

In [None]:
#Need some reliable desgination of the child
#break apart the corpus schema