# Cherry Blossom Data Analysis

In [1]:
# Locate the character span of each requested column in a fixed-width table.
#
# Args:
#   col.names:   character vector of patterns (handed to regexpr) to look up
#                in the header row.
#   header.row:  the table's header line, as a single string.
#   search.locs: numeric vector of column-boundary positions; assumed to be
#                in ascending order.
#
# Returns:
#   A numeric matrix with two rows (start position, stop position) and one
#   column per entry of col.names, named after col.names. A column is
#   c(NA, NA) when its pattern does not occur in header.row or occurs before
#   the first boundary.
select.cols <- function(col.names, header.row, search.locs) {
    not.found <- c(NA_real_, NA_real_)

    # vapply (unlike sapply) gives a type-stable numeric result: it never
    # degrades to a list or to a logical matrix depending on the input.
    vapply(col.names, function(name) {
        start.pos <- regexpr(name, header.row)[[1]]
        if (start.pos == -1) {
            return(not.found)
        }

        # Count of boundaries at or before the match = index of the boundary
        # that opens the field containing the match.
        index <- sum(start.pos >= search.locs)
        if (index < 1) {
            # The match precedes every boundary, so no field encloses it.
            return(not.found)
        }

        # The field runs from just after its opening boundary to just before
        # the next boundary (NA when there is no next boundary).
        c(search.locs[index] + 1, search.locs[index + 1] - 1)
    }, FUN.VALUE = numeric(2))
}

In [2]:
# Read one fixed-width results file into a character matrix.
#
# The file is expected to contain a separator line starting with "===" whose
# runs of "=" mark the column widths, with the header line directly above it
# and the data lines below it.
#
# Args:
#   file.name: path of the text file to read.
#
# Returns:
#   A character matrix with one row per line below the separator and the
#   columns "name", "home", "ag", "gun", "net", "time". A column whose name
#   is not found in the header is all NA.
#
# Raises:
#   An error if the file has no "===" separator line or no line above it.
read.cherry <- function(file.name) {
    flat.file <- readLines(file.name)

    eq.index <- grep("^===", flat.file)
    if (length(eq.index) == 0) {
        stop("no '===' separator row found in ", file.name, call. = FALSE)
    }
    # If several lines match, the first one is taken as the separator.
    eq.index <- eq.index[[1]]
    if (eq.index < 2) {
        stop("no header row above the '===' separator in ", file.name,
             call. = FALSE)
    }

    spacer.row <- flat.file[eq.index]
    header.row <- tolower(flat.file[eq.index - 1])
    body <- flat.file[-seq_len(eq.index)]

    # Column boundaries are the blanks in the separator row, plus a virtual
    # boundary at position 0 that opens the first column.
    blank.locs <- gregexpr(" ", spacer.row)[[1]]
    blank.locs <- blank.locs[blank.locs > 0]  # gregexpr returns -1 for "no match"
    search.locs <- c(0, blank.locs)

    row.width <- nchar(spacer.row)
    if (substr(spacer.row, row.width, row.width) != " ") {
        # Close the last column with a virtual boundary one PAST the end of
        # the row: select.cols() uses "boundary - 1" as the stop position, so
        # using row.width itself would drop the column's final character.
        search.locs <- c(search.locs, row.width + 1)
    }

    short.col.names <- c("name", "home", "ag", "gun", "net", "time")
    loc.cols <- select.cols(short.col.names, header.row, search.locs)

    # Cut each column out of every body line. Building the matrix explicitly
    # keeps the result a matrix even for a body of one (or zero) lines.
    columns <- lapply(seq_along(short.col.names), function(i) {
        substr(body, loc.cols[1, i], loc.cols[2, i])
    })
    values <- matrix(unlist(columns),
                     nrow = length(body),
                     ncol = length(short.col.names),
                     dimnames = list(NULL, short.col.names))

    return(values)
}

## Read men's data

In [3]:
# One results file per year, 1999 through 2012, under MenTxt/.
men.files <- paste0("MenTxt/", 1999:2012, ".txt")
men.files

In [4]:
# Parse every men's file; the list has one entry per file.
men.data <- lapply(men.files, function(path) read.cherry(path))
length(men.data)

In [5]:
# Turn each character matrix into a data frame, keeping text columns as
# character. With factor columns (the as.data.frame default before R 4.0.0),
# as.numeric() on a column returns factor level codes instead of the parsed
# values.
men.data <- lapply(men.data, as.data.frame, stringsAsFactors = FALSE)
class(men.data[[1]])

## Read women's data

In [6]:
# One results file per year, 1999 through 2012, under WomenTxt/.
women.files <- paste0("WomenTxt/", 1999:2012, ".txt")
women.files

In [7]:
# Parse every women's file; the list has one entry per file.
women.data <- lapply(women.files, function(path) read.cherry(path))
length(women.data)

In [8]:
# Turn each character matrix into a data frame, keeping text columns as
# character. With factor columns (the as.data.frame default before R 4.0.0),
# as.numeric() on a column returns factor level codes instead of the parsed
# values.
women.data <- lapply(women.data, as.data.frame, stringsAsFactors = FALSE)
class(women.data[[1]])

# Convert age values to numeric

In [9]:
# Inspect the storage class of the age column before converting it.
class(women.data[[1]][["ag"]])

In [10]:
# Convert the "ag" column of a results table to numeric.
#
# Args:
#   x: a data frame (or list) with an "ag" column holding ages as text,
#      factor or numbers.
#
# Returns:
#   x with x$ag replaced by a numeric vector; values that cannot be parsed
#   as numbers become NA.
convert.age <- function(x) {
    ages <- x$ag
    if (is.factor(ages)) {
        # as.numeric() on a factor returns the internal level codes, not the
        # labels, so go through the labels first.
        ages <- as.character(ages)
    }
    x$ag <- as.numeric(ages)
    return(x)
}

# Run the age conversion over every yearly table of both groups.
women.data <- lapply(women.data, function(results) convert.age(results))
men.data <- lapply(men.data, function(results) convert.age(results))

# Check the storage class of the age column after conversion.
class(women.data[[1]][["ag"]])

# Convert time to numeric

In [11]:
# Inspect the storage class of the time column before converting it.
class(women.data[[1]][["time"]])

In [12]:
# Convert the "gun", "net" and "time" columns of a results table to numbers.
#
# Each value is split on ":" and combined as follows:
#   one field    a       -> a / 60
#   two fields   a:b     -> a + b / 60
#   three fields a:b:c   -> a * 60 + b + c / 60
# i.e. reading the fields as [hours:]minutes:seconds, the result is minutes.
# NA input, an empty string, more than three fields, or a field that is not
# a number all give NA (as.numeric may emit a coercion warning for the last).
#
# Args:
#   x: a data frame with "gun", "net" and "time" columns (text or factor).
#
# Returns:
#   x with those three columns replaced by numeric vectors.
convert.time <- function(x) {
    time.split <- function(t) {
        if (is.na(t)) {
            return(NA_real_)
        }

        split.time <- as.numeric(unlist(strsplit(t, split = ":")))
        n.fields <- length(split.time)
        if (n.fields == 1) {
            split.time[[1]] / 60
        } else if (n.fields == 2) {
            split.time[[1]] + split.time[[2]] / 60
        } else if (n.fields == 3) {
            split.time[[1]] * 60 + split.time[[2]] + split.time[[3]] / 60
        } else {
            # Zero fields (empty string) or more than three: not a time.
            # Returning a value here keeps the column a numeric vector
            # instead of letting it turn into a list.
            NA_real_
        }
    }

    # vapply guarantees a numeric vector even for empty or all-NA columns.
    to.minutes <- function(column) {
        vapply(as.character(column), time.split, FUN.VALUE = numeric(1))
    }

    x$gun <- to.minutes(x$gun)
    x$net <- to.minutes(x$net)
    x$time <- to.minutes(x$time)

    return(x)
}

In [13]:
# Convert the time columns of every yearly table, muting any warnings raised
# while converting.
men.data <- suppressWarnings(lapply(men.data, convert.time))
women.data <- suppressWarnings(lapply(women.data, convert.time))

# Check the storage class of the time column after conversion.
class(women.data[[1]]$time)

# Show data

In [14]:
# Preview the first six rows of the first men's table.
head(men.data[[1]], n = 6L)

name,home,ag,gun,net,time
Worku Bikila,Ethiopia,19,,,46.98333
Lazarus Nyakeraka,Kenya,15,,,47.01667
James Kariuki,Kenya,18,,,47.05
William Kiptum,Kenya,19,,,47.11667
Joseph Kimani,Kenya,17,,,47.51667
Josphat Machuka,Kenya,16,,,47.55
