# Cherry Blossom Data Analysis

In [17]:
# Find the character span of each requested column in a fixed-width header.
#
# Args:
#   col.names:   character vector of patterns to look for in header.row
#                (each is passed to regexpr() as a regular expression).
#   header.row:  the header line to search.
#   search.locs: numeric vector of boundary positions between columns.
#
# Returns:
#   The sapply() result over col.names: for each pattern a pair
#   c(start, stop), where start is one past the last boundary at or before
#   the match and stop is one before the following boundary. A pattern that
#   does not match yields c(NA, NA).
select.cols <- function(col.names, header.row, search.locs) {
    locate.one <- function(label) {
        hit <- regexpr(label, header.row)[[1]]
        if (hit != -1) {
            # Count the boundaries at or before the match; that count is the
            # index of the boundary which opens this column.
            slot <- sum(search.locs <= hit)
            c(search.locs[slot] + 1, search.locs[slot + 1] - 1)
        } else {
            c(NA, NA)
        }
    }

    sapply(col.names, locate.one)
}

In [18]:
# Read one fixed-width results file into a character matrix.
#
# The file must contain a spacer line beginning with "===" whose blanks mark
# the gaps between columns; the line directly above it is taken as the header
# and every line after it as a data row.
#
# Args:
#   file.name: passed to readLines().
#
# Returns:
#   A character matrix with one row per data line and the columns
#   "name", "home", "ag", "gun", "net", "time". A column whose label is not
#   found in the (lower-cased) header is filled with NA.
#
# Stops with an error when the file has no "===" line or no header above it.
read.cherry <- function(file.name) {
    flat.file <- readLines(file.name)
    eq.index <- grep("^===", flat.file)
    if (length(eq.index) == 0) {
        stop("no '===' spacer row found in ", file.name, call. = FALSE)
    }
    # Only the first spacer row delimits the header from the body.
    eq.index <- eq.index[1]
    if (eq.index == 1) {
        stop("no header row above the '===' spacer in ", file.name,
             call. = FALSE)
    }

    spacer.row <- flat.file[eq.index]
    header.row <- tolower(flat.file[eq.index - 1])
    body <- flat.file[-seq_len(eq.index)]

    # Blank positions in the spacer are the column boundaries; gregexpr()
    # reports -1 when there are none, which must not be used as a boundary.
    blank.locs <- gregexpr(" ", spacer.row, fixed = TRUE)[[1]]
    blank.locs <- blank.locs[blank.locs > 0]
    search.locs <- c(0, blank.locs)
    row.length <- nchar(spacer.row)
    if (substr(spacer.row, row.length, row.length) != " ") {
        # select.cols() ends a column one position before the next boundary,
        # so the closing boundary sits one past the end of the spacer;
        # otherwise the last column would lose its final character.
        search.locs <- c(search.locs, row.length + 1)
    }

    short.col.names <- c("name", "home", "ag", "gun", "net", "time")
    loc.cols <- select.cols(short.col.names, header.row, search.locs)

    # Cut every column out of the body and bind explicitly, so the result is
    # a matrix even when the body has zero or one line.
    pieces <- Map(function(first, last) substr(body, first, last),
                  loc.cols[1, ], loc.cols[2, ])
    values <- do.call(cbind, unname(pieces))
    colnames(values) <- short.col.names

    return(values)
}

## Read men's data

In [19]:
# Paths of the men's result files, one per year from 1999 to 2012.
men.files <- paste0("MenTxt/", 1999:2012, ".txt")
men.files

In [21]:
# Parse every men's file; the result is a list with one element per file.
men.data <- lapply(men.files, function(path) read.cherry(path))
length(men.data)

In [25]:
# Convert each element of the men's list to a data frame.
men.data <- lapply(men.data, function(year.tbl) as.data.frame(year.tbl))
class(men.data[[1]])

## Read women's data

In [20]:
# Paths of the women's result files, one per year from 1999 to 2012.
women.files <- paste0("WomenTxt/", 1999:2012, ".txt")
women.files

In [26]:
# Parse every women's file; the result is a list with one element per file.
women.data <- lapply(women.files, function(path) read.cherry(path))
length(women.data)

In [27]:
# Convert each element of the women's list to a data frame.
women.data <- lapply(women.data, function(year.tbl) as.data.frame(year.tbl))
class(women.data[[1]])

# Convert age values to numeric

In [29]:
# Show the class of the age column in the first women's table.
class(women.data[[1]][["ag"]])

In [30]:
# Convert the `ag` column of a data frame to numeric.
#
# Args:
#   x: a data frame (or list) with an `ag` element holding ages as text
#      or as a factor.
#
# Returns:
#   x with `ag` replaced by a numeric vector. Entries that do not parse as
#   numbers become NA (as.numeric() warns when that happens).
convert.age <- function(x) {
    # Go through as.character() first: calling as.numeric() directly on a
    # factor returns its integer level codes rather than the printed values.
    x$ag <- as.numeric(as.character(x$ag))
    return(x)
}

# Run the age conversion over every table in both lists.
men.data <- lapply(men.data, function(tbl) convert.age(tbl))
women.data <- lapply(women.data, function(tbl) convert.age(tbl))

# Show the class of the age column after conversion.
class(women.data[[1]][["ag"]])

# Convert time to numeric