In [62]:
library(tidyverse)
library(glmnet)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.4
✔ tidyr   0.8.0     ✔ stringr 1.3.0
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: Matrix

Attaching package: ‘Matrix’

The following object is masked from ‘package:tidyr’:

    expand

Loading required package: foreach

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when

Loaded glmnet 2.0-13



In [12]:
# Restore the saved R objects for both GEO series into the workspace
# (presumably the gse40279.* and gse41169.* data/meta objects used below -- confirm)
load("data/GSE40279_r2.Rdata")
load("data/GSE41169_r2.Rdata")

In [52]:
# Age is stored in a `characteristics_ch1*` column of each metadata data frame.
# Splitting an entry on single spaces puts the age at token 3 for GSE40279
# and at token 2 for GSE41169.
#
# vapply() guarantees a plain numeric vector (numeric(0) for empty input,
# where sapply() would return a list), and USE.NAMES = FALSE leaves the
# result unnamed so no separate name-stripping step is needed.
train.age <- vapply(
    strsplit(as.character(gse40279.meta$characteristics_ch1), " "),
    function(tokens) as.numeric(tokens[3]),
    numeric(1),
    USE.NAMES = FALSE
)
test.age <- vapply(
    strsplit(as.character(gse41169.meta$characteristics_ch1.6), " "),
    function(tokens) as.numeric(tokens[2]),
    numeric(1),
    USE.NAMES = FALSE
)

In [54]:
# Probe identifiers are the row names of each dataset
train.probes <- rownames(gse40279.data)
test.probes <- rownames(gse41169.data)

# Probes that are present in both datasets
common.probes <- intersect(test.probes, train.probes)

# Restrict both datasets to the shared probes, in the same row order
train.common <- gse40279.data[common.probes, ]
test.common <- gse41169.data[common.probes, ]

In [91]:
# Number of probes shared by the training and test datasets
# (auto-printed as the cell result)
length(common.probes)

In [64]:
# Summarize the ages in the training dataset:
# quartiles and mean via summary(), then the mean, the standard deviation,
# and the number of samples, each auto-printed as a cell result
summary(train.age)
mean(train.age)
sd(train.age)
length(train.age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  19.00   54.00   65.00   64.04   75.00  101.00 

In [65]:
# Summarize the ages in the test dataset:
# quartiles and mean via summary(), then the mean, the standard deviation,
# and the number of samples, each auto-printed as a cell result
summary(test.age)
mean(test.age)
sd(test.age)
length(test.age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  18.00   24.50   29.00   31.57   36.00   65.00 

In [72]:
# Impute missing values with the mean of the observed values.
#
# probeData: numeric vector (e.g. the measurements of a single probe).
# Returns `probeData` with every NA replaced by the mean of its non-NA
# entries; vectors without NAs come back unchanged. If every entry is NA
# the mean of nothing is NaN, so the NAs become NaN.
imputeData  <- function(probeData) {
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned,
    # which would silently turn NA removal off.
    probeData[is.na(probeData)]  <- mean(probeData, na.rm = TRUE)
    probeData
}

In [90]:
# Impute each probe (row) on its own. apply() over rows returns the per-row
# results as columns, so transpose to get probes back on the rows.
train.common.imputed <- t(apply(X = train.common, MARGIN = 1, FUN = imputeData))
test.common.imputed <- t(apply(X = test.common, MARGIN = 1, FUN = imputeData))

In [94]:
# Snapshot the whole workspace to the default `.RData` file in the working directory
save.image(file = ".RData")