# Cardiomyopathy - Gene expression data pre-processing (RNAseq)

In [1]:
# check working directory: the later cells build their input and output
# paths with file.path(getwd(), "Data", ...), so this should be the folder
# that contains the "Data" directory
getwd()

In [2]:
# load libraries
library(readxl)
library(edgeR)
library(data.table)
library(biomaRt)
library(tidyr)

"package 'edgeR' was built under R version 3.5.2"Loading required package: limma
"package 'tidyr' was built under R version 3.5.2"

In [3]:
# Read the GSE55296 expression table from the "Data" folder under the working
# directory and index the rows by Ensembl gene ID.
count_file <- file.path(getwd(), "Data", "GSE55296_count_data.xlsx")
data <- as.data.frame(read_excel(count_file))
rownames(data) <- data[["ENSG_ID"]]

# Force Control9 to numeric; entries that cannot be parsed become NA (this is
# the source of the "NAs introduced by coercion" warning in the cell output).
data[["Control9"]] <- as.numeric(data[["Control9"]])

head(data)
dim(data)

"NAs introduced by coercion"

Unnamed: 0,ENSG_ID,hgnc_symbol,Control1,Control2,Control3,Control4,Control5,Control6,Control7,Control8,...,ICM4,ICM5,ICM6,ICM7,ICM8,ICM9,ICM10,ICM11,ICM12,ICM13
ENSG00000000003,ENSG00000000003,TSPAN6,26.06,81.14,67.72,41.08,43.56,29.27,29.69,36.98,...,36.03,24.71,29.87,56.66,50.37,39.91,31.87,32.95,56.86,15.3
ENSG00000000005,ENSG00000000005,TNMD,0.0,0.0,4.62,0.0,1.53,0.0,0.0,1.04,...,3.28,0.0,0.88,10.79,20.64,3.24,3.86,0.0,0.0,2.04
ENSG00000000419,ENSG00000000419,DPM1,56.46,205.79,311.68,205.41,159.71,132.83,137.83,160.42,...,173.61,158.15,140.56,151.1,261.32,197.38,173.86,162.53,152.81,156.03
ENSG00000000457,ENSG00000000457,SCYL3,95.54,23.52,62.34,33.38,64.19,92.31,38.17,41.67,...,57.32,51.89,79.06,87.69,130.87,57.17,42.5,83.46,72.85,54.05
ENSG00000000460,ENSG00000000460,C1orf112,8.69,18.81,14.62,38.52,15.28,27.02,16.96,17.71,...,32.76,22.24,24.6,25.63,10.32,12.94,22.22,24.16,7.11,10.2
ENSG00000000938,ENSG00000000938,FGR,73.83,52.92,15.39,17.97,27.51,67.54,29.69,43.75,...,8.19,58.07,21.96,18.89,44.17,28.04,6.76,46.12,55.08,29.57


In [4]:
# Filter lowly expressed genes: keep a gene when its values summed over the
# sample columns (columns 3 to 38) exceed 50.
keep <- rowSums(data[, 3:38]) > 50
table(keep)

# A gene with a missing sample value has an NA row sum, so its `keep` entry is
# NA and the logical subset returns an all-NA row for it. drop_na() then
# removes those rows, along with any other row that contains an NA in any
# column.
data <- drop_na(data[keep, ])

head(data)
dim(data)

keep
FALSE  TRUE 
 4859 15354 

Unnamed: 0,ENSG_ID,hgnc_symbol,Control1,Control2,Control3,Control4,Control5,Control6,Control7,Control8,...,ICM4,ICM5,ICM6,ICM7,ICM8,ICM9,ICM10,ICM11,ICM12,ICM13
ENSG00000000003,ENSG00000000003,TSPAN6,26.06,81.14,67.72,41.08,43.56,29.27,29.69,36.98,...,36.03,24.71,29.87,56.66,50.37,39.91,31.87,32.95,56.86,15.3
ENSG00000000005,ENSG00000000005,TNMD,0.0,0.0,4.62,0.0,1.53,0.0,0.0,1.04,...,3.28,0.0,0.88,10.79,20.64,3.24,3.86,0.0,0.0,2.04
ENSG00000000419,ENSG00000000419,DPM1,56.46,205.79,311.68,205.41,159.71,132.83,137.83,160.42,...,173.61,158.15,140.56,151.1,261.32,197.38,173.86,162.53,152.81,156.03
ENSG00000000457,ENSG00000000457,SCYL3,95.54,23.52,62.34,33.38,64.19,92.31,38.17,41.67,...,57.32,51.89,79.06,87.69,130.87,57.17,42.5,83.46,72.85,54.05
ENSG00000000460,ENSG00000000460,C1orf112,8.69,18.81,14.62,38.52,15.28,27.02,16.96,17.71,...,32.76,22.24,24.6,25.63,10.32,12.94,22.22,24.16,7.11,10.2
ENSG00000000938,ENSG00000000938,FGR,73.83,52.92,15.39,17.97,27.51,67.54,29.69,43.75,...,8.19,58.07,21.96,18.89,44.17,28.04,6.76,46.12,55.08,29.57


In [5]:
# run EdgeR: two-group quasi-likelihood F-test on the samples in columns 3:25.
#
# The group label of each sample is derived from its column name instead of
# being typed out by position. In the loaded table the "Control*" columns come
# first, so the previous hard-coded labels (13 x "case" followed by
# 10 x "control") were assigned to the wrong samples.
# NOTE(review): assumes every non-"Control" column in 3:25 is a case (DCM)
# sample - confirm against the column names of the input file.
# NOTE(review): the table preview shows fractional values, whereas edgeR
# models counts - confirm that the input holds raw counts.
expr <- data[, 3:25]
is_control <- grepl("^Control", colnames(expr))
stopifnot(any(is_control), any(!is_control))

# "control" is set explicitly as the reference level so that coefficient 2 of
# the design matrix is the case-vs-control effect (positive logFC = higher in
# cases). With default alphabetical levels "case" would be the reference.
group <- factor(
  ifelse(is_control, "control", "case"),
  levels = c("control", "case")
)

y <- DGEList(counts = expr, group = group)
y <- calcNormFactors(y)
design <- model.matrix(~group)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design)
qlf <- glmQLFTest(fit, coef = 2)

# full result table (all genes); row names are the Ensembl gene IDs
res <- as.data.frame(topTags(qlf, n = Inf))

In [6]:
# Convert `res` to a data.table by reference, moving its row names into a
# leading "GeneID" column.
setDT(res, keep.rownames = "GeneID")

In [None]:
# Add HGNC symbols: look up the symbol for every Ensembl gene ID in `res`
# through the Ensembl BioMart human gene dataset (requires network access).
mart <- useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
genes <- res$GeneID
G_list <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = genes,
  mart = mart
)

# merge() keeps only matching keys by default, so genes that BioMart does not
# return are dropped from `res` here.
res <- merge(
  res,
  G_list,
  by.x = "GeneID",
  by.y = "ensembl_gene_id"
)

In [11]:
# save result for DCM as a tab-separated file with a header row and without
# row names or quoting
stats_dir <- file.path(getwd(), "Data", "Stats")
# create the output folder when it is missing so the write does not fail on a
# fresh checkout (no effect if it already exists)
dir.create(stats_dir, recursive = TRUE, showWarnings = FALSE)
write.table(
  res,
  file.path(stats_dir, "DCM_stats.txt"),
  quote = FALSE,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)

In [2]:
# information about the session: print the R version, platform, locale and
# the versions of attached/loaded packages used for this analysis
sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 17134)

Matrix products: default

locale:
[1] LC_COLLATE=Dutch_Netherlands.1252  LC_CTYPE=Dutch_Netherlands.1252   
[3] LC_MONETARY=Dutch_Netherlands.1252 LC_NUMERIC=C                      
[5] LC_TIME=Dutch_Netherlands.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] tidyr_0.8.2          dplyr_0.7.8          biomaRt_2.38.0      
[4] data.table_1.12.0    edgeR_3.24.3         limma_3.38.3        
[7] readxl_1.2.0         RevoUtils_11.0.1     RevoUtilsMath_11.0.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.0           pillar_1.3.1         bindr_0.1.1         
 [4] cellranger_1.1.0     compiler_3.5.1       prettyunits_1.0.2   
 [7] progress_1.2.0       bitops_1.0-6         base64enc_0.1-3     
[10] tools_3.5.1          digest_0.6.18        uuid_0.1-2          
[13] bit_1.1-14      