# Prepare input data for QTL analyses
Before running the QTL analysis, create all the input files it requires:
- metadata
- genotypes (SNVs + SVs)
- covariates by subject
- covariates by sample


In [1]:
# Run the notebook from the project root: every relative path used below
# ("input/...", "analysis/...", "private/...") is resolved against this directory.
setwd("/frazer01/projects/GTEx_v7/analysis/eqtls_deconvolution")

# Create input folders

In [2]:
# Create the project directory tree, relative to the working directory.
# Parents are listed before their children because dir.create() is called
# without recursive = TRUE.
invisible(local({
    project_dirs = c("input", "analysis", "qtls", "private", "log",
                     file.path("input"           , c("metadata", "genotypes", "phenotypes", "covariates")),
                     file.path("input/genotypes" , "rna"),
                     file.path("input/phenotypes", c("liver", "skin")))

    # showWarnings = FALSE keeps re-runs quiet when a directory already exists...
    for (project_dir in project_dirs) {
        dir.create(project_dir, showWarnings = FALSE)
    }

    # ...but it also hides genuine failures (e.g. missing permissions), so check
    # the outcome explicitly instead of failing later on the first write.
    missing_dirs = project_dirs[!dir.exists(project_dirs)]
    if (length(missing_dirs) > 0) {
        stop("Could not create directories: ", paste(missing_dirs, collapse = ", "), call. = FALSE)
    }
}))


# Create links to notebooks and scripts

In [3]:
# Hard-link the notebook and its helper scripts into analysis/ so that the
# project folder carries the code used to produce its files.
invisible(local({
    notebook_dir = "/home/matteo/notebooks/eqtls_deconvolution_gtex"
    linked_files = c("prepare_input_data.ipynb",
                     "cardiac_qtls_packages.R",
                     "cardiac_qtls_input_files.R",
                     "cardiac_qtls_functions.R",
                     "cardiac_qtls_load_metadata.R",
                     "cardiac_qtls_input_data.R",
                     "cardiac_qtls_run_eqtls.R",
                     "cardiac_qtls_run_eqtls.sh",
                     "permute_covariates.r",
                     "permute_covariates.sh")
    link_targets = file.path("analysis", linked_files)

    # file.link() is vectorised over its from/to arguments. Warnings stay
    # suppressed as in the original cell (presumably so that re-runs, where the
    # links already exist, remain quiet).
    suppressWarnings(file.link(file.path(notebook_dir, linked_files), link_targets))

    # Suppressing warnings also hides real failures, so report every file that
    # is still not available after the linking attempt.
    absent = link_targets[!file.exists(link_targets)]
    if (length(absent) > 0) {
        warning("Not available after linking: ", paste(absent, collapse = ", "), call. = FALSE)
    }
}))


In [4]:
# Source the shared helper scripts from analysis/, in this order: packages,
# input file paths, functions, input data. source() evaluates each script in
# the global environment, so whatever they define is available to later cells.
invisible(lapply(
    c("cardiac_qtls_packages.R",
      "cardiac_qtls_input_files.R",
      "cardiac_qtls_functions.R",
      "cardiac_qtls_input_data.R"),
    function(script_name) {
        source(file.path("analysis", script_name))
    }
))


Loading packages...
Loading input files...
Loading functions...
Loading input data...


# Metadata files
- get subject-WGS-sample associations


## GTEx
### WGS
- IDs in the WGS VCF header are subject IDs

In [5]:
# Read the WGS sample IDs from the VCF column-header line. The IDs are used
# both as subject_id and as wgs_id.
gtex_vcf             = gtex_vcf_input
gtex_wgs             = readLines(gtex_vcf, n = 1000)

# Select the column-header line explicitly ("#CHROM<TAB>POS...") instead of taking
# the last line that starts with "#": if the header were longer than the 1000
# lines read above, the last "#" line would be a "##" meta line and the sample
# list would be silently wrong.
gtex_wgs             = gtex_wgs[grepl("^#CHROM\t", gtex_wgs, perl = TRUE)]
if (length(gtex_wgs) != 1) {
    stop(sprintf("Expected exactly one '#CHROM' header line in the first 1000 lines of %s, found %d", gtex_vcf, length(gtex_wgs)), call. = FALSE)
}

# Strip only the leading "#" and split the header into its tab-separated fields.
gtex_wgs             = unlist(strsplit(sub("^#", "", gtex_wgs), "\t"))

# Sample columns start at field 10; guard against 10:length() counting
# backwards when the header has no sample columns.
if (length(gtex_wgs) < 10) {
    stop(sprintf("No sample columns found in the header of %s", gtex_vcf), call. = FALSE)
}
gtex_wgs_sample_list = gtex_wgs[10:length(gtex_wgs)]
gtex_meta_wgs        = data.frame(subject_id = gtex_wgs_sample_list, wgs_id = gtex_wgs_sample_list)

message(paste("WGS IDs", nrow(gtex_meta_wgs), sep = " = "))

WGS IDs = 635


### RNA

In [6]:
# Load the GTEx v7 gene-level TPM table as a plain data.frame.
tpm_all = fread(
    "/publicdata/gtex_v7/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct",
    header     = TRUE,
    sep        = "\t",
    data.table = FALSE
)

# Use the "Name" column as row names, then drop both annotation columns so that
# only the remaining columns (matched against sample IDs further down) are left.
rownames(tpm_all)                 = tpm_all[["Name"]]
tpm_all[c("Name", "Description")] = list(NULL)

In [7]:
# Build the RNA-seq sample table (rna_id, tissue, subject_id) for the skin and
# liver samples that have expression data and whose subject has WGS data.
#
# stringsAsFactors = FALSE is set explicitly: the code below splits the sample
# IDs with strsplit() and later cells use rna_id to select columns by name, both
# of which need character vectors regardless of the session's default.
gtex_tissues          = data.frame(tissue_full = c("Skin - Not Sun Exposed (Suprapubic)", "Skin - Sun Exposed (Lower leg)", "Liver"), tissue = c("skin", "skin", "liver"), stringsAsFactors = FALSE)

# NOTE(review): read.table() keeps its default quote characters here; if the
# annotation file contains free-text fields with apostrophes, rows could be
# mis-parsed. Confirm the row count against the file, or consider quote = "".
gtex_meta             = read.table("/frazer01/publicdata/gtex_v7/GTEx_v7_Annotations_SampleAttributesDS_V2.txt", header = TRUE, sep = "\t", comment.char = "", stringsAsFactors = FALSE)

# Keep samples present in the TPM table and map the full tissue name (SMTSD) to
# the short label. merge() puts the key column first, hence the renaming order.
gtex_meta             = merge(gtex_meta[gtex_meta$SAMPID %in% colnames(tpm_all), c("SAMPID", "SMTSD")], gtex_tissues, by.x = "SMTSD", by.y = "tissue_full")
colnames(gtex_meta)   = c("tissue_full", "rna_id", "tissue")
gtex_meta$tissue_full = NULL

# The subject ID is made of the first two "-"-separated fields of the sample ID.
gtex_meta$subject_id  = vapply(strsplit(gtex_meta$rna_id, "-", fixed = TRUE), function(x){paste(x[1:2], collapse = "-")}, character(1))

# Keep only samples whose subject is present in the WGS VCF.
gtex_meta             = gtex_meta[gtex_meta$subject_id %in% gtex_meta_wgs$subject_id, ]

In [8]:
# Keep the RNA sample table under its own name (gtex_meta is reused for the
# subject phenotypes in the next cell), restrict the WGS table to subjects that
# have at least one retained RNA sample, and subset the TPM table to those samples.
gtex_meta_rna = gtex_meta
gtex_meta_wgs = gtex_meta_wgs[is.element(gtex_meta_wgs$subject_id, gtex_meta_rna$subject_id), ]
tpm           = tpm_all[, gtex_meta_rna$rna_id]

### Subject metadata

In [9]:
# Load the subject phenotype table and keep the columns used as covariates.
gtex_meta_file      = "/frazer01/projects/GTEx_v7/decrypted/PhenotypeFiles/phs000424.v7.pht002742.v7.p2.c1.GTEx_Subject_Phenotypes.GRU.txt.gz"
# skip = 10: the first ten lines of the file are skipped before the header row is read.
gtex_meta           = fread(gtex_meta_file, header = TRUE, sep = "\t", blank.lines.skip = TRUE, skip = 10, data.table = FALSE)
# "AGE" is selected twice on purpose: the same value fills both "age" and "age_sample".
gtex_meta           = gtex_meta[,c("SUBJID", "SEX", "AGE", "AGE", "HGHT", "WGHT")]
colnames(gtex_meta) = c("subject_id", "sex", "age", "age_sample", "height", "weight")
gtex_meta$study     = "GTEx"

# Recode sex (1 -> "M", 2 -> "F"). Convert to character first rather than relying
# on the first assignment to coerce the column, and use %in% so that a missing
# value stays NA instead of raising an error in the subscripted assignment.
gtex_meta$sex                         = as.character(gtex_meta$sex)
gtex_meta$sex[gtex_meta$sex %in% "1"] = "M"
gtex_meta$sex[gtex_meta$sex %in% "2"] = "F"

message(paste("Subjects", nrow(gtex_meta), sep = " = "))


Subjects = 752


In [10]:
# Subjects that have both WGS and RNA data (intersect() already returns unique values).
subject_ids        = sort(intersect(gtex_meta_wgs$subject_id, gtex_meta_rna$subject_id))

# Every retained subject is written to the "unrelated" list.
unrelated          = subject_ids

# Restrict each table to the shared subjects.
covariates_subject = gtex_meta    [is.element(gtex_meta$subject_id    , subject_ids), ]
meta_rna           = gtex_meta_rna[is.element(gtex_meta_rna$subject_id, subject_ids), ]
meta_wgs           = gtex_meta_wgs[is.element(gtex_meta_wgs$subject_id, subject_ids), ]

# One row per RNA sample, joined to its WGS record on the shared column(s).
metadata           = merge(meta_wgs, meta_rna)

# Report the table sizes.
invisible(Map(
    function(label, n_rows) {
        message(paste(label, n_rows, sep = " = "))
    },
    c("RNA samples", "WGS samples", "Subjects"),
    c(nrow(meta_rna), nrow(meta_wgs), nrow(covariates_subject))
))

# Number of RNA samples per tissue.
table(meta_rna[["tissue"]])


RNA samples = 902
WGS samples = 525
Subjects = 525



liver  skin 
  153   749 

In [11]:
# Save the TPM table as tab-separated text. With row.names = TRUE,
# col.names = NA writes an empty header field above the row-name column so
# that the header lines up with the data columns.
write.table(
    x         = tpm,
    file      = "input/phenotypes/tpm.txt",
    sep       = "\t",
    quote     = FALSE,
    row.names = TRUE,
    col.names = NA
)

# Prepare covariates files:
- PCA based on ~100,000 SNPs in linkage equilibrium (from resource paper)
- kinship matrix on the same SNPs
- RNA data: CIBERSORT populations


## Genotype PCA

In [12]:
## Build the genotype matrix used for the genotype PCA.
## The filter_vcf() step below is very slow, so it is commented out and the file
## it is expected to have produced is referenced directly. Uncomment to re-run it.
# SNP list passed to filter_vcf(); only used by the commented-out call.
snp_list_file        = "snp_list.txt"
#filter_vcf_gtex_file = filter_vcf(meta_wgs[meta_wgs$wgs_id %in% gtex_meta_wgs$wgs_id   , "wgs_id"], bcftools, snp_list_file, gtex_vcf_input   , "gtex"   )
filter_vcf_gtex_file = 'private/gttable.gtex.vcf.gz'
# find_genotype_matrix() is defined in the sourced helper scripts.
# NOTE(review): the next cell transposes the result before PCA, so it presumably
# has variants in rows and WGS samples in columns. Confirm against the helper.
gtmatrix             = find_genotype_matrix(meta_wgs$wgs_id, bcftools, filter_vcf_gtex_file)


In [13]:
# PCA on the transposed genotype matrix, with each column scaled to unit
# variance. The argument is spelled "scale." as prcomp() defines it; the
# previous "scale = TRUE" only worked through partial argument matching.
pcadata                    = prcomp(t(gtmatrix), scale. = TRUE)

# Fraction of the total variance explained by each principal component.
percentVar                 = pcadata$sdev^2 / sum( pcadata$sdev^2 )

In [14]:
# Take the first ten principal components; the PCA row names serve as WGS IDs.
genotype_pca                   = as.data.frame(pcadata$x[, paste0("PC", 1:10), drop = FALSE])
genotype_pca[["wgs_id"]]       = rownames(genotype_pca)

# Attach the subject ID through the WGS table (merge() puts the shared key
# column first), then add the components to the subject covariates.
genotype_pca                   = merge(genotype_pca, meta_wgs)
covariates_subject             = merge(covariates_subject, genotype_pca)

# The WGS ID was only needed as a join key.
covariates_subject[["wgs_id"]] = NULL

## RNA data: CIBERSORT
- run CIBERSORT on all samples
- For now, use the estimates from the Production paper, although they cover fewer samples (reason not yet determined)

In [20]:
# Load the CIBERSORT output tables and give every table an "rna_id" column
# followed by cell-population columns.

# Liver table 1: the first six columns, renamed.
liver  = setNames(
    read.csv("/frazer01/home/mdonovan/gtex_deconvolution/tables/Cibersort_out/CIBERSORT.Output_Job78_liver_mouse.csv", header = TRUE)[, 1:6],
    c("rna_id", "b_cell", "endothelial", "hepatocyte", "kupffer_cell", "nk_cell")
)

# Skin table: the first seven columns, renamed.
skin   = setNames(
    read.csv("/frazer01/home/mdonovan/gtex_deconvolution/tables/Cibersort_out/CIBERSORT_reannotated_skin_mouse.csv", header = TRUE)[, 1:7],
    c("rna_id", "outer_bulge","inner_bulge","epidermis_basal","epidermis","epidermis_stem_cell","leukocyte")
)

# Liver table 2: a named subset of the first sixteen columns, renamed.
liver2 = setNames(
    read.csv("/frazer01/home/mdonovan/gtex_deconvolution/tables/Cibersort_out/CIBERSORT.Output_Job79_liver_human.csv", header = TRUE)[, 1:16][, c("Input.Sample", "central_venous_sinusoidal_endothelial_cells", "gdT_cell", "Hepatocytes0", "Hepatocytes3", "Hepatocytes4", "inflammatory_macrophages", "NK.NKT_cell", "Periportal_sinusoidal_endothelial_cells")],
    c("rna_id", "endothelial_venous", "gdt", "hepatocyte0", "hepatocyte3", "hepatocyte4", "macrophage", "nkt_cell", "endothelial_periportal")
)

# Liver table 3 (space-separated): the first column becomes "rna_id"; all names
# are lower-cased and their dots replaced by underscores.
liver3 = read.csv("/frazer01/home/mdonovan/gtex_deconvolution/tables/merged_human_liver_cibersort.txt", header = TRUE, sep = " ")
colnames(liver3) = gsub("\\.", "_", tolower(c("rna_id", colnames(liver3)[-1])))

# Collapsed skin populations: leukocyte and inner_bulge are carried over under
# new names; the four remaining populations are summed into one column.
skin2                        = skin[, c("rna_id", "leukocyte", "inner_bulge")]
colnames(skin2)              = c("rna_id", "collapsed_leukocyte", "collapsed_fibroblast")
skin2$collapsed_keratinocyte = rowSums(skin[, c("outer_bulge", "epidermis_basal", "epidermis", "epidermis_stem_cell")])

# Full outer join of all tables, left to right. merge() joins on every column
# name the two tables share.
# NOTE(review): this is presumably only "rna_id"; the column names of liver3 are
# not visible here. Confirm they do not clash with the other tables.
covariates_rna = Reduce(
    function(merged, next_table) {
        merge(merged, next_table, all = TRUE)
    },
    list(liver, liver2, liver3, skin, skin2)
)


In [21]:
# Inspect the merged covariate table: dimensions, column names and types.
str(covariates_rna)

'data.frame':	1035 obs. of  30 variables:
 $ rna_id                       : chr  "GTEX-1117F-2926-SM-5GZYI" "GTEX-111CU-1126-SM-5EGIM" "GTEX-111FC-0126-SM-5N9DL" "GTEX-111FC-2526-SM-5GZXU" ...
 $ b_cell                       : num  NA NA NA NA NA NA NA NA NA NA ...
 $ endothelial                  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ hepatocyte                   : num  NA NA NA NA NA NA NA NA NA NA ...
 $ kupffer_cell                 : num  NA NA NA NA NA NA NA NA NA NA ...
 $ nk_cell                      : num  NA NA NA NA NA NA NA NA NA NA ...
 $ endothelial_venous           : num  NA NA NA NA NA NA NA NA NA NA ...
 $ gdt                          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ hepatocyte0                  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ hepatocyte3                  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ hepatocyte4                  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ macrophage                   : num  NA NA NA NA NA NA NA NA NA NA ...
 $ n

# Create output files
- Metadata
- Covariates for subjects
- Covariates for RNA-seq data

In [22]:
# The first column of the RNA covariates is written out as "assay_id".
names(covariates_rna)[1] = "assay_id"

# (A filter of metadata against the columns of tpm was disabled in the original cell.)
# Keep only the subjects and RNA samples that appear in the metadata table.
covariates_subject = covariates_subject[is.element(covariates_subject$subject_id, metadata$subject_id), ]
covariates_rna     = covariates_rna    [is.element(covariates_rna$assay_id      , metadata$rna_id    ), ]
unrelated          = unrelated         [is.element(unrelated                    , metadata$subject_id)  ]


In [23]:
# Write the output tables as unquoted, tab-separated text without row names.
# The three tables are written with a header line, the two plain lists without.
invisible(local({
    write_tsv = function(object, path, header) {
        write.table(object, path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = header)
    }

    write_tsv(metadata          , "input//metadata/metadata.txt"            , TRUE )
    write_tsv(covariates_subject, "input//covariates/covariates.subject.txt", TRUE )
    write_tsv(covariates_rna    , "input//covariates/covariates.rna.txt"    , TRUE )
    write_tsv(unrelated         , "input//metadata/unrelated.txt"           , FALSE)
    # The row names of tpm come from the "Name" column of the TPM table.
    write_tsv(rownames(tpm)     , "input//phenotypes/rna_list.txt"          , FALSE)
}))
