In [None]:
library(data.table)
library(xml2)
library(XML)
library(dplyr)

# Read GTEx V8 Subject Phenotype Data

In [None]:
# Load the subject phenotype table. `skip` is a string, so fread starts
# reading at the first line that contains it and uses that line as the header.
phenos <- fread(
  "data/phenotypes/GTEx_Subject_Phenotypes.GRU.txt.gz",
  skip = "dbGaP_Subject_ID",
  header = TRUE
)
head(phenos)

In [None]:
# Count subjects per SEX code.
table(SEX = phenos$SEX)

Males are coded as "1" and females as "2" (according to the data dictionary).

In [None]:
# Split subjects by SEX code; %in% never yields NA, so rows with a missing
# SEX fall into neither table.
males <- phenos[SEX %in% 1]
females <- phenos[SEX %in% 2]

# Load Sample Attributes

In [None]:
# Load the sample attribute table, starting at the line that contains the
# "dbGaP_Sample_ID" column name and treating that line as the header.
sampleattr <- fread(
  "data/phenotypes//GTEx_Sample_Attributes.GRU.txt.gz",
  skip = "dbGaP_Sample_ID",
  header = TRUE
)
head(sampleattr)

In [None]:
# Parse the sample-attribute data dictionary (XML) and show the document.
sampleattrxml <- read_xml(
  x = "data/phenotypes//GTEx_Sample_Attributes.data_dict.xml"
)
sampleattrxml

In [None]:
# Build a name -> description lookup for the sample attributes.
#
# Name and description are read from inside each <variable> node rather than
# from two independent document-wide node sets paired by position. That way a
# variable with no <description> yields NA for that row instead of shifting
# every later description onto the wrong name (or failing on unequal lengths),
# and the table-level description no longer has to be dropped by index.
# NOTE(review): assumes every attribute is a <variable> element with <name> and
# <description> children -- confirm against the data dictionary file.
sample_variables <- xml_find_all(sampleattrxml, xpath = "//variable")
samplecolnames <- xml_text(xml_find_first(sample_variables, xpath = "./name"))
sampledesc <- xml_text(
  xml_find_first(sample_variables, xpath = "./description")
)
sampleattr_tb <- tibble(attrname = samplecolnames, description = sampledesc)
sampleattr_tb

In [None]:
# Show only the identifier and tissue-related columns.
sampleattr[, .(dbGaP_Sample_ID, SAMPID, SMSMPSTE, SMTS, SMTSD)]

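So the variable for the detailed tissue type seems to be `SMTSD` (`SMTS`, used below to select the brain and nerve samples, is the broader tissue category).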

In [None]:
# Count samples per detailed tissue type.
table(sampleattr[["SMTSD"]])

In [None]:
# Keep only samples whose broad tissue category is Brain or Nerve, then count
# them per detailed tissue type.
brainsamples <- sampleattr[SMTS %in% c("Brain", "Nerve")]
table(brainsamples[["SMTSD"]])

In [None]:
# Preview the key columns of the brain/nerve subset and report its size.
head(brainsamples[, .(dbGaP_Sample_ID, SAMPID, SMSMPSTE, SMTS, SMTSD)])
paste("Total number of brain and nerve samples:", nrow(brainsamples))

# Obtain the Subject and Sample IDs Needed

In [None]:
# Derive the subject ID from each sample ID.
# The pattern is anchored and stops at the first dash after the "GTEX-" prefix
# ([^-]+). A greedy ".*" would instead run to the LAST dash and return most of
# the sample ID rather than the subject ID.
# NOTE(review): assumes SUBJID is "GTEX-" plus the first dash-delimited field
# of SAMPID -- confirm against the data. IDs that do not match are left as-is.
subjid <- sub("^(GTEX-[^-]+)-.*$", "\\1", brainsamples$SAMPID)
paste("Number of subjects: ", length(unique(subjid)))

# Look up each sample's phenotype row by subject ID. match() returns positions
# in the same order as `brainsamples`, so the column-wise bind below pairs each
# sample with its own subject. merge() is avoided here because it re-sorts by
# the key and drops unmatched rows, which silently misaligns a positional bind.
pheno_row <- match(subjid, phenos$SUBJID)
has_pheno <- !is.na(pheno_row)
if (!all(has_pheno)) {
  warning(
    sum(!has_pheno),
    " sample(s) have no matching subject phenotype row and were dropped",
    call. = FALSE
  )
}
x <- phenos[pheno_row[has_pheno], ]
#head(x[,c("SUBJID","SEX","AGE","RACE","ETHNCTY")])
x2 <- bind_cols(brainsamples[has_pheno, ], x)

# Turn the detailed tissue name into a file-name-safe token: drop dashes,
# replace whitespace with dots, collapse any run of dots to a single dot, and
# strip parentheses.
tissue <- x2$SMTSD
tissue <- gsub("-", "", tissue)
tissue <- gsub("[[:space:]]", ".", tissue)
tissue <- gsub("\\.+", ".", tissue)
tissue <- gsub("\\(|\\)", "", tissue)

# Map SEX codes explicitly (1 = male, 2 = female). Any other or missing code
# is labelled "unknown" rather than being counted as female.
sex <- c("male", "female")[match(x2$SEX, c(1, 2))]
sex[is.na(sex)] <- "unknown"

# Output file prefix per sample: <outdir><tissue>.<sex>
outdir <- "data/phenotypes/sample_subsets/"
fname <- paste0(outdir, tissue, ".", sex)

x3 <- bind_cols(x2, fname = fname)

# Columns reported here and written to the per-group attribute files.
colist <- c(
  "dbGaP_Sample_ID", "SAMPID", "SUBJID", "SEX", "AGE", "RACE", "ETHNCTY",
  "SMSMPSTE", "SMTS", "SMTSD", "fname"
)

head(x3[, ..colist])
nrow(x3)

We need a separate sample list for each brain/nerve tissue type, further split by sex.  
In addition, each sample list must only contain samples that have both genotype and expression data.  

In [None]:
# Read the PLINK .fam file that lists the genotyped individuals.
# header = FALSE is set explicitly so the columns are always named V1, V2, ...
# regardless of fread's header auto-detection; the export loop further down
# joins on the column named "V2".
# NOTE(review): per the PLINK .fam format the file has no header line and
# column 2 is the within-family (individual) ID -- confirm for this file.
famfile <- "data/genotypes/phg001219.v1.GTEx_v8_WGS.genotype-calls-vcf.c1/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.SHAPEIT2_phased_bplink.fam"
fam <- fread(famfile, header = FALSE)
head(fam)

In [None]:
#by_tissue_sex <- x3 %>% group_by(SMTSD,SEX)
#bytissue %>% summarise(n=n())

# For every tissue/sex file prefix, write three tab-separated files restricted
# to subjects that also appear in the genotype .fam file (column V2):
#   <prefix>.SAMPID_list.txt        sample IDs only, no header
#   <prefix>.SAMPID_lookup.txt      SAMPID -> SUBJID mapping, with header
#   <prefix>.sample_attributes.txt  the columns named in `colist`, with header
# Groups with no genotyped subject produce no files.
fnames <- unique(x3$fname)

# De-duplicate the genotype IDs once, outside the loop, so a repeated ID in the
# .fam file cannot multiply sample rows in the merge below.
genotyped_ids <- unique(fam[, 2])

for (i in fnames) {
    s <- subset(x3, x3$fname %in% i)
    s <- merge(s, genotyped_ids, by.x = "SUBJID", by.y = "V2")
    if (nrow(s) > 0) {
        # fwrite does not create missing directories; make sure the target exists.
        dir.create(dirname(i), recursive = TRUE, showWarnings = FALSE)
        fwrite(s[, "SAMPID"], paste0(i, ".SAMPID_list.txt"),
               quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
        fwrite(s[, c("SAMPID", "SUBJID")], paste0(i, ".SAMPID_lookup.txt"),
               quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
        fwrite(s[, ..colist], paste0(i, ".sample_attributes.txt"),
               quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
    }
}


# Subject Phenotype Data Dictionary

The subject phenotype data dictionary is easiest to read directly at the following link:  
https://ftp.ncbi.nlm.nih.gov/dbgap/studies/phs000424/phs000424.v8.p2/pheno_variable_summaries/phs000424.v8.pht002742.v8.GTEx_Subject_Phenotypes.data_dict.xml

In [None]:
# Parse the subject-phenotype variable report and collect every <variable>
# node found anywhere in the document.
phenoxml <- read_xml(
  x = "data/phenotypes//GTEx_Subject_Phenotypes.var_report.xml"
)
phenoxml_variable <- phenoxml %>% xml_find_all(xpath = "//variable")

#phenoxml_variable_text <- xml_text(phenoxml_variable)
#phenoxml

In [None]:
#phenolist <- read_xml("data/phenotypes//GTEx_Subject_Phenotypes.var_report.xml") %>% as_list()
#phenolist

In [None]:
#phenoxml_variables <- xmlToDataFrame(nodes=getNodeSet(phenoxml, "//variable"))
#phenoxml_variables