In [1]:
from firecloud import fiss
import pandas as pd
pd.set_option('display.max_row', 10000)
import io
import numpy as np
from pprint import pprint

In [2]:
control = fiss.fapi.get_workspace_acl("topmed-shared", "topmed-shared").json()#['acl']
print control.keys()

[u'acl']


In [3]:
## start hail context
from hail import *
hc = HailContext(sc)

Running on Apache Spark version 2.0.2
SparkUI available at http://10.128.0.3:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.1-6f85985


In [4]:
samples_sets = fiss.fapi.get_entities('topmed-shared','topmed-shared', 'sample_set').json()
print 'Sample set name:', samples_sets[0]['name']
print 'Sample set fields:', ', '.join(samples_sets[0]['attributes'].keys())
vcf_files = samples_sets[0]['attributes']['vcf']['items']
print '# of vcf files:', len(vcf_files)

Sample set name: freeze5b_minDP10
Sample set fields: wgsa_subset, gds, vcf, bcf, tbi, cov_grm, samples, wgsa_raw, cor_grm
# of vcf files: 23


For analysis, we will want to combine genotype data in several different ways -- and these will be specific to each working group. One of the first operations then, to make the data more managable, will be to create variant data sets (vds) per cohort/consent combination.

In [6]:
data_model = fiss.fapi.get_entities_tsv("topmed-shared","topmed-shared", "sample")
data_model_text = pd.read_csv(io.StringIO(data_model.text), sep='\t')[['entity:sample_id','participant','CENTER','study','topmed_project','consent']]
data_model_text.rename(columns = {'entity:sample_id':'ent_sample_id', 'participant':'sample_id'}, inplace = True)
data_model_text[['study', 'topmed_project','consent']] = data_model_text[['study', 'topmed_project','consent']].astype(str)
data_model_text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56436 entries, 0 to 56435
Data columns (total 6 columns):
ent_sample_id     56436 non-null object
sample_id         56436 non-null object
CENTER            56436 non-null object
study             56436 non-null object
topmed_project    56436 non-null object
consent           56436 non-null object
dtypes: object(6)

Taking a look at the top of this data frame:

In [7]:
data_model_text.head()

Unnamed: 0,ent_sample_id,sample_id,CENTER,study,topmed_project,consent
0,freeze5a_NWD100014,NWD100014,uw,JHS,JHS,HMB-IRB-NPU
1,freeze5a_NWD100018,NWD100018,broad,COPDGene,COPD,HMB
2,freeze5a_NWD100027,NWD100027,macrogen,GeneSTAR,GeneSTAR,DS-CVD-IRB-NPU-MDS
3,freeze5a_NWD100047,NWD100047,uw,EOCOPD,COPD,DS-CS-RD
4,freeze5a_NWD100048,NWD100048,broad,VU_AF,AFGen,GRU-IRB


We are going to loop through each chromosome, and create vds subsets per cohort/consent group.

In [31]:
i = 13

vcf_file = vcf_files[i]
print vcf_file

label = vcf_file.split('/')[-1]
label = ".".join(label.split('.')[:-2])
print label

gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/25c6cc4a-9d07-49bc-a548-9166f7a5f5ad/w/e111b73f-4bff-4079-8c73-f9951d8dcb7a/call-runVcf/shard-13/freeze.5b.chr21.pass_and_fail.gtonly.minDP10.vcf.bgz
freeze.5b.chr21.pass_and_fail.gtonly.minDP10


In [32]:
vds = hc.import_vcf(vcf_files[i],min_partitions = 500).split_multi()

2017-11-25 21:08:25 Hail: INFO: No multiallelics detected.
2017-11-25 21:08:26 Hail: INFO: Coerced sorted dataset
2017-11-25 21:08:26 Hail: WARN: called redundant split on an already split VDS


Annotate the samples

In [33]:
from pyspark.sql import SQLContext
sqlctx = SQLContext(hc.sc)
spark_df = sqlctx.createDataFrame(data_model_text)
kt = KeyTable.from_dataframe(spark_df,key='sample_id') 
vds = vds.annotate_samples_table(kt, root='sa')

Extract samples for each sample/consent combination, excluding the consent="NA"

In [57]:
data_model_text['cohort_consent'] = data_model_text['topmed_project'].map(str) + "." + data_model_text['consent']

In [58]:
cohort_consents = data_model_text['cohort_consent'].unique()
print cohort_consents

['JHS.HMB-IRB-NPU' 'COPD.HMB' 'GeneSTAR.DS-CVD-IRB-NPU-MDS' 'COPD.DS-CS-RD'
 'AFGen.GRU-IRB' 'WHI.HMB-IRB' 'CFS.DS-HLBS-IRB-NPU' 'FHS.HMB-IRB-MDS'
 'HyperGEN_GENOA.GRU-IRB' 'VTE.HMB-IRB' 'THRV.DS-CVD-IRB-COL-NPU-RD'
 'HyperGEN_GENOA.DS-ASC-RF-NPU' 'GenSalt.GRU-IRB' 'SAFS.nan' 'MESA.HMB'
 'BAGS.GRU-IRB' 'GOLDN.DS-CVD-IRB' 'SAS.GRU-IRB-PUB-COL-NPU-GSO'
 'CRA.DS-ASTHMA-IRB-MDS-RD' 'WHI.HMB-IRB-NPU' 'JHS.HMB-IRB'
 'PGX_Asthma.DS-LD-RD' 'THRV.nan' 'VTE.GRU' 'FHS.HMB-IRB-NPU-MDS'
 'PGX_Asthma.DS-LD' 'JHS.DS-FDO-IRB-NPU' 'AA_CAC.HMB-IRB-COL-NPU'
 'Sarcoidosis.DS-SAR-IRB' 'JHS.DS-FDO-IRB' 'MESA.nan' 'VTE.nan'
 'AFGen.DS-AF-IRB-RD' 'Amish.HMB-IRB-MDS' 'AFGen.HMB-IRB-NPU-MDS'
 'AFGen.HMB-IRB' 'AFGen.HMB' 'AA_CAC.HMB' 'AFGen.nan'
 'HyperGEN_GENOA.DS-CVD-IRB-RD' 'AA_CAC.DS-ASC-RF-NPU'
 'AA_CAC.DS-CVD-IRB-NPU-MDS' 'AFGen.HMB-IRB-MDS' 'VTE.HMB-NPU-MDS'
 'nan.NRUP' 'BAGS.nan' 'Amish.nan' 'HyperGEN_GENOA.nan' 'AA_CAC.nan'
 'JHS.nan' 'CFS.nan' 'FHS.nan' 'AA_CAC.DS-DHD-IRB-COL-NPU' 'PGX_Asthma.nan'
 'CR

In [63]:
data_model_text['cohort_consent']

0                 JHS.HMB-IRB-NPU
1                        COPD.HMB
2     GeneSTAR.DS-CVD-IRB-NPU-MDS
3                   COPD.DS-CS-RD
4                   AFGen.GRU-IRB
5                   AFGen.GRU-IRB
6                        COPD.HMB
7                     WHI.HMB-IRB
8             CFS.DS-HLBS-IRB-NPU
9                        COPD.HMB
10                FHS.HMB-IRB-MDS
11         HyperGEN_GENOA.GRU-IRB
12    GeneSTAR.DS-CVD-IRB-NPU-MDS
13                  AFGen.GRU-IRB
14                    VTE.HMB-IRB
...
56421               WHI.HMB-IRB-NPU
56422                      COPD.HMB
56423                      COPD.HMB
56424                      COPD.HMB
56425                   WHI.HMB-IRB
56426                 AFGen.GRU-IRB
56427                      SAFS.nan
56428    THRV.DS-CVD-IRB-COL-NPU-RD
56429               WHI.HMB-IRB-NPU
56430                      COPD.HMB
56431      CRA.DS-ASTHMA-IRB-MDS-RD
56432                 AFGen.GRU-IRB
56433                      COPD.HMB
56434             

In [67]:
consent_grouped = data_model_text.groupby('cohort_consent')

In [69]:
one_group = consent_grouped.get_group('JHS.HMB-IRB-NPU')

In [72]:
one_group["sample_id"].describe()

count           697
unique          697
top       NWD589399
freq              1
dtype: object

In [77]:
one_group["sample_id"].values

array(['NWD100014', 'NWD100597', 'NWD100900', 'NWD102903', 'NWD103804',
       'NWD107594', 'NWD108366', 'NWD109345', 'NWD112373', 'NWD113503',
       'NWD114901', 'NWD115576', 'NWD117670', 'NWD119675', 'NWD120808',
       'NWD121168', 'NWD122406', 'NWD123492', 'NWD125918', 'NWD126673',
       'NWD127039', 'NWD127105', 'NWD128382', 'NWD128980', 'NWD130049',
       'NWD131190', 'NWD131470', 'NWD132536', 'NWD133474', 'NWD136646',
       'NWD140186', 'NWD140972', 'NWD141914', 'NWD143467', 'NWD144171',
       'NWD147037', 'NWD148009', 'NWD149390', 'NWD153049', 'NWD155095',
       'NWD155490', 'NWD155709', 'NWD155738', 'NWD158614', 'NWD158978',
       'NWD162478', 'NWD163306', 'NWD164010', 'NWD164654', 'NWD165668',
       'NWD166048', 'NWD169202', 'NWD170185', 'NWD170530', 'NWD171177',
       'NWD173120', 'NWD173675', 'NWD174851', 'NWD176480', 'NWD178783',
       'NWD179190', 'NWD179781', 'NWD180145', 'NWD181046', 'NWD182275',
       'NWD183109', 'NWD183770', 'NWD184736', 'NWD184773', 'NWD1

In [78]:
vds_subset_one_group = vds.filter_samples_list(list(one_group["sample_id"].values), keep=True)

In [79]:
## this takes a very long time
#vds_subset_one_group.summarize()

Summary(samples=697, variants=7762483, call_rate=0.991701, contigs=['chr21'], multiallelics=0, snps=6949524, mnps=0, insertions=287547, deletions=525412, complex=0, star=0, max_alleles=2)

In [80]:
vds_subset_one_group_variant_qc = vds_subset_one_group.variant_qc()

In [81]:
vds_subset_one_group_sample_qc = vds_subset_one_group.variant_qc().cache().sample_qc()

In [91]:
print "gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/tmp/" + "JHS.HMB-IRB-NPU." + label + ".sample_qc"

gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/tmp/JHS.HMB-IRB-NPU.freeze.5b.chr21.pass_and_fail.gtonly.minDP10.sample_qc


In [94]:
vds_subset_one_group_sample_qc.samples_table().flatten().export("gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/tmp/" + "JHS.HMB-IRB-NPU." + label + ".sample_qc.tsv",types_file=".tsv")
vds_subset_one_group_variant_qc.variants_table().flatten().export("gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/tmp/" + "JHS.HMB-IRB-NPU." + label + ".variant_qc.tsv",types_file=".tsv")

2017-11-25 23:21:37 Hail: INFO: while writing:
    gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/tmp/JHS.HMB-IRB-NPU.freeze.5b.chr21.pass_and_fail.gtonly.minDP10.sample_qc.tsv
  merge time: 2.329s
2017-11-25 23:45:24 Hail: INFO: while writing:
    gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/tmp/JHS.HMB-IRB-NPU.freeze.5b.chr21.pass_and_fail.gtonly.minDP10.variant_qc
  merge time: 4m37.0s
2017-11-26 00:02:28 Hail: INFO: while writing:
    gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/tmp/JHS.HMB-IRB-NPU.freeze.5b.chr21.pass_and_fail.gtonly.minDP10.variant_qc.tsv
  merge time: 3m10.4s


Finally, export the VDS file after removing monomorphic variants.

In [109]:
vds_subset_one_group = vds_subset_one_group.filter_variants_expr("")

Struct{rsid:String,qual:Double,filters:Set[String],info:Struct{AVGDP:Double,AC:Array[Int],AN:Int,AF:Array[Double],GC:Array[Int],GN:Int,HWEAF_P:Array[Double],FIBC_P:Double,HWE_SLP_P:Double,FIBC_I:Double,HWE_SLP_I:Double,MAX_IF:Double,MIN_IF:Double,ABE:Double,ABZ:Double,NS_NREF:Int,BQZ:Double,CYZ:Double,STZ:Double,NMZ:Double,IOR:Double,NM0:Double,NM1:Double,SVM:Double,OVERLAP:Array[String],ANN:Array[String],LOF:Array[String],NMD:Array[String]}}


In [110]:
print vds_subset_one_group_variant_qc.variant_schema

Struct{rsid:String,qual:Double,filters:Set[String],info:Struct{AVGDP:Double,AC:Array[Int],AN:Int,AF:Array[Double],GC:Array[Int],GN:Int,HWEAF_P:Array[Double],FIBC_P:Double,HWE_SLP_P:Double,FIBC_I:Double,HWE_SLP_I:Double,MAX_IF:Double,MIN_IF:Double,ABE:Double,ABZ:Double,NS_NREF:Int,BQZ:Double,CYZ:Double,STZ:Double,NMZ:Double,IOR:Double,NM0:Double,NM1:Double,SVM:Double,OVERLAP:Array[String],ANN:Array[String],LOF:Array[String],NMD:Array[String]},qc:Struct{callRate:Double,AC:Int,AF:Double,nCalled:Int,nNotCalled:Int,nHomRef:Int,nHet:Int,nHomVar:Int,dpMean:Double,dpStDev:Double,gqMean:Double,gqStDev:Double,nNonRef:Int,rHeterozygosity:Double,rHetHomVar:Double,rExpectedHetFrequency:Double,pHWE:Double}}


In [98]:
vds_subset_one_group.repartition(500)

FatalError: HailException: Struct has no field `qc'
  Available fields:
    rsid: String
    qual: Double
    filters: Set[String]
    info: Struct{AVGDP:Double,AC:Array[Int],AN:Int,AF:Array[Double],GC:Array[Int],GN:Int,HWEAF_P:Array[Double],FIBC_P:Double,HWE_SLP_P:Double,FIBC_I:Double,HWE_SLP_I:Double,MAX_IF:Double,MIN_IF:Double,ABE:Double,ABZ:Double,NS_NREF:Int,BQZ:Double,CYZ:Double,STZ:Double,NMZ:Double,IOR:Double,NM0:Double,NM1:Double,SVM:Double,OVERLAP:Array[String],ANN:Array[String],LOF:Array[String],NMD:Array[String]}
<input>:1:va.qc.AC > 0
            ^

Java stack trace:
is.hail.utils.HailException: Struct has no field `qc'
  Available fields:
    rsid: String
    qual: Double
    filters: Set[String]
    info: Struct{AVGDP:Double,AC:Array[Int],AN:Int,AF:Array[Double],GC:Array[Int],GN:Int,HWEAF_P:Array[Double],FIBC_P:Double,HWE_SLP_P:Double,FIBC_I:Double,HWE_SLP_I:Double,MAX_IF:Double,MIN_IF:Double,ABE:Double,ABZ:Double,NS_NREF:Int,BQZ:Double,CYZ:Double,STZ:Double,NMZ:Double,IOR:Double,NM0:Double,NM1:Double,SVM:Double,OVERLAP:Array[String],ANN:Array[String],LOF:Array[String],NMD:Array[String]}
<input>:1:va.qc.AC > 0
            ^
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:6)
	at is.hail.utils.package$.fatal(package.scala:27)
	at is.hail.expr.ParserUtils$.error(Parser.scala:24)
	at is.hail.expr.AST.parseError(AST.scala:238)
	at is.hail.expr.Select.typecheckThis(AST.scala:303)
	at is.hail.expr.AST.typecheckThis(AST.scala:229)
	at is.hail.expr.AST.typecheck(AST.scala:235)
	at is.hail.expr.AST$$anonfun$typecheck$1.apply(AST.scala:234)
	at is.hail.expr.AST$$anonfun$typecheck$1.apply(AST.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at is.hail.expr.AST.typecheck(AST.scala:234)
	at is.hail.expr.AST$$anonfun$typecheck$1.apply(AST.scala:234)
	at is.hail.expr.AST$$anonfun$typecheck$1.apply(AST.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at is.hail.expr.AST.typecheck(AST.scala:234)
	at is.hail.expr.Apply.typecheck(AST.scala:520)
	at is.hail.expr.Parser$.is$hail$expr$Parser$$eval(Parser.scala:57)
	at is.hail.expr.Parser$.evalExpr(Parser.scala:74)
	at is.hail.expr.Parser$.evalTypedExpr(Parser.scala:67)
	at is.hail.expr.FilterVariants.execute(Relational.scala:348)
	at is.hail.variant.VariantSampleMatrix.value$lzycompute(VariantSampleMatrix.scala:415)
	at is.hail.variant.VariantSampleMatrix.value(VariantSampleMatrix.scala:413)
	at is.hail.variant.VariantSampleMatrix.x$11$lzycompute(VariantSampleMatrix.scala:418)
	at is.hail.variant.VariantSampleMatrix.x$11(VariantSampleMatrix.scala:418)
	at is.hail.variant.VariantSampleMatrix.rdd$lzycompute(VariantSampleMatrix.scala:418)
	at is.hail.variant.VariantSampleMatrix.rdd(VariantSampleMatrix.scala:418)
	at is.hail.variant.VariantDatasetFunctions$.withGenotypeStream$extension(VariantDataset.scala:196)
	at is.hail.variant.VariantDatasetFunctions$.coalesce$extension(VariantDataset.scala:202)
	at is.hail.variant.VariantDatasetFunctions.coalesce(VariantDataset.scala:201)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

Hail version: 0.1-6f85985
Error summary: HailException: Struct has no field `qc'
  Available fields:
    rsid: String
    qual: Double
    filters: Set[String]
    info: Struct{AVGDP:Double,AC:Array[Int],AN:Int,AF:Array[Double],GC:Array[Int],GN:Int,HWEAF_P:Array[Double],FIBC_P:Double,HWE_SLP_P:Double,FIBC_I:Double,HWE_SLP_I:Double,MAX_IF:Double,MIN_IF:Double,ABE:Double,ABZ:Double,NS_NREF:Int,BQZ:Double,CYZ:Double,STZ:Double,NMZ:Double,IOR:Double,NM0:Double,NM1:Double,SVM:Double,OVERLAP:Array[String],ANN:Array[String],LOF:Array[String],NMD:Array[String]}
<input>:1:va.qc.AC > 0
            ^

In [None]:
ids_mask = data_model_text.map(lambda x: x['study'] in ['Amish', 'SAS', 'JHS','MESA']
ids_filtered = data_model_text[].sample_id


In [11]:
vds = vds.variant_qc().cache().sample_qc()

KeyboardInterrupt: 

In [None]:
vds.summarize().report()

In [None]:
pprint(vds.sample_schema)

## Running PCA on Freeze 5b Genotypes

In [None]:
# do pca
pca = vds.pca('sa.pca', k=5, eigenvalues='global.eigen')

### Visualize the differences in families within the studies by principle components.

In [None]:
# define some colors
c = ["aec7e8","ff7f0e","ffbb78","2ca02c","98df8a","d62728","ff9896","9467bd","c5b0d5","8c564b","c49c94"]
cz = zip(vds.query_samples('samples.map(s => sa.topmed_project).counter()').keys(),c)
colors = {t[0]: t[1] for t in cz}
# show the pca results
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import Counter
from math import log, isnan
from pprint import pprint
%matplotlib inline
pca_table = pca.samples_table().to_pandas()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"],
            c = pca_table["sa.metadata.SuperPopulation"].map(colors),
            alpha = .5)
plt.xlim(-0.6, 0.6)
plt.xlabel("PC1")
plt.ylabel("PC2")
legend_entries = [mpatches.Patch(color=c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()