## Variant annotations in Hail

In [1]:
from hail import *
# Create the Hail context that the cells below use for every import and read;
# the `log` argument points Hail's log file at /topmed.log.
hc = HailContext(log="/topmed.log")
from pprint import pprint

### Import VCF to VDS and split any multi-allelic sites

In [2]:
# Load the chr10 freeze 5b VCF, requesting a minimum of 500 partitions, then
# split multi-allelic sites.
vds = (
    hc.import_vcf(
        'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/freeze.5b.chr10.phased.pass.minDP0.remDuplicates.vcf.bgz',
        min_partitions=500,
    )
    .split_multi()
)

### Import WGSA annotations as a key table. The (currently commented-out) line below does the following:
    1: import full csv
    2: make a _variant_ type field
    3: key by the variant
    4: remove unneeded fields
    5: split the VEP annotations into an array for each variant (there can be multiple annotations)
    6: expand each variant row to 1 row per vep annotation for ease of filtering

In [3]:
# kt = hc.import_table('freezes_2a_3a_4.snp_indel.annotated.general20170422.subset.gz.chr20.csv',delimiter='\t').annotate("v = chr+':'+pos+':'+ref+':'+alt").annotate('v = Variant(v)').key_by('v').drop(['pos','alt','chr','ref','wgsa_version']).annotate("VEP_ensembl_Consequence = VEP_ensembl_Consequence.split(',')").explode('VEP_ensembl_Consequence')

# kt = hc.import_table('gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/freezes_2a_3a_4.snp_indel.annotated.general20170422.subset.gz.chr21',delimiter='\t').annotate("rchr = chr.replace('chr','')").annotate("v = rchr+':'+pos+':'+ref+':'+alt").annotate('v = Variant(v)').key_by('v').drop(['pos','alt','chr','ref','rchr']).annotate("VEP_ensembl_Consequence = VEP_ensembl_Consequence.split(',')").explode('VEP_ensembl_Consequence')

### Annotate the VDS with the key table

In [4]:
# vds = vds.annotate_variants_table(kt, root='va.wgsa')

### Load expression data

In [5]:
# gtex_pan = hc.import_table('gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/gtex_pan_rpkm2_hg19.csv',delimiter=',').annotate("rchr = chr.replace('chr','')").annotate("start_pad = toInt(transcript_start)-5000").annotate("end_pad = toInt(transcript_end)+5000").annotate("i = rchr+':'+start_pad+'-'+end_pad" ).annotate('i = Interval(i)').key_by('i')

In [6]:
# islet_trans_file = 'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/t2dreamdb_rnaseq_avgFPKM_greater2.bed'
# islet_trans = hc.import_table(islet_trans_file,delimiter='\t').annotate("start_pad = toInt(start_position)-5000").annotate("end_pad = toInt(end_position)+5000").annotate("i = chromosome_name+':'+start_pad+'-'+end_pad" ).annotate('i = Interval(i)').key_by('i')
# Islet transcript intervals loaded from BED files. Judging by the file names,
# presumably genes with average FPKM > 2, once as-is and once with flanking
# padding -- TODO confirm against how the BED files were produced.
islet_trans = KeyTable.import_bed(
    "gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/t2dreamdb_rnaseq_avgFPKM_greater2.bed"
)
islet_trans_pad = KeyTable.import_bed(
    "gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/t2dreamdb_rnaseq_avgFPKM_greater2_padding.bed"
)


### Load the islet chromatin states (active enhancers only)

In [7]:
# Islet chromatin-state intervals, restricted to rows whose `target` is one of
# the two active-enhancer states.
islet_states = (
    KeyTable.import_bed('gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/Islets.chromatinStates.reformat.hg38.bed')
    .filter('target == "10_Active_enhancer_2" || target == "9_Active_enhancer_1"')
)




### Load islet TFBS data

In [8]:
# BED file of islet TFBS intervals (chr10, hg38 according to the file name).
islet_tfbs_file = 'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/all_tfbs_chr10.tss.hg38.bed'
# Load the BED file as a KeyTable so it can be used to filter and annotate variants.
islet_tfbs = KeyTable.import_bed(islet_tfbs_file)
# islet_tfbs = hc.import_table(islet_tfbs_file,delimiter=',').annotate("i = V1+':'+V2+'-'+V3" ).annotate('i = Interval(i)').key_by('i')


### Find variants that fall within expressed genes, islet chromatin states, or islet TFBS (the PTV filter is currently commented out)

In [9]:
# Rare variants inside the padded islet transcript intervals, annotated with
# the overlapping gene, chromatin state and TFBS.
vds_all_in_gene_pad = (
    vds
    # Restrict to variants covered by the padded transcript intervals.
    .filter_variants_table(islet_trans_pad)
    # Compute va.qc (including AF) and cache before the frequency filter.
    .variant_qc()
    .cache()
    # Keep variants with allele frequency below 1%.
    .filter_variants_expr('va.qc.AF < 0.01')
    # Attach the interval annotations under va.gene / va.chr_state / va.tfbs.
    .annotate_variants_table(islet_trans_pad, root='va.gene')
    .annotate_variants_table(islet_states, root='va.chr_state')
    .annotate_variants_table(islet_tfbs, root='va.tfbs')
)


In [10]:
# Rare variants inside the (unpadded) islet transcript intervals, annotated
# with the overlapping gene, chromatin state and TFBS.
vds_all_in_gene = (
    vds
    # Restrict to variants covered by the transcript intervals.
    .filter_variants_table(islet_trans)
    # Compute va.qc (including AF) and cache before the frequency filter.
    .variant_qc()
    .cache()
    # Keep variants with allele frequency below 1%.
    .filter_variants_expr('va.qc.AF < 0.01')
    # Attach the interval annotations under va.gene / va.chr_state / va.tfbs.
    .annotate_variants_table(islet_trans, root='va.gene')
    .annotate_variants_table(islet_states, root='va.chr_state')
    .annotate_variants_table(islet_tfbs, root='va.tfbs')
)

In [11]:
# vds_ptv = vds_all_in_gene.filter_variants_expr('va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("splice_acceptor_variant")) || va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("splice_donor_variant")) || va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("splice_region_variant")) || va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("stop_gained")) || va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("stop_lost")) || va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("start_gained")) || va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("start_lost")) || va.wgsa.VEP_ensembl_Consequence.forall(tc => tc.toSet.contains("frameshift_variant"))')
# vds_ptv = vds_all_in_gene_pad.filter_variants_expr('va.wgsa.VEP_ensembl_Consequence == "splice_acceptor_variant"  || va.wgsa.VEP_ensembl_Consequence == "splice_donor_variant" || va.wgsa.VEP_ensembl_Consequence== "splice_region_variant" || va.wgsa.VEP_ensembl_Consequence == "stop_gained" || va.wgsa.VEP_ensembl_Consequence == "stop_lost" || va.wgsa.VEP_ensembl_Consequence == "start_gained" || va.wgsa.VEP_ensembl_Consequence == "start_lost" || va.wgsa.VEP_ensembl_Consequence == "frameshift_variant"')
# Subset of the in-gene variants that also fall inside an islet TFBS interval.
vds_tfbs = vds_all_in_gene.filter_variants_table(islet_tfbs)
# Subset of the in-gene variants that also fall inside one of the filtered
# islet chromatin-state intervals.
vds_states = vds_all_in_gene.filter_variants_table(islet_states)


In [12]:
# vds_subset = VariantDataset.union([vds_tfbs,vds_ptv])
# Persist the TFBS subset as a VDS so it can be read back later.
vds_tfbs.write(
    'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/chr10_tfbs.vds'
)


In [13]:
# Persist the chromatin-state subset as a VDS so it can be read back later.
vds_states.write(
    'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/chr10_states.vds'
)

In [14]:
# vds_ptv.write('gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/chr10_ptv.vds')


In [15]:
# Read the two subset VDS files back as a single dataset.
# NOTE(review): a variant present in both the TFBS and chromatin-state subsets
# may appear twice in the result -- confirm whether deduplication is needed.
union_vds = hc.read(
    [
        'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/chr10_tfbs.vds',
        'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/chr10_states.vds',
    ]
)
# Bare expression: the notebook displays the variant annotation schema.
union_vds.variant_schema

Struct{rsid:String,qual:Double,filters:Set[String],info:Struct{AC:Array[Int],AN:Int},qc:Struct{callRate:Double,AC:Int,AF:Double,nCalled:Int,nNotCalled:Int,nHomRef:Int,nHet:Int,nHomVar:Int,dpMean:Double,dpStDev:Double,gqMean:Double,gqStDev:Double,nNonRef:Int,rHeterozygosity:Double,rHetHomVar:Double,rExpectedHetFrequency:Double,pHWE:Double},gene:String,chr_state:String,tfbs:String}

### Export the combined variants to a TSV file

In [16]:
# Export the combined dataset's variants to a TSV file; the second argument
# names the output columns and the variant fields/annotations they come from.
union_vds.export_variants(
    'gs://dataproc-0a502eb9-92d4-4031-a99d-7b98ab92717b-us/freeze5b_chr10_islet_states_tfbs.tsv',
    'group_id = va.gene, chromosome = v.locus.contig, variant_id = va.rsid, position=v.locus.position, ref = v.ref, alt=v.alt, chromatin_state = va.chr_state, tfbs = va.tfbs, allele_count = va.qc.AC',
)



In [17]:
# Peek at five variants from `vds` (the dataset imported from the VCF, not the
# filtered subsets) to check contig naming and allele representation.
vds.query_variants('variants.take(5)')

[Variant(contig=chr10, start=826806, ref=G, alts=[AltAllele(ref=G, alt=C)]),
 Variant(contig=chr10, start=826823, ref=A, alts=[AltAllele(ref=A, alt=G)]),
 Variant(contig=chr10, start=826825, ref=G, alts=[AltAllele(ref=G, alt=A)]),
 Variant(contig=chr10, start=826833, ref=T, alts=[AltAllele(ref=T, alt=C)]),
 Variant(contig=chr10, start=826854, ref=T, alts=[AltAllele(ref=T, alt=C)])]

In [18]:
# %%capture
# ! pip install firecloud
# from firecloud import fiss
# import pandas as pd
# import io


In [19]:
# vds_subset = vds_subset.variant_qc()

In [20]:
# data_model = fiss.fapi.get_entities_tsv("topmed-shared","topmed-shared", "sample")
# data_model_text = pd.read_csv(io.StringIO(data_model.text), sep='\t')[['entity:sample_id','participant','CENTER','study','topmed_project']]
# data_model_text.rename(columns = {'entity:sample_id':'ent_sample_id', 'participant':'sample_id'}, inplace = True)
# data_model_text[['study', 'topmed_project']] = data_model_text[['study', 'topmed_project']].astype(str)
# from pyspark.sql import SQLContext
# sqlctx = SQLContext(hc.sc)
# spark_df = sqlctx.createDataFrame(data_model_text)
# kt = KeyTable.from_dataframe(spark_df,key='sample_id')
# vds = vds_subset.annotate_samples_table(kt, root='sa').sample_qc()
# vds = vds.annotate_samples_expr('sa.nDoubles = gs.filter(g => g.isHet() && va.qc.AC == 2).count()')
# vds = vds.annotate_samples_expr('sa.nTri_to_one = gs.filter(g => g.isHet() && va.qc.AC == 3).count()')
# vds = vds.annotate_samples_expr('sa.nOne = gs.filter(g => g.isHet() && va.qc.AF < 0.01 && va.qc.AF > 0.001).count()')
# vds = vds.annotate_samples_expr('sa.nTen = gs.filter(g => g.isHet() && va.qc.AF < 0.1 && va.qc.AF > 0.01).count()')
# vds = vds.annotate_samples_expr('sa.nTen_above = gs.filter(g => g.isHet() && va.qc.AF > 0.1).count()')
# vds.samples_table().aggregate_by_key(key_expr=['Pop = sa.topmed_project'], agg_expr=['Singletons = sa.map(s => sa.qc.nSingleton).stats().sum',
#                                                                                           'Doubletons = sa.map(s => sa.nDoubles).stats().sum',
#                                                                                           'Tripletons_to_01 = sa.map(sa => sa.nTri_to_one).stats().sum',
#                                                                                           'Zero_1_to_1 = sa.map(sa => sa.nOne).stats().sum']).to_pandas()

In [21]:
# vds.samples_table().aggregate_by_key(key_expr=['Pop = sa.topmed_project'], agg_expr=['Single = sa.map(s => sa.qc.nSingleton).stats()']).to_pandas()