Skip to content
Permalink
Browse files

Some VEP resources (#104)

* Adding vep_or_lookup_vep, some paths to resources, and a minor fix to process_csq in light of OS

* optional kwargs

* oops

* docstring and name

* add option for vep_config

* addressed comments
  • Loading branch information...
konradjk committed Jun 10, 2019
1 parent 3675b5a commit 33c85f0fb40b4c3561c5447e9d626f7c76277725
Showing with 78 additions and 16 deletions.
  1. +30 −9 resources/basics.py
  2. +48 −7 utils/generic.py
@@ -434,13 +434,37 @@ def cpg_sites_ht_path():
return 'gs://gnomad-public/resources/methylation/cpg.ht'


def methylation_sites_mt_path(hail_version=CURRENT_HAIL_VERSION, ref='GRCh37'):
REFERENCE_DATA = {
'GRCh37': {
'vep_config': 'gs://hail-common/vep/vep/vep85-loftee-gcloud.json',
'all_possible': 'gs://gnomad-public/papers/2019-flagship-lof/v1.0/context/Homo_sapiens_assembly19.fasta.snps_only.vep_20181129.ht',
'methylation': 'gs://gnomad-public/resources/methylation/methylation.ht',
},
'GRCh38': {
'vep_config': 'gs://hail-common/vep/vep/vep95-GRCh38-loftee-gcloud.json',
'methylation': 'gs://gnomad-resources/methylation/hail-0.2/methylation_GRCh38.ht',
}
}

def methylation_sites_ht_path(ref: str = 'GRCh37'):
if ref not in REFERENCE_DATA:
raise DataException("Select reference as one of: {}".format(','.join(REFERENCES)))
return REFERENCE_DATA[ref]['methylation']


def context_ht_path(ref: str = 'GRCh37'):
if ref not in ('GRCh37', ):
raise DataException("Reference must be GRCh37")
return REFERENCE_DATA[ref]['all_possible']


def vep_config_path(ref: str = 'GRCh37'):
if ref not in REFERENCES:
return DataException("Select reference as one of: {}".format(','.join(REFERENCES)))
if ref == 'GRCh37':
return 'gs://gnomad-public/resources/methylation/methylation.ht'.format(hail_version)
else:
return 'gs://gnomad-resources/methylation/hail-{}/methylation_GRCh38.ht'.format(hail_version)
raise DataException("Select reference as one of: {}".format(','.join(REFERENCES)))
return REFERENCE_DATA[ref]['vep_config']


vep_config = vep_config_path() # For backwards-compatibility


dbsnp_vcf_path = "gs://gnomad-public/truth-sets/source/All_20180423.vcf.bgz"
@@ -469,10 +493,7 @@ def methylation_sites_mt_path(hail_version=CURRENT_HAIL_VERSION, ref='GRCh37'):
genome_evaluation_intervals_path_hg38 = "gs://gnomad-public/intervals/hg38-v0-wgs_evaluation_regions.hg38.interval_list"
# More can be found at gs://broad-references/hg19

vep_config = 'gs://hail-common/vep/vep/vep85-loftee-gcloud.json'

# Annotations
context_mt_path = 'gs://gnomad-resources/context/hail-0.2/context_processed.mt'
constraint_ht_path = 'gs://gnomad-public/release/2.1/ht/constraint/constraint.ht'


@@ -232,6 +232,46 @@ def filter_low_conf_regions(mt: Union[hl.MatrixTable, hl.Table], filter_lcr: boo
return mt


def vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config=None):
"""
VEP a table, or lookup variants in a reference database
:param Table ht: Input Table
:param Table reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
:param ReferenceGenome reference: If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference)
:param str vep_config: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen)
:return: VEPped Table
:rtype: Table
"""
from gnomad_hail.resources.basics import vep_config_path, context_ht_path

VEP_REFERENCES = {
'GRCh37': context_ht_path(),
'GRCh38': '',
}

if reference_vep_ht is None:
if reference is None:
reference = hl.default_reference().name

if reference not in VEP_REFERENCES:
raise ValueError(f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(VEP_REFERENCES.keys())}')

reference_vep_ht = hl.read_table(VEP_REFERENCES[reference])

ht = ht.annotate(vep=reference_vep_ht[ht.key].vep)

vep_ht = ht.filter(hl.is_defined(ht.vep))
revep_ht = ht.filter(hl.is_missing(ht.vep))

if vep_config is None:
vep_config = vep_config_path(reference)

revep_ht = hl.vep(revep_ht, vep_config)

return vep_ht.union(revep_ht)


def add_most_severe_consequence_to_consequence(tc: hl.expr.StructExpression) -> hl.expr.StructExpression:
"""
Add most_severe_consequence annotation to transcript consequences
@@ -276,13 +316,14 @@ def csq_score(tc):

tcl = tcl.map(lambda tc: tc.annotate(
csq_score=hl.case(missing_false=True)
.when((tc.lof == 'HC') & (tc.lof_flags == ''), csq_score(tc) - no_flag_score)
.when((tc.lof == 'HC') & (tc.lof_flags != ''), csq_score(tc) - flag_score)
.when(tc.lof == 'LC', csq_score(tc) - 10)
.when(tc.polyphen_prediction == 'probably_damaging', csq_score(tc) - 0.5)
.when(tc.polyphen_prediction == 'possibly_damaging', csq_score(tc) - 0.25)
.when(tc.polyphen_prediction == 'benign', csq_score(tc) - 0.1)
.default(csq_score(tc))
.when((tc.lof == 'HC') & (tc.lof_flags == ''), csq_score(tc) - no_flag_score)
.when((tc.lof == 'HC') & (tc.lof_flags != ''), csq_score(tc) - flag_score)
.when(tc.lof == 'OS', csq_score(tc) - 20)
.when(tc.lof == 'LC', csq_score(tc) - 10)
.when(tc.polyphen_prediction == 'probably_damaging', csq_score(tc) - 0.5)
.when(tc.polyphen_prediction == 'possibly_damaging', csq_score(tc) - 0.25)
.when(tc.polyphen_prediction == 'benign', csq_score(tc) - 0.1)
.default(csq_score(tc))
))
return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0])

1 comment on commit 33c85f0

@knguyen142

This comment has been minimized.

Copy link

commented on 33c85f0 Jun 10, 2019

We were able to run this on our pipeline and unless I goofed, it actually took longer (from 1:51 to 2:45). I think loading big tables is pretty costly, I'll look it over to see if I did something funky.

Please sign in to comment.
You can’t perform that action at this time.