# GnomAD v2.1.1 exome sites

In [1]:
import hail as hl
hl.init(spark_conf={'spark.driver.memory': '10g'})

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-z7fmq:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20210305-1154-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
output_notebook()

## Load data

In [3]:
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

In [4]:
ht = hl.read_table(BASE_DIR + '/data/resources/gnomAD/gnomad.exomes.r2.1.1.sites.ht')

In [9]:
n_variants = ht.count()
print('Number of variants: ' + str(n_variants))

Number of variants: 17209972


Convert the dataset from reference genom GRCh37 to GRCh38.

In [5]:
# Create a liftover from GRCh37 to GRCh38.
rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
rg37.add_liftover(BASE_DIR + '/data/resources/liftover/grch37_to_grch38.over.chain.gz', rg38)

In [6]:
# Define the locus in GRCh38.
ht = ht.annotate(new_locus=hl.liftover(ht.locus, 'GRCh38'))
# Remove sites where the new locus isn't defined.
ht = ht.filter(hl.is_defined(ht.new_locus))  
# Replace the loci by the new loci, and key the rows by locus and alleles.
# NOTE: the FarGen exome dataset is keyed by both the locus and the alleles, and it is important that the 1kG
# dataset is keyed by the same fields.
ht = ht.key_by(locus=ht.new_locus, alleles=ht.alleles)
ht = ht.drop('new_locus')

## Inspect GnomAD exome variants

Filter the variants based on the AC0 and RF filters, described on the [gnomAD webiste](https://gnomad.broadinstitute.org/faq#whats-the-difference-between-gnomad-v2-and-v3) as follows:

* AC0: The allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)
* RF (gnomAD v2 only): Failed random forest filtering thresholds of 0.055 for exome SNVs, 0.206 for exome indels, 0.263 for genome SNVs, and 0.222 for genome indels


In [7]:
ht = ht.filter(~ht.filters.contains('RF') | ~ht.filters.contains('AC0'))

## Load FarGen exome data

Load filtered, high-quality, variants.

In [9]:
fargen_mt = hl.read_matrix_table(BASE_DIR + '/data/mt/high_quality_variants.mt/')

In [10]:
n_variants, n_samples = fargen_mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

Number of variants: 1194405
Number of samples: 474


## Annotate FarGen data with gnomAD exome frequencies

In [15]:
ht.describe()

----------------------------------------
Global fields:
    'rf': struct {
        variants_by_type: dict<str, int32>, 
        feature_medians: dict<str, struct {
            variant_type: str, 
            n_alt_alleles: int32, 
            qd: float64, 
            pab_max: float64, 
            info_MQRankSum: float64, 
            info_SOR: float64, 
            info_InbreedingCoeff: float64, 
            info_ReadPosRankSum: float64, 
            info_FS: float64, 
            info_QD: float64, 
            info_MQ: float64, 
            info_DP: int32
        }>, 
        test_intervals: array<interval<locus<GRCh37>>>, 
        test_results: array<struct {
            rf_prediction: str, 
            rf_label: str, 
            n: int32
        }>, 
        features_importance: dict<str, float64>, 
        features: array<str>, 
        vqsr_training: bool, 
        no_transmitted_singletons: bool, 
        adj: bool, 
        rf_hash: str, 
        rf_snv_cutoff: struct {
     

In [17]:
alleles = ht.alleles.take(5)

In [20]:
alleles

[['G', 'C'], ['G', 'A'], ['G', 'C'], ['G', 'A'], ['G', 'A']]

In [None]:
freq = ht.freq.AF.take(100)

In [27]:
[x[:5] for x in freq]

[[None, 0.045710833668948854, None, None, None],
 [None, 0.0004409948844593403, None, None, None],
 [0.0, 0.00015578750584203146, None, None, None],
 [None, 0.004347084896010911, None, None, None],
 [0.0, 0.00430126404494382, None, None, None]]

In [26]:
336/2

168.0

In [None]:
exome_ann_ht = ht.select('freq')