# Get the exome from the gnomAD genome data

In [1]:
import hail as hl
hl.init(spark_conf={'spark.driver.memory': '100g', 'spark.local.dir': '/home/olavur/tmp'})

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_genome_genotypes/hail-20210323-1501-0.2.61-3c86d3ba497a.log


In [2]:
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
RESOURCES_DIR = '/non-fargen/resources'

## Load gnomAD data

In [3]:
gnomad_mt = hl.read_matrix_table(RESOURCES_DIR + '/gnomAD/gnomad.genomes.v3.1.hgdp_1kg_subset_dense.mt')

In [4]:
n_variants, n_samples = gnomad_mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

Number of variants: 175312130
Number of samples: 3942


## Load exome target BED file

Load the SureSelect Human All Exon V6 UTR target BED file, which is used in the FarGen Phase I exome sequencing.

In [5]:
interval_ht = hl.import_bed(RESOURCES_DIR + '/sureselect_human_all_exon_v6_utr_grch38/S07604624_Padded.bed', reference_genome='GRCh38')

2021-03-23 15:05:30 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)


## Filter data

### Filter poor quality variants

Filter the variants based on the AC0 and RF filters, described on the [gnomAD webiste](https://gnomad.broadinstitute.org/faq#whats-the-difference-between-gnomad-v2-and-v3) as follows:

* AC0: The allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)
* RF (gnomAD v2 only): Failed random forest filtering thresholds of 0.055 for exome SNVs, 0.206 for exome indels, 0.263 for genome SNVs, and 0.222 for genome indels


In [6]:
gnomad_mt = gnomad_mt.filter_rows(~gnomad_mt.filters.contains('RF') | ~gnomad_mt.filters.contains('AC0'))

### Extract exome

In [7]:
gnomad_exome_mt = gnomad_mt.filter_rows(hl.is_defined(interval_ht[gnomad_mt.locus]))

In [9]:
n_variants, n_samples = gnomad_exome_mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

2021-03-23 15:22:31 Hail: INFO: Coerced sorted dataset


Number of variants: 7094228
Number of samples: 3942


## Write data to disk

In [10]:
if True:
    gnomad_exome_mt.write(RESOURCES_DIR + '/gnomAD/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt', overwrite=True)

2021-03-23 15:35:19 Hail: INFO: Coerced sorted dataset
2021-03-23 16:47:06 Hail: INFO: wrote matrix table with 7094228 rows and 3942 columns in 115375 partitions to /non-fargen/resources/gnomAD/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt
    Total size: 99.22 GiB
    * Rows/entries: 99.22 GiB
    * Columns: 1.08 MiB
    * Globals: 7.12 KiB
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  14831 rows (135.42 MiB)
