In [1]:
import hail as hl
hl.init(spark_conf={'spark.driver.memory': '10g'}, tmp_dir='/home/olavur/tmp')

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20210326-0956-0.2.61-3c86d3ba497a.log


In [2]:
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
RESOURCES_DIR = '/non-fargen/resources'

## Load gnomAD data

In [8]:
gnomad_ht = hl.read_table(BASE_DIR + '/data/resources/gnomAD/gnomad.exomes.r2.1.1.sites.GRCh38.ht')

In [9]:
n_variants = gnomad_ht.count()
print('Number of variants: ' + str(n_variants))

Number of variants: 17204631


## Load exome target BED file

Load the SureSelect Human All Exon V6 UTR target BED file, which is used in the FarGen Phase I exome sequencing.

In [4]:
interval_ht = hl.import_bed(RESOURCES_DIR + '/sureselect_human_all_exon_v6_utr_grch38/S07604624_Padded.bed', reference_genome='GRCh38')

2021-03-26 09:56:58 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)


## Filter data

### Filter poor quality variants

Filter the variants based on the AC0 and RF filters, described on the [gnomAD webiste](https://gnomad.broadinstitute.org/faq#whats-the-difference-between-gnomad-v2-and-v3) as follows:

* AC0: The allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)
* RF (gnomAD v2 only): Failed random forest filtering thresholds of 0.055 for exome SNVs, 0.206 for exome indels, 0.263 for genome SNVs, and 0.222 for genome indels


In [10]:
gnomad_ht = gnomad_ht.filter(~gnomad_ht.filters.contains('RF') | ~gnomad_ht.filters.contains('AC0'))

In [11]:
n_variants = gnomad_ht.count()
print('Number of variants: ' + str(n_variants))

Number of variants: 16492423


### Extract exome

In [12]:
gnomad_ht = gnomad_ht.filter(hl.is_defined(interval_ht[gnomad_ht.locus]))

In [13]:
n_variants = gnomad_ht.count()
print('Number of variants: ' + str(n_variants))

2021-03-26 10:01:47 Hail: INFO: Coerced sorted dataset


Number of variants: 16320689


## Write table to file

In [14]:
if True:
    gnomad_ht.write(BASE_DIR + '/data/resources/gnomAD/gnomad.exomes.r2.1.1.sites.GRCh38.EXOME_HIGH_QUALITY.ht', overwrite=True)

2021-03-26 10:03:02 Hail: INFO: Coerced sorted dataset
2021-03-26 10:06:43 Hail: INFO: wrote table with 16320689 rows in 9997 partitions to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/resources/gnomAD/gnomad.exomes.r2.1.1.sites.GRCh38.EXOME_HIGH_QUALITY.ht
    Total size: 40.13 GiB
    * Rows: 40.13 GiB
    * Globals: 4.59 KiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  2054 rows (7.57 MiB)
