# GnomAD v2.1.1 exome sites liftover

In [1]:
import hail as hl
# Spark settings for the Hail session: driver memory and the local scratch directory.
spark_settings = {
    'spark.driver.memory': '100g',
    'spark.local.dir': '/home/olavur/tmp',
}
hl.init(spark_conf=spark_settings)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20210310-1217-0.2.61-3c86d3ba497a.log


## Load gnomAD exome sites data

In [3]:
# Root directory of the experiment; the data/resource paths used below are built from it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

In [4]:
# Location of the gnomAD v2.1.1 exome sites Hail Table (the liftover input).
gnomad_sites_path = BASE_DIR + '/data/resources/gnomAD/gnomad.exomes.r2.1.1.sites.ht'
gnomad_ht = hl.read_table(gnomad_sites_path)

In [5]:
# Count the rows of the sites table and report the number before liftover.
n_variants = gnomad_ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 17209972


## Liftover

Convert the dataset from reference genome GRCh37 to GRCh38.

In [6]:
# Register the GRCh37 -> GRCh38 chain file on the GRCh37 reference so that
# hl.liftover can map GRCh37 loci to GRCh38.
rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
# Hail rejects registering a second chain file for the same reference pair, so only
# add it when it is not already present; this keeps the cell safe to re-run in a live kernel.
if not rg37.has_liftover(rg38):
    rg37.add_liftover(BASE_DIR + '/data/resources/liftover/grch37_to_grch38.over.chain.gz', rg38)

In [7]:
# Compute the GRCh38 locus for every site and store it in a temporary field.
lifted_locus = hl.liftover(gnomad_ht.locus, 'GRCh38')
gnomad_ht = gnomad_ht.annotate(new_locus=lifted_locus)
# NOTE(review): liftover is called without include_strand, so strand is not checked and the
# alleles are carried over unchanged -- confirm this is acceptable for downstream joins.

# Keep only the sites whose lifted-over locus is defined.
has_new_locus = hl.is_defined(gnomad_ht.new_locus)
gnomad_ht = gnomad_ht.filter(has_new_locus)

# Re-key the table by the GRCh38 locus together with the alleles, then discard the
# temporary field.
# NOTE: the FarGen exome dataset is keyed by both the locus and the alleles, and it is
# important that this gnomAD table is keyed by the same fields.
gnomad_ht = gnomad_ht.key_by(
    locus=gnomad_ht.new_locus,
    alleles=gnomad_ht.alleles,
).drop('new_locus')

## Write HailTable to disk

In [8]:
# Toggle for the write step; set to False to skip re-writing the table when re-running the notebook.
write_lifted_table = True
# Destination of the GRCh38-keyed sites table; an existing table at this path is overwritten.
lifted_sites_path = BASE_DIR + '/data/resources/gnomAD/gnomad.exomes.r2.1.1.sites.GRCh38.ht'
if write_lifted_table:
    gnomad_ht.write(lifted_sites_path, overwrite=True)

2021-03-10 12:21:13 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 12:27:23 Hail: INFO: wrote table with 17204631 rows in 9997 partitions to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/resources/gnomad/gnomad.exomes.r2.1.1.sites.GRCh38.ht
    Total size: 41.89 GiB
    * Rows: 41.89 GiB
    * Globals: 4.59 KiB
    * Smallest partition: 513 rows (1.01 MiB)
    * Largest partition:  3481 rows (8.23 MiB)
