In [2]:
import hail as hl
# Start Hail: Spark driver memory is set to '10g' and temporary files go to
# the directory given by `tmp_dir`.
hl.init(
    spark_conf={'spark.driver.memory': '10g'},
    tmp_dir='/home/olavur/tmp',
)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-48ks9:4042
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20210628-1248-0.2.61-3c86d3ba497a.log


In [3]:
# Root directory of this analysis; the Hail table paths used below are built
# by appending to it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Directory holding shared resources.
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
RESOURCES_DIR = '/data/other/resources'

## Read merged Hail table

In [45]:
# Load the merged (FarGen + gnomAD union, filtered) Hail table from disk.
filtered_ht_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_filtered.ht'
ht = hl.read_table(filtered_ht_path)

In [40]:
# Count the rows of the merged table (one row per variant) and report the total.
n_variants = ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 15904432


## Write Hail table to VCF

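`hl.export_vcf()` does not export every row field of the table: top-level row fields such as `fae` or `gnomad_all` are ignored (Hail prints a warning listing them during export). The VCF INFO column is generated from the `info` row field, so we copy every field we need into an `info` struct before exporting.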

In [46]:
# Row fields that have to survive the VCF export. They are copied into the
# `info` struct in exactly this order.
info_field_names = [
    'fae', 'gnomad_all', 'afr', 'sas', 'amr',
    'eas', 'nfe', 'fin', 'nfe_nwe', 'nfe_seu',
]

# Build `info` in a single step. Any `info` field already on the table is
# replaced, so the struct contains only the fields listed above.
ht = ht.annotate(info=hl.struct(**{name: ht[name] for name in info_field_names}))

In [62]:
# Metadata for the VCF header: one INFO entry per exported field.
# Each description simply repeats the field name, because SnpEff won't run
# if the description is blank.
info_header_fields = [
    'fae', 'gnomad_all', 'afr', 'sas', 'amr',
    'eas', 'nfe', 'fin', 'nfe_nwe', 'nfe_seu',
]
metadata = {
    'info': {name: {'Description': name} for name in info_header_fields}
}

In [64]:
# Export the table as a block-compressed VCF (`.bgz`), passing `metadata` so
# the INFO fields get descriptions in the VCF header.
vcf_out_path = '/home/olavur/tmp/fargen_gnomad_union_filtered.vcf.bgz'
hl.export_vcf(ht, vcf_out_path, metadata=metadata)

2021-06-28 13:38:42 Hail: WARN: export_vcf: ignored the following fields:
    'fae' (row)
    'gnomad_all' (row)
    'afr' (row)
    'sas' (row)
    'amr' (row)
    'eas' (row)
    'nfe' (row)
    'fin' (row)
    'nfe_nwe' (row)
    'nfe_seu' (row)
2021-06-28 13:39:57 Hail: INFO: merging 10034 files totalling 297.1K...
2021-06-28 13:40:08 Hail: INFO: while writing:
    /home/olavur/tmp/fargen_gnomad_union_filtered.vcf.bgz
  merge time: 11.018s


## Annotate data using SnpEff

In [68]:
%%bash --out snpeff_out --err snpeff_err

# Annotate the exported VCF with SnpEff using the hg38 database and write the
# annotated VCF (uncompressed) to /home/olavur/tmp.
#
# The cell magic stores the script's stdout/stderr in the Python variables
# `snpeff_out` / `snpeff_err`. stdout is redirected to the output file below,
# so presumably only `snpeff_err` holds anything useful (SnpEff's log).
#
# Options, as described in the SnpEff command-line help:
#   -Xmx10g      JVM maximum heap size (10g)
#   -i / -o vcf  input and output format are both VCF
#   -nodownload  do not download the database if it is missing locally
#   -dataDir     location of the local SnpEff databases
#   hg38         genome version to annotate against
#   -noStats     do not create the summary/statistics file
#   -v           verbose mode; the path that follows it is the input file,
#                not an argument to -v
snpEff -Xmx10g \
     -i vcf \
     -o vcf \
     -nodownload \
     -dataDir /data/other/resources/snpeff_data \
     hg38 \
     -noStats \
     -v /home/olavur/tmp/fargen_gnomad_union_filtered.vcf.bgz > /home/olavur/tmp/fargen_gnomad_union_annotated.vcf

## Convert VCF to Hail table

In [89]:
# Import the SnpEff-annotated VCF as a MatrixTable on the GRCh38 reference.
annotated_vcf_path = '/home/olavur/tmp/fargen_gnomad_union_annotated.vcf'
ann_mt = hl.import_vcf(annotated_vcf_path, reference_genome='GRCh38')

In [90]:
# Keep only the rows table (one row per variant) of the imported MatrixTable.
ann_ht = ann_mt.rows()

In [86]:
# Manual switch for persisting the annotated table; it is currently always on.
write_annotated_ht = True
annotated_ht_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_annotated.ht'
if write_annotated_ht:
    # overwrite=True replaces any table already stored at this path.
    ann_ht.write(annotated_ht_path, overwrite=True)

2021-06-28 13:55:52 Hail: INFO: Coerced sorted dataset
2021-06-28 13:55:52 Hail: INFO: wrote table with 15 rows in 1 partition to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/resources/gnomad_exome_sites/fargen_gnomad_union_annotated.ht
    Total size: 2.34 KiB
    * Rows: 2.33 KiB
    * Globals: 11.00 B
    * Smallest partition: 15 rows (2.33 KiB)
    * Largest partition:  15 rows (2.33 KiB)


In [87]:
# Reload the annotated table from its on-disk copy.
annotated_ht_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_annotated.ht'
ann_ht = hl.read_table(annotated_ht_path)

In [88]:
# Count the rows of the annotated table and report the total.
# NOTE(review): the recorded outputs in this notebook show 15904432 variants
# for the input table but only 15 for the annotated one — confirm the export
# and annotation were run on the full table.
n_variants = ann_ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 15
