# Annotate the FarGen/gnomAD data with SnpEff

In [1]:
import hail as hl
# Start Hail with the Spark driver memory set to 10g and Hail's temporary
# files directed to a user-local scratch directory.
hl.init(spark_conf={'spark.driver.memory': '10g'}, tmp_dir='/home/olavur/tmp')

2021-10-07 12:15:02 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-10-07 12:15:03 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-6676655f87-9xllv:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20211007-1215-0.2.61-3c86d3ba497a.log


In [2]:
# Root of the experiment; the Hail tables read and written below live under it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Shared resources directory.
# NOTE(review): not referenced by any cell shown in this notebook; the SnpEff
# cell spells out its own path under /data/other/resources instead.
RESOURCES_DIR = '/data/other/resources'

## Read merged Hail table

In [3]:
# Load the merged, filtered FarGen/gnomAD table from the experiment directory.
ht = hl.read_table(f'{BASE_DIR}/data/resources/gnomad_exome_sites/fargen_gnomad_union_filtered.ht')

In [4]:
# Row count of the merged table, reported as the number of variants.
n_variants = ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 15692611


## Write Hail table to VCF

In [5]:
# Metadata for the INFO fields in the VCF header.
# Every field needs a non-blank description, otherwise SnpEff won't run.
# Fields are named <prefix>_<population>: all AF_* entries come first, then
# all AC_* entries, and each one is described by its population label.
metadata = {
    'info': {
        f'{prefix}_{population}': {'Description': population}
        for prefix in ('AF', 'AC')
        for population in (
            'fae', 'gnomad_all', 'afr', 'sas', 'amr',
            'eas', 'nfe', 'fin', 'nfe_nwe', 'nfe_seu',
        )
    }
}

In [6]:
# Export the table to a .vcf.bgz file in the scratch directory; this file is
# the input of the SnpEff cell below. `metadata` supplies the INFO header
# descriptions that SnpEff needs.
hl.export_vcf(ht, '/home/olavur/tmp/fargen_gnomad_union_filtered.vcf.bgz', metadata=metadata)

2021-10-05 13:53:30 Hail: INFO: while writing:
    /home/olavur/tmp/fargen_gnomad_union_filtered.vcf.bgz
  merge time: 14.349s


## Annotate data using SnpEff

In [None]:
%%bash --out snpeff_out --err snpeff_err

# Annotate the exported VCF with SnpEff using the hg38 database and redirect
# the annotated VCF from stdout into a file in the scratch directory.
# Flag summary (confirm exact semantics against the SnpEff manual):
#   -Xmx10g       Java heap limit of 10 GB
#   -i / -o vcf   input and output formats are VCF
#   -nodownload   do not attempt to download the database
#   -dataDir      directory holding the SnpEff databases
#   -noStats      skip the summary statistics report
#   -v            verbose logging
snpEff -Xmx10g \
     -i vcf \
     -o vcf \
     -nodownload \
     -dataDir /data/other/resources/snpeff_data \
     hg38 \
     -noStats \
     -v /home/olavur/tmp/fargen_gnomad_union_filtered.vcf.bgz > /home/olavur/tmp/fargen_gnomad_union_annotated.vcf

## Convert VCF to Hail table

In [None]:
# Load the SnpEff-annotated VCF back into Hail, using the GRCh38 reference.
ann_mt = hl.import_vcf('/home/olavur/tmp/fargen_gnomad_union_annotated.vcf', reference_genome='GRCh38')

In [None]:
# Keep only the rows table of the imported data.
ann_ht = ann_mt.rows()

In [None]:
# Persist the annotated table under the experiment directory, replacing any
# existing table at that path (overwrite=True).
# NOTE(review): `if True:` looks like a manual on/off switch, presumably set to
# False to skip the rewrite when re-running the notebook — confirm.
if True:
    ann_ht.write(BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_annotated.ht', overwrite=True)

In [3]:
# Reload the annotated table from disk so later cells work from the saved copy.
ann_ht = hl.read_table(f'{BASE_DIR}/data/resources/gnomad_exome_sites/fargen_gnomad_union_annotated.ht')

In [4]:
# Row count of the annotated table, reported as the number of variants.
n_variants = ann_ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 15692611
