# Merge FarGen and gnomAD exome sites data

In [1]:
import hail as hl
# Initialise Hail with a 10g Spark driver memory setting and a custom temporary directory.
spark_settings = {'spark.driver.memory': '10g'}
hl.init(
    spark_conf=spark_settings,
    tmp_dir='/home/olavur/tmp',
)

2021-10-05 13:38:12 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-10-05 13:38:13 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-6676655f87-9xllv:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20211005-1338-0.2.61-3c86d3ba497a.log


In [2]:
# Root directory of this analysis; input tables and the merged output are addressed relative to it.
BASE_DIR = "/home/olavur/experiments/2020-11-13_fargen1_exome_analysis"

# Directory with shared reference resources (the exome target BED file is read from here).
RESOURCES_DIR = "/data/other/resources"

## Load gnomAD data

In [3]:
# Location of the gnomAD exome sites Hail table.
gnomad_sites_path = BASE_DIR + '/data/resources/gnomAD/gnomad_exome_sites/gnomad.exomes.r2.1.1.sites.GRCh38.EXOME_HIGH_QUALITY.ht'
gnomad_ht = hl.read_table(gnomad_sites_path)

In [4]:
# Count the rows (one per variant) in the gnomAD sites table.
n_variants = gnomad_ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 16320689


## Load FarGen exome data

Load the filtered, high-quality variants.

In [5]:
# Location of the FarGen matrix table with the filtered, high-quality variants.
fargen_variants_path = BASE_DIR + '/data/mt/high_quality_variants.mt/'
fargen_mt = hl.read_matrix_table(fargen_variants_path)

In the gnomAD data, multi-allelic sites have been split, according to this blog post: https://macarthurlab.org/2018/10/17/gnomad-v2-1/

So we shall split multi-allelic sites in the FarGen dataset as well.

In [6]:
# Split multi-allelic sites, then recompute the variant QC, which must be
# updated after the split.
fargen_mt = hl.variant_qc(hl.split_multi_hts(fargen_mt))

We will only need a table with the row data.

In [7]:
# Keep only the row (per-variant) data of the matrix table, as a table.
fargen_sites_ht = fargen_mt.rows()

In [8]:
# Count the rows (one per variant) in the FarGen sites table.
n_variants = fargen_sites_ht.count()
print(f'Number of variants: {n_variants}')

[Stage 0:>                                                        (0 + 37) / 37]

Number of variants: 148305


## Select relevant fields

We are only interested in the allele frequencies and counts.

From the gnomAD data, select allele frequencies and counts in selected populations.

In [9]:
# Dictionary that maps each population label (e.g. 'gnomad_afr') to its index in the 'freq' array.
gnomad_freq_index_dict = gnomad_ht.freq_index_dict.collect()[0]

# Populations to keep: suffix of the output field name -> label in the gnomAD frequency index.
gnomad_populations = {
    'gnomad_all': 'gnomad',
    'afr': 'gnomad_afr',
    'sas': 'gnomad_sas',
    'amr': 'gnomad_amr',
    'eas': 'gnomad_eas',
    'nfe': 'gnomad_nfe',
    'fin': 'gnomad_fin',
    'nfe_nwe': 'gnomad_nfe_nwe',
    'nfe_seu': 'gnomad_nfe_seu',
}

# Build the flat 'info' fields. For example, gnomad_ht.info.AF_afr will give us the allele frequency
# in the African population and gnomad_ht.info.AC_afr the allele count. All AF_* fields are added
# before the AC_* fields, so the field order of the resulting struct is AF_* followed by AC_*.
gnomad_info_fields = {}
for stat_name, stat_values in [('AF', gnomad_ht.freq.AF), ('AC', gnomad_ht.freq.AC)]:
    for suffix, label in gnomad_populations.items():
        gnomad_info_fields[stat_name + '_' + suffix] = stat_values[gnomad_freq_index_dict[label]]

# Keep the key and the new 'info' struct only; all other row fields are discarded.
gnomad_ht = gnomad_ht.select(info=hl.struct(**gnomad_info_fields))

# Discard all global fields.
gnomad_ht = gnomad_ht.select_globals()

gnomad_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'info': struct {
        AF_gnomad_all: float64, 
        AF_afr: float64, 
        AF_sas: float64, 
        AF_amr: float64, 
        AF_eas: float64, 
        AF_nfe: float64, 
        AF_fin: float64, 
        AF_nfe_nwe: float64, 
        AF_nfe_seu: float64, 
        AC_gnomad_all: int32, 
        AC_afr: int32, 
        AC_sas: int32, 
        AC_amr: int32, 
        AC_eas: int32, 
        AC_nfe: int32, 
        AC_fin: int32, 
        AC_nfe_nwe: int32, 
        AC_nfe_seu: int32
    } 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


For the FarGen data, we only keep the alternate allele frequency and the alternate allele count.

In [10]:
# Keep the key and a new 'info' struct holding the alternate allele frequency and count;
# all other row fields are discarded. Index 1 of the variant QC arrays is the alternate
# allele (multi-allelic sites were split earlier, so index 0 is the reference allele and
# index 1 the single alternate allele).
fargen_sites_ht = fargen_sites_ht.select(
    info=hl.struct(
        AF_fae=fargen_sites_ht.variant_qc.AF[1],
        AC_fae=fargen_sites_ht.variant_qc.AC[1],
    )
)

# Discard all global fields.
fargen_sites_ht = fargen_sites_ht.select_globals()

fargen_sites_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'info': struct {
        AF_fae: float64, 
        AC_fae: int32
    } 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


## Merge all sites

Sites are matched by locus and allele. Use an outer join such that all sites in both datasets are kept. 

In [11]:
# Outer join on the shared key ('locus', 'alleles'): sites present in either table are kept.
# Both tables have an 'info' field, so Hail renames the right-hand (FarGen) one to 'info_1'.
merged_ht = gnomad_ht.join(fargen_sites_ht, how='outer')

2021-10-05 13:38:24 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'info' -> 'info_1'
    'alleles' -> 'alleles_1'


In [12]:
# We now have an 'info_1' field (FarGen) as well as an 'info' field (gnomAD), so we merge them.
#
# The merged struct is built from scratch instead of annotating merged_ht.info: after the
# outer join, 'info' is missing for sites that are present only in FarGen, and annotating a
# missing struct yields a missing struct, which would discard AF_fae/AC_fae for exactly
# those sites. Reading the individual fields of a missing struct simply gives missing
# values, so gnomAD fields are missing for FarGen-only sites and vice versa.
merged_info_fields = {name: value for name, value in merged_ht.info.items()}
merged_info_fields['AF_fae'] = merged_ht.info_1.AF_fae
merged_info_fields['AC_fae'] = merged_ht.info_1.AC_fae

# Keep the key and the merged 'info' struct only; this also drops 'info_1'.
merged_ht = merged_ht.select(info=hl.struct(**merged_info_fields))

In [13]:
# Print the schema of the merged table to check that 'info' holds both the gnomAD and FarGen fields.
merged_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'info': struct {
        AF_gnomad_all: float64, 
        AF_afr: float64, 
        AF_sas: float64, 
        AF_amr: float64, 
        AF_eas: float64, 
        AF_nfe: float64, 
        AF_fin: float64, 
        AF_nfe_nwe: float64, 
        AF_nfe_seu: float64, 
        AC_gnomad_all: int32, 
        AC_afr: int32, 
        AC_sas: int32, 
        AC_amr: int32, 
        AC_eas: int32, 
        AC_nfe: int32, 
        AC_fin: int32, 
        AC_nfe_nwe: int32, 
        AC_nfe_seu: int32, 
        AF_fae: float64, 
        AC_fae: int32
    } 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


# Filter intervals

Filter the merged table to keep only sites that fall within the exome targets of both datasets. In addition, remove sites in repeat regions annotated by RepeatMasker.

## Load exome target files

Load the SureSelect Human All Exon V6 UTR target BED file, which is used in the FarGen Phase I exome sequencing.

In [14]:
# Padded SureSelect target intervals, imported on the GRCh38 reference.
fargen_targets_bed = RESOURCES_DIR + '/sureselect_human_all_exon_v6_utr_grch38/S07604624_Padded.bed'
fargen_interval_ht = hl.import_bed(fargen_targets_bed, reference_genome='GRCh38')

2021-10-05 13:38:24 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)


Exome calling regions used in gnomAD.

In [15]:
# Hail table with the gnomAD exome calling regions.
gnomad_calling_regions_path = BASE_DIR + '/data/resources/gnomAD/gnomad_exome_sites/exome_calling_regions.v1.GRCh38.ht'
gnomAD_interval_ht = hl.read_table(gnomad_calling_regions_path)

Load the RepeatMasker regions, which were downloaded from UCSC with default settings.

> Repeating Elements by RepeatMasker
>
> https://genome.ucsc.edu/cgi-bin/hgTrackUi?g=rmsk

In [16]:
# NOTE: this BED file contains some special contigs like "chr1_KN196472v1_fix". These are skipped
# (skip_invalid_intervals=True), as Hail is not able to do anything with the table if they're included.
rmsk_bed = BASE_DIR + '/data/resources/repeatmasker/repeatmasker.bed'
rmsk_interval_ht = hl.import_bed(
    rmsk_bed,
    reference_genome='GRCh38',
    skip_invalid_intervals=True,
)

2021-10-05 13:38:24 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)
  Loading field 'f4' as type str (user-supplied)
  Loading field 'f5' as type str (not specified)


## Filter sites

Keep only sites that lie within both the FarGen and the gnomAD exome targets, and discard sites contained in repeat regions.

In [17]:
# A site is kept when its locus falls in an interval of both target tables
# and in no interval of the RepeatMasker table.
in_fargen_targets = hl.is_defined(fargen_interval_ht[merged_ht.locus])
in_gnomad_targets = hl.is_defined(gnomAD_interval_ht[merged_ht.locus])
outside_repeats = hl.is_missing(rmsk_interval_ht[merged_ht.locus])

merged_ht = merged_ht.filter(in_fargen_targets & in_gnomad_targets & outside_repeats)

## Write table to file

In [18]:
# Set to False to skip writing the merged table to disk.
WRITE_MERGED_TABLE = True

merged_ht_output_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_filtered.ht'
if WRITE_MERGED_TABLE:
    # An existing table at this path is overwritten.
    merged_ht.write(merged_ht_output_path, overwrite=True)

[Stage 3:>                                                          (0 + 1) / 1]2021-10-05 13:38:33 Hail: INFO: Coerced sorted dataset
    Total size: 595.68 MiB
    * Rows: 595.68 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  3545 rows (128.62 KiB)


In [19]:
# Set to False to keep working with the in-memory table instead of the one written to disk.
READ_MERGED_TABLE = True

merged_ht_input_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_filtered.ht'
if READ_MERGED_TABLE:
    merged_ht = hl.read_table(merged_ht_input_path)

In [20]:
# Count the rows (one per variant) in the merged, filtered table.
n_variants = merged_ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 15692611
