# Merge FarGen and gnomAD exome sites data

In [1]:
import hail as hl
# Start Hail with the driver memory setting below and a dedicated temporary directory.
spark_settings = {'spark.driver.memory': '10g'}
hl.init(spark_conf=spark_settings, tmp_dir='/home/olavur/tmp')

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-48ks9:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20210629-0942-0.2.61-3c86d3ba497a.log


In [2]:
# Directory with shared reference resources (the exome target BED file is read from here).
RESOURCES_DIR = '/data/other/resources'
# Root directory of this experiment; input tables are read from and the merged table is written below it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

## Load gnomAD data

In [3]:
# Location of the gnomAD exome sites Hail table.
gnomad_sites_path = BASE_DIR + '/data/resources/gnomAD/gnomad_exome_sites/gnomad.exomes.r2.1.1.sites.GRCh38.EXOME_HIGH_QUALITY.ht'
gnomad_ht = hl.read_table(gnomad_sites_path)

In [4]:
# Count the rows of the gnomAD sites table (one row per variant) and report the total.
n_variants = gnomad_ht.count()
variant_count_message = 'Number of variants: ' + str(n_variants)
print(variant_count_message)

Number of variants: 16320689


## Load FarGen exome data

Load the filtered, high-quality variants.

In [5]:
# Location of the FarGen matrix table to load.
fargen_mt_path = BASE_DIR + '/data/mt/high_quality_variants_pao_removed.mt/'
fargen_mt = hl.read_matrix_table(fargen_mt_path)

We will only need a table with the row data.

In [6]:
# Extract the row (per-variant) fields of the matrix table as a plain table.
fargen_sites_ht = fargen_mt.rows()

In [7]:
# Count the rows of the FarGen sites table (one row per variant) and report the total.
n_variants = fargen_sites_ht.count()
fargen_count_message = 'Number of variants: ' + str(n_variants)
print(fargen_count_message)

Number of variants: 1146382


## Select relevant fields

We are only interested in the allele frequencies.

From the gnomAD data, select allele frequencies in selected populations.

In [8]:
# Global lookup that maps a population label to its position in the per-variant `freq` array.
gnomad_freq_index_dict = gnomad_ht.freq_index_dict.collect()[0]

# Output field name -> gnomAD population label, listed in the order the fields appear in the table.
gnomad_population_labels = {
    'gnomad_all': 'gnomad',
    'afr': 'gnomad_afr',
    'sas': 'gnomad_sas',
    'amr': 'gnomad_amr',
    'eas': 'gnomad_eas',
    'nfe': 'gnomad_nfe',
    'fin': 'gnomad_fin',
    'nfe_nwe': 'gnomad_nfe_nwe',
    'nfe_seu': 'gnomad_nfe_seu',
}

# Keep only the allele frequency of each selected population; every other row field is discarded.
gnomad_ht = gnomad_ht.select(**{
    field_name: gnomad_ht.freq.AF[gnomad_freq_index_dict[population_label]]
    for field_name, population_label in gnomad_population_labels.items()
})

# Calling select_globals() without arguments drops all global fields.
gnomad_ht = gnomad_ht.select_globals()

gnomad_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'gnomad_all': float64 
    'afr': float64 
    'sas': float64 
    'amr': float64 
    'eas': float64 
    'nfe': float64 
    'fin': float64 
    'nfe_nwe': float64 
    'nfe_seu': float64 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


For the FarGen data, we only keep the allele frequency.

In [9]:
# Keep only the FarGen alternate allele frequency, stored as `fae`; all other row fields are dropped.
# `variant_qc.AF` holds one frequency per allele with the reference allele first, so index 0 is the
# reference allele frequency. Index 1 is the alternate allele frequency, which is the quantity that
# the gnomAD `freq.AF` values describe, so the columns are directly comparable after the join.
# NOTE(review): assumes rows are biallelic (multi-allelic sites already split) -- confirm upstream.
fargen_sites_ht = fargen_sites_ht.select(fae=fargen_sites_ht.variant_qc.AF[1])
fargen_sites_ht.describe()

----------------------------------------
Global fields:
    'pao_list': array<str> 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'fae': float64 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


## Merge all sites

Sites are matched by locus and alleles. We use an outer join so that all sites from both datasets are kept.

In [10]:
# Outer join of the two site tables on their shared key (locus, alleles), with FarGen as the left table.
merged_ht = fargen_sites_ht.join(gnomad_ht, how='outer')

2021-06-29 09:43:20 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'alleles' -> 'alleles_1'
    'locus' -> 'locus_1'


In [11]:
# Print the schema of the merged table to check which fields it carries.
merged_ht.describe()

----------------------------------------
Global fields:
    'pao_list': array<str> 
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'fae': float64 
    'gnomad_all': float64 
    'afr': float64 
    'sas': float64 
    'amr': float64 
    'eas': float64 
    'nfe': float64 
    'fin': float64 
    'nfe_nwe': float64 
    'nfe_seu': float64 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


# Filter intervals

Filter the merged table so that it only contains sites that lie within the exome targets of both datasets. In addition, remove sites in repeat regions, as annotated by RepeatMasker.

## Load exome target files

Load the SureSelect Human All Exon V6 UTR target BED file, which is used in the FarGen Phase I exome sequencing.

In [12]:
# BED file with the exome capture target intervals, imported against GRCh38.
fargen_targets_bed_path = RESOURCES_DIR + '/sureselect_human_all_exon_v6_utr_grch38/S07604624_Padded.bed'
fargen_interval_ht = hl.import_bed(fargen_targets_bed_path, reference_genome='GRCh38')

2021-06-29 09:43:23 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)


Exome calling regions used in gnomAD.

In [13]:
# Location of the Hail table with the gnomAD exome calling regions.
gnomad_calling_regions_path = BASE_DIR + '/data/resources/gnomAD/gnomad_exome_sites/exome_calling_regions.v1.GRCh38.ht'
gnomAD_interval_ht = hl.read_table(gnomad_calling_regions_path)

Load the RepeatMasker regions, which were downloaded from UCSC with default settings.

> Repeating Elements by RepeatMasker
>
> https://genome.ucsc.edu/cgi-bin/hgTrackUi?g=rmsk

In [14]:
# NOTE: this BED file contains some special contigs like "chr1_KN196472v1_fix". These are skipped, as Hail is not
# able to do anything with the table if they are included.
rmsk_bed_path = BASE_DIR + '/data/resources/repeatmasker/repeatmasker.bed'
rmsk_interval_ht = hl.import_bed(rmsk_bed_path, reference_genome='GRCh38', skip_invalid_intervals=True)

2021-06-29 09:43:24 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)
  Loading field 'f4' as type str (user-supplied)
  Loading field 'f5' as type str (not specified)


## Filter sites

Keep only sites that fall within both the FarGen and the gnomAD exome target intervals, and discard sites that lie in repeat regions.

In [15]:
# Look up each site's locus in the three interval tables and name the resulting conditions.
site_locus = merged_ht.locus
in_fargen_targets = hl.is_defined(fargen_interval_ht[site_locus])
in_gnomad_targets = hl.is_defined(gnomAD_interval_ht[site_locus])
outside_repeats = hl.is_missing(rmsk_interval_ht[site_locus])

# A site is kept only if it lies in both target sets and in no RepeatMasker interval.
merged_ht = merged_ht.filter(in_fargen_targets & in_gnomad_targets & outside_repeats)

## Write table to file

In [16]:
# Destination of the merged and filtered sites table.
merged_ht_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_filtered.ht'

# Manual switch: change the condition to False to skip writing; an existing table is overwritten.
if True:
    merged_ht.write(merged_ht_path, overwrite=True)

2021-06-29 09:43:38 Hail: INFO: Coerced sorted dataset
2021-06-29 09:43:53 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-29 09:55:22 Hail: INFO: wrote table with 15904432 rows in 10034 partitions to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/resources/gnomad_exome_sites/fargen_gnomad_union_filtered.ht
    Total size: 556.51 MiB
    * Rows: 556.51 MiB
    * Globals: 39.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  3563 rows (118.66 KiB)


In [17]:
# Location of the merged and filtered sites table on disk.
merged_ht_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_filtered.ht'

# Manual switch: reload the table from disk so that later cells work from the on-disk copy.
if True:
    merged_ht = hl.read_table(merged_ht_path)

In [18]:
# Count the rows of the merged, filtered table (one row per variant) and report the total.
n_variants = merged_ht.count()
merged_count_message = 'Number of variants: ' + str(n_variants)
print(merged_count_message)

Number of variants: 15904432
