# Frequency of monogenic variants

In [1]:
import hail as hl
hl.init(spark_conf={'spark.driver.memory': '100g', 'spark.local.dir': '/home/olavur/tmp'})

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20210310-1235-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
output_notebook()

## Load gnomAD exome sites data

In [3]:
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

In [4]:
gnomad_ht = hl.read_table(BASE_DIR + '/data/resources/gnomAD/gnomad.exomes.r2.1.1.sites.GRCh38.ht')

In [5]:
n_variants = gnomad_ht.count()
print('Number of variants: ' + str(n_variants))

Number of variants: 17204631


### Filter gnomAD sites

Filter the variants based on the AC0 and RF filters, described on the [gnomAD webiste](https://gnomad.broadinstitute.org/faq#whats-the-difference-between-gnomad-v2-and-v3) as follows:

* AC0: The allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)
* RF (gnomAD v2 only): Failed random forest filtering thresholds of 0.055 for exome SNVs, 0.206 for exome indels, 0.263 for genome SNVs, and 0.222 for genome indels


In [6]:
gnomad_ht = gnomad_ht.filter(~gnomad_ht.filters.contains('RF') | ~gnomad_ht.filters.contains('AC0'))

In [7]:
n_variants = gnomad_ht.count()
print('Number of variants: ' + str(n_variants))

Number of variants: 16492423


## Load FarGen exome data

Load filtered, high-quality, variants.

In [8]:
fargen_mt = hl.read_matrix_table(BASE_DIR + '/data/mt/high_quality_variants.mt/')

In [9]:
n_variants, n_samples = fargen_mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

Number of variants: 1194405
Number of samples: 474


## Annotate FarGen data with gnomAD exome information

The `freq` field is a long list of values corresponding to combinatons of subsets/populations/sex. See here:

https://gnomad.broadinstitute.org/blog/2018-10-gnomad-v2-1/#hail-table-gets-a-new-schema

I can get the index for the total population as follows.

In [10]:
gnomad_index = gnomad_ht.freq_index_dict['gnomad'].collect()[0]

Key the gnomAD sites data by `rsid`.

In [11]:
gnomad_rsid_keyed_ht = gnomad_ht.key_by(gnomad_ht.rsid)

Annotate all variants in the FarGen data *that have an RSID* with the gnomAD sites data.

In [12]:
fargen_mt = fargen_mt.annotate_rows(gnomad=gnomad_rsid_keyed_ht[fargen_mt.rsid])

Write this annotated MatrixTable to disk.

In [14]:
if False:
    fargen_mt.write(BASE_DIR + '/data/mt/hq_gnomad_annotated.mt', overwrite=True)

2021-03-10 12:40:05 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 12:41:00 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 12:55:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 12:56:25 Hail: INFO: wrote matrix table with 1194405 rows and 474 columns in 96 partitions to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/hq_gnomad_annotated.mt
    Total size: 5.69 GiB
    * Rows/entries: 5.69 GiB
    * Columns: 49.41 KiB
    * Globals: 11.00 B
    * Smallest partition: 11386 rows (43.62 MiB)
    * Largest partition:  13123 rows (69.20 MiB)


Reading this MatrixTable from file, will make operations faster, as Hail won't have to evaluate all the expressions in the previous both the FarGen MatrixTable and the gnomAD HailTable.

In [17]:
fargen_mt = hl.read_matrix_table(BASE_DIR + '/data/mt/hq_gnomad_annotated.mt')

## Lookup monogenic variants in FarGen and gnomAD

We will lookup the RSIDs below and check the frequency in both the FarGen and gnomAD data.

In [19]:
rsid_list = ["rs113993960", "rs72552725", "rs773966912", "rs202088921", "rs199689597", "rs727504002", "rs753887925", "rs113994161", "rs113994128", "rs104894604", "rs767139201", "rs121909100", "rs116987552", "rs748615072", "rs113624356", "rs781781440", "rs5030858", "rs192831239"]

rsid_hl_set = hl.set(rsid_list)

Get the variants corresponding to these RSIDs.

In [41]:
# Lookup the RSIDs in the data.
monogenic_mt = fargen_mt.filter_rows(rsid_hl_set.contains(fargen_mt.rsid))

# Use only the row data.
monogenic_rows_ht = monogenic_mt.rows()

# Count variants.
n_variants = monogenic_rows_ht.count()
print('Found {n_variants} out of {n_rsids} variants.'.format(n_variants=n_variants, n_rsids=len(rsid_list)))

Found 8 out of 18 variants.


Remove all unnecessary information from the table.

In [31]:
# Get a table with only the frequency (in FarGen and gnomAD, respectively), the RSID as well as the locus and the alleles at that locus.
monogenic_rows_ht = monogenic_rows_ht.select(monogenic_rows_ht.rsid, fargen_freq=monogenic_rows_ht.info.AF[0],
                                             gnomad_freq=monogenic_rows_ht.gnomad.freq.AF[gnomad_index])

In [34]:
# Make a pandas table with all this information.
monogenic_rows_pd = monogenic_rows_ht.to_pandas()

In [36]:
monogenic_rows_pd

Unnamed: 0,locus.contig,locus.position,alleles,rsid,fargen_freq,gnomad_freq
0,chr1,99875394,"[C, T]",rs113994128,0.019,1.6e-05
1,chr4,56441804,"[A, G]",rs192831239,0.008439,0.000793
2,chr5,132370067,"[A, G]",rs72552725,0.049,2.4e-05
3,chr11,64759751,"[G, A]",rs116987552,0.009494,0.001452
4,chr12,102840493,"[G, A]",rs5030858,0.011,0.00076
5,chr13,47988540,"[C, T]",rs113994161,0.036,1.2e-05
6,chr17,44006584,"[G, A]",rs104894604,0.003165,
7,chr18,57669433,"[A, G]",rs121909100,0.015,9.6e-05


Write the table to a CSV.

In [43]:
monogenic_rows_pd.to_csv(BASE_DIR + '/data/results/monogenetic_variants_freq.csv')