# ClinVar variants

Look up the variants in the FarGen dataset in ClinVar.

In [1]:
import hail as hl
# Start the Hail session: the Spark driver is given '100g' of memory and Hail's
# temporary files are written to tmp_dir.
hl.init(
    tmp_dir='/home/olavur/tmp',
    spark_conf={'spark.driver.memory': '100g'},
)

2021-11-23 10:52:05 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-11-23 10:52:06 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-68c965f6f5-qw44l:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/main/hail-20211123-1052-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh (imported above from bokeh.io) to render its plots inline in this notebook.
output_notebook()

## Read FarGen variants

In [3]:
# Root directory of this exome analysis; later cells build dataset, resource and result paths from it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Presumably a directory of shared reference resources; it is not referenced by the cells visible here.
RESOURCES_DIR = '/data/other/resources'

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the FarGen matrix table of high-quality variants from the project data directory.
fargen_mt_path = f'{BASE_DIR}/data/mt/high_quality_variants_pao_removed.mt'
mt = hl.read_matrix_table(fargen_mt_path)

## Read ClinVar

The ClinVar VCF encodes chromosome 1 as `1` rather than `chr1`, so we must change that first.

In [6]:
# Make a dict with elements "'1': 'chr1'" and so on.
contig_list = list(mt.aggregate_rows(hl.agg.collect_as_set(mt.locus.contig)))  # Correct encoding: chr1
contig_alt_list = [contig[3:] for contig in contig_list]  # Incorrect encoding: 1
contig_recoding = dict(zip(contig_alt_list, contig_list))



Read the ClinVar VCF.

In [7]:
# Location of the ClinVar VCF, built from BASE_DIR like the other project paths
# instead of repeating the absolute project directory.
clinvar_path = BASE_DIR + '/data/resources/clinvar/clinvar.vcf.gz'
# contig_recoding renames the ClinVar contigs ("1") to the FarGen names ("chr1").
# skip_invalid_loci skips records whose locus is invalid for the reference genome,
# for example those on contig "MT".
clinvar_mt = hl.import_vcf(
    clinvar_path,
    force_bgz=True,
    reference_genome='GRCh38',
    contig_recoding=contig_recoding,
    skip_invalid_loci=True,
)

Remove all rows where the `CLNSIG` field is missing.

In [8]:
# Keep only the ClinVar records that carry a clinical significance (CLNSIG) value.
has_clnsig = hl.is_defined(clinvar_mt.info.CLNSIG)
clinvar_mt = clinvar_mt.filter_rows(has_clnsig)

Annotate the variants with whether they are pathogenic or likely pathogenic.

In [9]:
def _mentions(term):
    """Return a mapper that tests whether a CLNSIG entry contains `term` as a substring."""
    return lambda significance: significance.contains(term)


# Step 1: for every entry of CLNSIG, record whether it mentions each class.
clnsig = clinvar_mt.info.CLNSIG
clinvar_mt = clinvar_mt.annotate_rows(
    _contains_pathogenic=clnsig.map(_mentions('Pathogenic')),
    _contains_likely_pathogenic=clnsig.map(_mentions('Likely_pathogenic')),
)
# Step 2: a variant belongs to a class if at least one of its CLNSIG entries mentions it.
clinvar_mt = clinvar_mt.annotate_rows(
    _is_pathogenic=clinvar_mt._contains_pathogenic.contains(True),
    _is_likely_pathogenic=clinvar_mt._contains_likely_pathogenic.contains(True),
)

Check how many pathogenic variants there are in ClinVar.

In [10]:
# Count the ClinVar rows flagged as pathogenic and as likely pathogenic in a single pass.
clinvar_class_counts = hl.struct(
    n_pathogenic=hl.agg.count_where(clinvar_mt._is_pathogenic),
    n_likely_pathogenic=hl.agg.count_where(clinvar_mt._is_likely_pathogenic),
)
clinvar_mt.aggregate_rows(clinvar_class_counts)

[Stage 1:>                                                          (0 + 1) / 1]2021-11-23 10:53:33 Hail: INFO: Coerced sorted dataset
[Stage 2:>                                                          (0 + 1) / 1]

Struct(n_pathogenic=105825, n_likely_pathogenic=54609)

Keep only variants that are either pathogenic or likely pathogenic.

In [11]:
# Restrict ClinVar to rows flagged as pathogenic, likely pathogenic, or both.
is_clinically_relevant = clinvar_mt._is_pathogenic | clinvar_mt._is_likely_pathogenic
clinvar_mt = clinvar_mt.filter_rows(is_clinically_relevant)

In [12]:
# Display the size of the filtered ClinVar table as a (rows, columns) pair.
clinvar_mt.count()

[Stage 3:>                                                          (0 + 1) / 1]2021-11-23 10:53:45 Hail: INFO: Coerced sorted dataset
[Stage 4:>                                                          (0 + 1) / 1]

(150012, 0)

## Annotate FarGen variants with ClinVar

Annotate the FarGen variants with ClinVar pathogenicity status.

In [13]:
# Look up every FarGen variant in the filtered ClinVar table by its row key,
# then copy the two pathogenicity flags onto the FarGen rows.
clinvar_match = clinvar_mt.index_rows(mt.row_key)
mt = mt.annotate_rows(
    is_pathogenic=clinvar_match._is_pathogenic,
    is_likely_pathogenic=clinvar_match._is_likely_pathogenic,
)

Count number of pathogenic or likely pathogenic variants in the FarGen dataset.

In [14]:
# Count the FarGen variants per ClinVar class, plus the number that fall in either class.
in_either_class = mt.is_pathogenic | mt.is_likely_pathogenic
fargen_class_counts = hl.struct(
    n_pathogenic=hl.agg.count_where(mt.is_pathogenic),
    n_likely_pathogenic=hl.agg.count_where(mt.is_likely_pathogenic),
    n_either=hl.agg.count_where(in_either_class),
)
mt.aggregate_rows(fargen_class_counts)

[Stage 5:>                                                          (0 + 1) / 1]2021-11-23 10:53:56 Hail: INFO: Coerced sorted dataset
[Stage 6:>                                                          (0 + 1) / 1]2021-11-23 10:54:01 Hail: INFO: Coerced sorted dataset

Struct(n_pathogenic=18, n_likely_pathogenic=10, n_either=22)

Keep only variants that are either pathogenic or likely pathogenic.

In [15]:
# Drop every FarGen variant that is neither pathogenic nor likely pathogenic in ClinVar.
is_reported_variant = mt.is_pathogenic | mt.is_likely_pathogenic
mt = mt.filter_rows(is_reported_variant)

Write the resulting matrix table to the temp folder.

In [16]:
# Destination of the filtered FarGen/ClinVar matrix table inside the temp folder.
mt_path = '/home/olavur/tmp/fargen_clinvar.mt'
# Checkpoint the table at mt_path, replacing any table already stored there (overwrite=True),
# and continue working with the checkpointed result.
mt = mt.checkpoint(mt_path, overwrite=True)

[Stage 8:>                                                          (0 + 1) / 1]2021-11-23 10:54:27 Hail: INFO: Coerced sorted dataset
[Stage 9:>                                                          (0 + 1) / 1]2021-11-23 10:54:32 Hail: INFO: Coerced sorted dataset
[Stage 10:>                                                         (0 + 1) / 1]2021-11-23 10:54:37 Hail: INFO: Coerced sorted dataset
    Total size: 182.37 KiB
    * Rows/entries: 130.49 KiB
    * Columns: 51.85 KiB
    * Globals: 39.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  3 rows (18.85 KiB)


In [114]:
# Reload the checkpointed table from mt_path. This resets `mt` to the state saved by the
# checkpoint cell, discarding row annotations added by later cells.
mt = hl.read_matrix_table(mt_path)

In [108]:
# Report how many FarGen variants remain after the ClinVar filter.
n_variants = mt.count_rows()
print(f'Number of pathogenic or likely pathogenic variants: {n_variants}')

Number of pathogenic or likely pathogenic variants: 22


## Geisinger 76 genes

Geisinger 76 (G76) is a list of 76 "clinically actionable" genes. The list is included in the supplement of Dewey et al. 2016 (citation below), which is available at the link below.
https://www.science.org/doi/10.1126/science.aaf6814

Dewey FE, Murray MF, Overton JD, Habegger L, Leader JB, Fetterolf SN, et al. Distribution and clinical impact of functional variants in 50,726 whole-exome sequences from the DiscovEHR study. Science. 2016;354(6319).

We are going to find which of the pathogenic variants are present in the G76 genes.

First, we read the gene list.

In [19]:
# Location of the G76 gene list, built from BASE_DIR like the other project paths
# instead of repeating the absolute project directory.
g76_path = BASE_DIR + '/data/resources/geisinger76/geisinger76.list'
# Read the list line by line (presumably one gene symbol per line), stripping surrounding whitespace.
# Blank lines are skipped so that an empty string never ends up in the gene list,
# where it could match an empty gene-name field during the membership test.
with open(g76_path) as fid:
    g76_list = [line.strip() for line in fid if line.strip()]

In [20]:
# Make a global variable with the G76 genes.
g76_mt = mt.annotate_globals(g76_list = g76_list)
# Extract the gene names from the ANN annotations: each ANN entry is a '|'-separated string
# and the gene name is its fourth field (index 3).
# The delimiter is written as a raw string: it has the same value as the previous '\|'
# (a backslash followed by a pipe) but is not an invalid Python escape sequence.
g76_mt = g76_mt.annotate_rows(_gene = g76_mt.info.ANN.map(lambda s: s.split(r'\|')[3]))
# For each variant, check whether the genes in that location match any of the G76 genes.
# The result holds one boolean per ANN entry.
g76_mt = g76_mt.annotate_rows(gene_in_g76 = g76_mt._gene.map(lambda g: g76_mt.g76_list.contains(g)))

Count the number of pathogenic or likely pathogenic variants that lie in G76 genes.

In [23]:
# A variant counts as a G76 variant when at least one of its annotated genes is in the G76 list.
hits_g76_gene = g76_mt.gene_in_g76.contains(True)
g76_mt.aggregate_rows(hl.agg.count_where(hits_g76_gene))

0

## Annotate with gnomAD frequency and RSID

Read high quality exome sites data.

In [69]:
# Load the gnomAD exome sites table from the project resources.
gnomad_ht_path = f'{BASE_DIR}/data/resources/gnomAD/gnomad_exome_sites/gnomad.exomes.r2.1.1.sites.GRCh38.EXOME_HIGH_QUALITY.ht'
gnomad_ht = hl.read_table(gnomad_ht_path)

Annotate the FarGen variants with allele frequency and the RSID from gnomAD.

In [115]:
# freq_index_dict maps a population label (e.g. 'gnomad', 'gnomad_nfe_nwe') to its position
# in the per-variant `freq` array; bring it into Python so it can be used for indexing.
freq_index_dict = gnomad_ht.freq_index_dict.collect()[0]
# Look up each FarGen variant in gnomAD once and take all three annotations from that match.
gnomad_match = gnomad_ht.index(mt.row_key)
mt = mt.annotate_rows(
    gnomad_AF=gnomad_match.freq.AF[freq_index_dict['gnomad']],
    gnomad_nfe_nwe_AF=gnomad_match.freq.AF[freq_index_dict['gnomad_nfe_nwe']],
    gnomad_rsid=gnomad_match.rsid,
)

## Genes with pathogenic/likely pathogenic variants

Below is a table with various information about the FarGen variants that were found to be either pathogenic or likely pathogenic in ClinVar.

In [116]:
# Extract the variant effect and gene name from the ANN annotation: each ANN entry is a
# '|'-separated string with the effect in its second field (index 1) and the gene name in
# its fourth field (index 3).
# The delimiter is written as a raw string: it has the same value as the previous '\|'
# (a backslash followed by a pipe) but is not an invalid Python escape sequence.
mt = mt.annotate_rows(_effect = mt.info.ANN.map(lambda s: s.split(r'\|')[1]),
                      _gene = mt.info.ANN.map(lambda s: s.split(r'\|')[3]))
# Convert both to sets so that each distinct effect and gene is listed once per variant.
mt = mt.annotate_rows(_gene = hl.set(mt._gene), _effect = hl.set(mt._effect))

In [122]:
# Build a row table with flat, report-friendly columns for every remaining variant.
rows_ht = mt.rows()
separator = hl.str(', ')
rows_ht = rows_ht.annotate(
    Chrom=rows_ht.locus.contig,
    Position=rows_ht.locus.position,
    Alleles=separator.join(rows_ht.alleles),
    RSID=rows_ht.gnomad_rsid,
    Gene=separator.join(rows_ht._gene),
    Effect=separator.join(rows_ht._effect),
    # Index 1 of variant_qc.AF is taken as the FarGen frequency.
    FarGen_frequency=rows_ht.variant_qc.AF[1],
    gnomAD_NFE_NWE_frequency=rows_ht.gnomad_nfe_nwe_AF,
    gnomAD_frequency=rows_ht.gnomad_AF,
)
rows_pd = rows_ht.to_pandas()

# Columns of the final report, in display order. 'Effect' is annotated on rows_ht but is
# not part of this selection.
report_columns = [
    'Chrom',
    'Position',
    'RSID',
    'Alleles',
    'Gene',
    'FarGen_frequency',
    'gnomAD_frequency',
    'gnomAD_NFE_NWE_frequency',
]
results_pd = rows_pd[report_columns]



In [123]:
def _format_four_significant(value):
    """Render a float with up to four significant digits (printf-style %g)."""
    return '%0.4g' % value


# Use the compact four-significant-digit format for every float pandas displays.
pd.set_option('display.float_format', _format_four_significant)

In [124]:
# Display the summary table of pathogenic / likely pathogenic FarGen variants.
results_pd

Unnamed: 0,Chrom,Position,RSID,Alleles,Gene,FarGen_frequency,gnomAD_frequency,gnomAD_NFE_NWE_frequency
0,chr1,17270928,rs144080386,"C, T",PADI3,0.003226,0.006775,0.009031
1,chr1,145919695,,"T, C","GNRHR2, PEX11B, RBM8A",0.008602,,
2,chr1,207761338,rs202071781,"T, G",CD46,0.003226,1.591e-05,0.0
3,chr2,43918025,rs769022521,"C, CT",LRPPRC,0.01892,4.105e-05,4.816e-05
4,chr3,165830741,rs1799807,"T, C",BCHE,0.03441,0.01213,0.01977
5,chr3,183037285,rs727504002,"GC, G",MCCC1,0.02439,1.193e-05,0.0
6,chr5,41149933,rs76202909,"A, G",C6,0.005388,0.002193,0.004144
7,chr5,132370067,rs72552725,"A, G","LOC553103, MIR3936, SLC22A5",0.04516,2.413e-05,0.0
8,chr5,150848436,rs10065172,"C, T",IRGM,0.07004,0.165,0.08376
9,chr6,35457937,rs749898067,"G, GC",FANCE,0.05111,8.759e-05,0.0001898


In [125]:
# Save the summary table as CSV in the project results directory.
# The DataFrame index is written too, as the first (unnamed) column.
results_csv_path = f'{BASE_DIR}/data/results/pathogenic_clinvar_variants.csv'
results_pd.to_csv(results_csv_path)