# SnpEff effect and impact predictions

In [1]:
import hail as hl
hl.init()

Running on Apache Spark version 2.4.6
SparkUI available at http://hms-beagle-5466c684ff-d8mgh:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.58-3f304aae6ce2
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/notebooks/hail-20201123-1431-0.2.58-3f304aae6ce2.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
output_notebook()

In [3]:
import pandas as pd
from pprint import pprint

In [4]:
mt = hl.read_matrix_table('/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/variants.mt')

In [5]:
mt = hl.variant_qc(mt)

## Variant impact

In [12]:
mt = mt.annotate_rows(impact=mt.info.ANN.map(lambda x: x.split('\|')[2]))
#mt = mt.annotate_rows(high_impact=mt.impact.contains('HIGH'))
mt = mt.annotate_rows(impact1=mt.impact[0])

In [13]:
impact_count = mt.aggregate_rows(hl.agg.counter(mt.impact1))
print(impact_count)

{'HIGH': 55300, 'LOW': 67901, 'MODERATE': 63044, 'MODIFIER': 762304}


In [171]:
entries = mt.entries()
entries_grouped = (entries.group_by(entries.impact1)
              .aggregate(impact_af=hl.agg.stats(entries.variant_qc.AF[1])))

In [172]:
results_pd = entries_grouped.to_pandas()

2020-11-19 12:43:02 Hail: INFO: Ordering unsorted dataset with network shuffle


In [173]:
results_pd

Unnamed: 0,impact1,impact_af.mean,impact_af.stdev,impact_af.min,impact_af.max,impact_af.n,impact_af.sum
0,HIGH,0.027102,0.070298,0.010417,1.0,2654400,71940.7
1,LOW,0.162157,0.246684,0.010417,1.0,3259248,528508.7
2,MODERATE,0.129268,0.226841,0.010417,1.0,3026112,391179.3
3,MODIFIER,0.114829,0.216623,0.010417,1.0,36590592,4201671.0


### Site frequency spectrum

In [6]:
from pycode.sfs import ffs

In [66]:
ffs_dict = {}
impact_list = ['LOW', 'MODIFIER', 'MODERATE', 'HIGH']
for impact in impact_list:
    mt_temp = mt.filter_rows(mt.impact1 == impact)
    ht_result = ffs(mt_temp, hl)
    ht_result = ht_result.annotate(impact=hl.str(impact))
    ffs_dict[impact] = ht_result

In [67]:
ht_ffs = ffs_dict[impact_list[0]]
for i in range(1, 4):
    ht_ffs = ht_ffs.union(ffs_dict[impact_list[i]])

In [85]:
p = hl.plot.scatter(ht_ffs.ac, ht_ffs.ff, label=ht_ffs.impact,
                    xlabel='Allele counts', ylabel='Frequency in population', title='Site frequency spectrum (folded)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
show(p)

In [90]:
p = hl.plot.scatter(ht_ffs.ac, ht_ffs.ff, label=ht_ffs.impact,
                    xlabel='Allele counts', ylabel='Frequency in population (log10 scale)', title='Site frequency spectrum (folded)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
p.y_scale = LogScale()
show(p)

## Variant effect

In [92]:
mt = mt.annotate_rows(effect=mt.info.ANN.map(lambda x: x.split('\|')[1]))
#mt = mt.annotate_rows(high_impact=mt.impact.contains('HIGH'))
mt = mt.annotate_rows(effect1=mt.effect[0])

In [93]:
effect_count = mt.aggregate_rows(hl.agg.counter(mt.effect1))
pprint(effect_count)

{'3_prime_UTR_variant': 159194,
 '5_prime_UTR_premature_start_codon_gain_variant': 3195,
 '5_prime_UTR_variant': 32076,
 'bidirectional_gene_fusion': 3,
 'conservative_inframe_deletion': 978,
 'conservative_inframe_deletion&splice_region_variant': 18,
 'conservative_inframe_insertion': 1324,
 'conservative_inframe_insertion&splice_region_variant': 53,
 'disruptive_inframe_deletion': 1872,
 'disruptive_inframe_deletion&splice_region_variant': 43,
 'disruptive_inframe_insertion': 1251,
 'disruptive_inframe_insertion&splice_region_variant': 57,
 'downstream_gene_variant': 65152,
 'frameshift_variant': 45989,
 'frameshift_variant&splice_acceptor_variant&splice_region_variant&intron_variant': 129,
 'frameshift_variant&splice_donor_variant&splice_region_variant&intron_variant': 92,
 'frameshift_variant&splice_region_variant': 1477,
 'frameshift_variant&start_lost': 100,
 'frameshift_variant&start_lost&splice_donor_variant&splice_region_variant&intron_variant': 1,
 'frameshift_variant&start_l

In [98]:
ffs_dict = {}
effect_list = ['synonymous_variant', 'missense_variant', 'frameshift_variant']
for effect in effect_list:
    mt_temp = mt.filter_rows(mt.effect1.contains(effect))
    ht_result = ffs(mt_temp, hl)
    ht_result = ht_result.annotate(effect=hl.str(effect))
    ffs_dict[effect] = ht_result

In [100]:
ht_ffs = ffs_dict[effect_list[0]]
for i in range(1, 3):
    ht_ffs = ht_ffs.union(ffs_dict[effect_list[i]])

In [102]:
p = hl.plot.scatter(ht_ffs.ac, ht_ffs.ff, label=ht_ffs.effect,
                    xlabel='Allele counts', ylabel='Frequency in population', title='Site frequency spectrum (folded)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
show(p)

In [103]:
p = hl.plot.scatter(ht_ffs.ac, ht_ffs.ff, label=ht_ffs.effect,
                    xlabel='Allele counts', ylabel='Frequency in population (log10 scale)', title='Site frequency spectrum (folded)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
p.y_scale = LogScale()
show(p)

In [21]:
pd.DataFrame.from_dict({'type': effect_count.keys(), 'count': effect_count.values()})

Unnamed: 0,type,count
0,conservative_inframe_deletion&splice_region_va...,18
1,stop_lost&conservative_inframe_deletion,10
2,stop_gained&disruptive_inframe_insertion&splic...,10
3,splice_acceptor_variant&3_prime_UTR_variant&in...,1
4,stop_lost,104
...,...,...
82,transcript_ablation,1
83,start_lost&disruptive_inframe_deletion,1
84,stop_lost&disruptive_inframe_insertion,5
85,stop_gained&splice_acceptor_variant&disruptive...,1
