# Variant distributions

In [1]:
import hail as hl
# Initialise Hail, requesting 100g of Spark driver memory and using /home/olavur/tmp for temporary files.
# NOTE(review): both values are specific to one machine/user — adjust them before running elsewhere.
hl.init(spark_conf={'spark.driver.memory': '100g'}, tmp_dir='/home/olavur/tmp')

2021-10-13 09:39:43 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-10-13 09:39:44 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-6676655f87-9xllv:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/main/hail-20211013-0939-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh to render its plots inline in the notebook.
output_notebook()

## Read data

In [3]:
# Root directory of this experiment; the input matrix table is read from below it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Directory of shared resources. NOTE(review): not referenced by any cell visible in this notebook.
RESOURCES_DIR = '/data/other/resources'

In [4]:
import pandas as pd
import numpy as np

In [7]:
# Load the matrix table that every later cell analyses.
# NOTE(review): judging by the file name this is presumably the output of upstream quality
# filtering — confirm which notebook writes it.
mt = hl.read_matrix_table(BASE_DIR + '/data/mt/high_quality_variants_pao_removed.mt')

In [14]:
# Report the number of columns of the matrix table as the sample count.
n_samples = mt.count_cols()
print(f'Number of samples: {n_samples}')

Number of samples: 465


## Gender distribution

Impute the sex of the samples, as done in the `step3_check_sex.ipynb` notebook.

In [28]:
# Impute sex from the genotype calls. The female and male thresholds are both set to 0.4,
# so the two cut-offs coincide.
# NOTE(review): presumably this means no sample is left with an undetermined sex (an F value
# between the two thresholds) — confirm against the hl.impute_sex documentation.
imputed_sex_ht = hl.impute_sex(mt.GT, female_threshold=0.4, male_threshold=0.4)

In [29]:
# Tally the samples by their imputed is_female value.
imputed_sex_ht.aggregate(hl.agg.counter(imputed_sex_ht.is_female))

{False: 190, True: 275}

Summary of $F$ for **females**:

In [40]:
# Keep only the samples imputed as female, then summarise their F statistic.
females_ht = imputed_sex_ht.filter(imputed_sex_ht.is_female)
females_ht.f_stat.summarize()

0,1
Non-missing,275 (100.00%)
Missing,0
Minimum,-0.44
Maximum,0.15
Mean,-0.15
Std Dev,0.11


In [54]:
# Mean and standard deviation of F for females, copied by hand from the summary table above.
m = -0.15
stdv = 0.11
# Mean ± 2 standard deviations, rounded to the two decimals of the inputs. The recorded output
# of this cell labels this range a "95% confidence interval"; the label is kept so that a
# re-run reproduces that output. Strictly it is the range expected to hold roughly 95% of the
# values if F is normally distributed.
interval = [round(m - 2 * stdv, 2), round(m + 2 * stdv, 2)]
print('95% confidence interval:')
print(interval)
print('2 standard deviations: ' + str(2 * stdv))

95% confidence interval:
[-0.37, 0.07]
2 standard deviations: 0.22


Summary of $F$ for **males**:

In [50]:
# Keep only the samples not imputed as female, then summarise their F statistic.
# NOTE(review): summarize() renders the table itself, so male_f_sum probably ends up as None;
# it is not used by any cell visible here — confirm before relying on it.
males_ht = imputed_sex_ht.filter(~imputed_sex_ht.is_female)
male_f_sum = males_ht.f_stat.summarize()

0,1
Non-missing,190 (100.00%)
Missing,0
Minimum,0.78
Maximum,0.98
Mean,0.91
Std Dev,0.03


In [55]:
# Mean and standard deviation of F for males, copied by hand from the summary table above.
# Only the standard deviation is used below; the mean is kept for reference.
m, stdv = 0.91, 0.03
print('2 standard deviations: ' + str(stdv * 2))

2 standard deviations: 0.06


## Count variants

Annotate variants with variant effect, impact and gene. These annotations are obtained from SnpEff.

If multiple transcripts overlap a site, the first in the list is arbitrarily chosen.

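**NOTE:** Is it possible to count overlapping transcripts separately? [split_multi()](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.split_multi) splits multi-allelic sites rather than annotation lists, so exploding the `ANN` array (e.g. with `explode_rows`) may be a better fit.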

In [8]:
# Get variant effect, impact and gene name from the SnpEff ANN strings.
# Every ANN entry is a '|'-delimited record; fields 1, 2 and 3 are read as effect, impact and
# gene, giving one list element per entry.
# split() takes a regular expression, so the pipe has to be escaped. The pattern is written as
# a raw string because '\|' in a normal string literal is an invalid Python escape sequence
# (it triggers a DeprecationWarning / SyntaxWarning); the resulting pattern is unchanged.
mt = mt.annotate_rows(effect=mt.info.ANN.map(lambda ann: ann.split(r'\|')[1]),
                      impact=mt.info.ANN.map(lambda ann: ann.split(r'\|')[2]),
                      gene=mt.info.ANN.map(lambda ann: ann.split(r'\|')[3]))
# Use only the first entry of each list.
mt = mt.annotate_rows(impact1=mt.impact[0], effect1=mt.effect[0], gene1=mt.gene[0])

In [9]:
def variant_counts(ht):
    """Summarise the number of variants and the SNP / indel fractions.

    Only the first alternate allele (``alleles[1]``) of each row is classified.

    Returns a one-column pandas DataFrame (column name ``''``) indexed by
    ``n_variants``, ``snps`` and ``indels``. ``n_variants`` is the row count;
    ``snps`` and ``indels`` are fractions of that count, not raw counts.
    """
    ref, alt = ht.alleles[0], ht.alleles[1]
    total = hl.agg.count()
    summary = ht.aggregate_rows(hl.struct(
        n_variants=total,
        snps=hl.agg.count_where(hl.is_snp(ref, alt)) / total,
        indels=hl.agg.count_where(hl.is_indel(ref, alt)) / total))

    return pd.DataFrame(summary.values(), index=summary.keys(), columns=[''])

def impact_counts(ht):
    """Summarise the fraction of rows per impact category.

    The category is read from the ``impact1`` row field. Returns a one-column
    pandas DataFrame (column name ``''``) indexed by ``LOW``, ``MODIFIER``,
    ``MODERATE`` and ``HIGH``; every value is the fraction of all rows whose
    ``impact1`` equals that category.
    """
    levels = ('LOW', 'MODIFIER', 'MODERATE', 'HIGH')
    fractions = {
        level: hl.agg.count_where(ht.impact1 == level) / hl.agg.count()
        for level in levels
    }
    summary = ht.aggregate_rows(hl.struct(**fractions))

    return pd.DataFrame(summary.values(), index=summary.keys(), columns=[''])

def effect_counts(ht):
    """Summarise the fraction of rows for a selection of effect types.

    The effect is read from the ``effect1`` row field. Only four effect types
    are reported; there are many more. Returns a one-column pandas DataFrame
    (column name ``''``) indexed by the effect names, where every value is the
    fraction of all rows whose ``effect1`` equals that effect.
    """
    effects_of_interest = [
        'synonymous_variant',
        'missense_variant',
        'frameshift_variant',
        'stop_gained',
    ]
    fractions = {}
    for effect in effects_of_interest:
        fractions[effect] = hl.agg.count_where(ht.effect1 == effect) / hl.agg.count()
    summary = ht.aggregate_rows(hl.struct(**fractions))

    return pd.DataFrame(summary.values(), index=summary.keys(), columns=[''])

In [10]:
# Build the three one-column summary tables from the annotated matrix table.
# Each call runs its own aggregation over the rows.
variant_counts_pd = variant_counts(mt)
effect_counts_pd = effect_counts(mt)
impact_counts_pd = impact_counts(mt)



In [11]:
# Stack the variant, effect and impact summaries into a single table, in that row order.
counts_pd = pd.concat([variant_counts_pd, effect_counts_pd, impact_counts_pd])

In [12]:
def _format_float(value):
    """Render a float using '%g' formatting."""
    return '%g' % value


# Show floats in pandas output through the compact '%g' formatter.
pd.set_option('display.float_format', _format_float)

In [13]:
# Display the combined summary table.
counts_pd

Unnamed: 0,Unnamed: 1
n_variants,148267.0
snps,0.841178
indels,0.158822
synonymous_variant,0.0950515
missense_variant,0.0894467
frameshift_variant,0.00234037
stop_gained,0.000917264
LOW,0.130805
MODIFIER,0.763103
MODERATE,0.0974458


## Sequencing depth

In [20]:
# Summary statistics of the DP entry field over all entries of the matrix table.
# NOTE(review): DP is presumably the per-sample read depth from the VCF — confirm.
mt.DP.summarize()



0,1
Non-missing,59788417 (100.00%)
Missing,0
Minimum,0
Maximum,1100
Mean,38.30
Std Dev,30.04


## Allele balance

In [53]:
# Keep only the heterozygous calls, then summarise their AB entry field.
het_calls_mt = mt.filter_entries(mt.GT.is_het())
het_calls_mt.AB.summarize()



0,1
Non-missing,15357457 (100.00%)
Missing,0
Minimum,0.25
Maximum,0.75
Mean,0.49
Std Dev,0.10


## Heterozygosity/homozygosity rate 

In [30]:
# Mean of the is_het() indicator over the entries, i.e. the share of calls that are heterozygous.
is_het_call = mt.GT.is_het()
mt.aggregate_entries(hl.agg.mean(is_het_call))



0.2568634155341494

## Transition/transversion rate 

In [36]:
# Ratio of transitions to transversions, counted per entry (genotype call).
# The two alleles compared are the ones named by the call itself (alleles[GT[0]] and alleles[GT[1]]).
# NOTE(review): for a homozygous call both sides are the same allele, so such calls presumably
# count as neither transition nor transversion and the ratio is effectively over heterozygous
# calls — confirm that this, rather than a per-variant ref-vs-alt ratio, is what is intended.
first_called_allele = mt.alleles[mt.GT[0]]
second_called_allele = mt.alleles[mt.GT[1]]
n_transitions = hl.agg.count_where(hl.is_transition(first_called_allele, second_called_allele))
n_transversions = hl.agg.count_where(hl.is_transversion(first_called_allele, second_called_allele))
mt.aggregate_entries(n_transitions / n_transversions)



2.52614426612854