In [1]:
import hail as hl
# Start Hail with 10g of Spark driver memory and a user-local temporary directory.
spark_settings = {'spark.driver.memory': '10g'}
hl.init(spark_conf=spark_settings, tmp_dir='/home/olavur/tmp')

2021-09-20 13:23:15 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-09-20 13:23:16 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-8tkk6:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210920-1323-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [3]:
import pandas as pd

In [4]:
# Root directory of this experiment; the data paths read by later cells are built from it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

## Load FarGen exome data

Load the filtered, high-quality variants.

In [61]:
# Matrix table holding the filtered, high-quality variant calls.
high_quality_mt_path = BASE_DIR + '/data/mt/high_quality_variants.mt/'
mt = hl.read_matrix_table(high_quality_mt_path)

In [62]:
# Dataset dimensions: count() gives the variant (row) and sample (column) totals.
n_variants, n_samples = mt.count()
print('Number of variants: ', n_variants, sep='')
print('Number of samples: ', n_samples, sep='')

Number of variants: 1146382
Number of samples: 468


## Load pedigree

In [29]:
# Family structure (trios) is read from a .fam file.
trios_fam_path = BASE_DIR + '/data/genealogy/trios.fam'
ped = hl.Pedigree.read(trios_fam_path)

The pedigree contains samples not in the dataset. Remove these.

In [49]:
# Collect the sample IDs of the dataset and keep only the pedigree
# entries that refer to those samples.
sample_list = mt.s.collect()
ped = ped.filter_to(sample_list)

# List the trios that remain complete after filtering and report how many there are.
trios = ped.complete_trios()
n_trios = len(trios)
print('Found {n} trios.'.format(n=n_trios))

Found 7 trios.


## Calculate Mendel errors

First remove multi-allelic sites, as the `mendel_errors` function only works for biallelic sites.

In [63]:
# A site with exactly two alleles (reference + one alternate) is biallelic.
is_biallelic = hl.len(mt.alleles) == 2
mt = mt.filter_rows(is_biallelic)

In [64]:
# Re-count after dropping multi-allelic sites; only the variant total should change.
n_variants, n_samples = mt.count()
print(
    'Number of variants: ' + str(n_variants),
    'Number of samples: ' + str(n_samples),
    sep='\n',
)

Number of variants: 1025079
Number of samples: 468


In [54]:
# mendel_errors yields four tables: every individual error, plus error
# summaries per family, per sample and per variant.
mendel_tables = hl.mendel_errors(mt.GT, ped)
all_errors, per_fam, per_sample, per_variant = mendel_tables

Annotate samples and variants with Mendel errors.

In [15]:
# Look up each sample's row of the per-sample Mendel table by sample ID
# and attach it as the column field `mendel_col`.
sample_mendel = per_sample[mt.s]
mt = mt.annotate_cols(mendel_col=sample_mendel)

In [16]:
# Look up each variant's row of the per-variant Mendel table by (locus, alleles)
# and attach it as the row field `mendel_row`.
variant_mendel = per_variant[mt.locus, mt.alleles]
mt = mt.annotate_rows(mendel_row=variant_mendel)

In [55]:
# Convert the per-family Mendel error table to a pandas DataFrame for display.
per_fam_pd = per_fam.to_pandas()



In [58]:
# Show the per-family summary without the parent ID columns.
per_fam_pd.drop(columns=['pat_id', 'mat_id'])

Unnamed: 0,fam_id,children,errors,snp_errors
0,10,2,18380,8012
1,79,1,7131,2292
2,61,1,5341,3095
3,41,1,7514,3525
4,58,1,33257,25519
5,64,1,6350,2380


In [67]:
# Tally how many variants have each distinct Mendel error count.
error_count_tally = hl.agg.counter(per_variant.errors)
per_variant.aggregate(error_count_tally)



{0: 956117, 5: 35, 1: 61569, 6: 3, 2: 6058, 3: 1093, 4: 204}

In [66]:
# Histogram of the number of Mendel errors observed at each variant.
p = hl.plot.histogram(per_variant.errors)
p.plot_width, p.plot_height = 800, 500
p.xaxis.axis_label = 'Mendel errors per variant'
show(p)

