# Correlation between allele frequencies in FarGen and gnomAD data

In [1]:
import hail as hl
# Start Hail with a 10 GB Spark driver and an explicit Spark local (scratch) directory.
hl.init(
    spark_conf={
        'spark.driver.memory': '10g',
        'spark.local.dir': '/home/olavur/tmp',
    }
)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20210323-1008-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Direct Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

In [3]:
# Root directory of this analysis; data paths are built by appending to it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

## Load FarGen data

In [4]:
# Read the FarGen matrix table stored under the analysis directory.
fargen_mt = hl.read_matrix_table(
    BASE_DIR + '/data/mt/hq_gnomad_annotated.mt'
)

In [5]:
# count() on a matrix table gives (number of rows, number of columns),
# unpacked here as the site and sample totals.
n_sites, n_samples = fargen_mt.count()
print(f'Number of sites: {n_sites}')
print(f'Number of samples: {n_samples}')

Number of sites: 1332013
Number of samples: 474


## Allele frequencies in FarGen and gnomAD

In [6]:
# Expose the first element of each AF array as a flat row field, so both
# frequencies can be referenced by a short name in the cells that follow.
fargen_mt = fargen_mt.annotate_rows(
    AF_fargen=fargen_mt.info.AF[0],
    AF_gnomad=fargen_mt.gnomad.freq.AF[0],
)

In [7]:
# Plot the AF_fargen / AF_gnomad row fields (rather than re-deriving
# info.AF[0]) so the figure reads exactly the same fields as the regression.
p = hl.plot.scatter(
    fargen_mt.AF_fargen,
    fargen_mt.AF_gnomad,
    title='Allele frequencies: FarGen vs. gnomAD',
    xlabel='FarGen AF',
    ylabel='gnomAD AF',
    size=1,
)
# Wide, short canvas for the notebook.
p.plot_width = 800
p.plot_height = 400
show(p)

## Linear regression between allele frequencies

Fit the linear regression $f_{gnomad} = \beta_1 f_{fargen} + \beta_2 + \epsilon$, with slope $\beta_1$, intercept $\beta_2$ and error term $\epsilon$.

In [8]:
# Regress gnomAD AF on FarGen AF over all rows. The constant 1 is the
# explicit intercept covariate, so beta[0] is the intercept and beta[1]
# the slope.
af_linreg = fargen_mt.aggregate_rows(
    hl.agg.linreg(
        y=fargen_mt.AF_gnomad,
        x=[1, fargen_mt.AF_fargen],
    )
)

The slope is close to 1 (about 0.96) and the intercept is close to 0 (about 0.004), as we would expect if allele frequencies in FarGen and gnomAD largely agree.

In [9]:
# Approximate 95% intervals as estimate +/- 2 standard errors. Index 0 of the
# regression output is the intercept term, index 1 the FarGen AF slope.
intercept_ci, slope_ci = (
    (
        af_linreg.beta[i] - 2 * af_linreg.standard_error[i],
        af_linreg.beta[i] + 2 * af_linreg.standard_error[i],
    )
    for i in (0, 1)
)

print('Intercept 95% confidence interval: {l}-{u}'.format(l=intercept_ci[0], u=intercept_ci[1]))
print('Slope 95% confidence interval: {l}-{u}'.format(l=slope_ci[0], u=slope_ci[1]))

Intercept 95% confidence interval: 0.0036583053802417224-0.004221666839921709
Slope 95% confidence interval: 0.9611631148267891-0.9630797246697372


For each variant, predict the gnomAD AF as $\beta_1 f_{fargen} + \beta_2$.

In [10]:
# Fitted line: intercept (beta[0]) plus slope (beta[1]) times the FarGen AF.
fargen_mt = fargen_mt.annotate_rows(
    predicted_AF_gnomad=af_linreg.beta[0] + af_linreg.beta[1] * fargen_mt.AF_fargen,
)

Calculate the residuals: the actual gnomAD AF minus the predicted gnomAD AF.

In [11]:
# Residual = observed gnomAD AF minus the regression prediction, so a
# positive value means gnomAD AF is higher than the fitted line.
fargen_mt = fargen_mt.annotate_rows(
    residuals_af_linreg=fargen_mt.AF_gnomad - fargen_mt.predicted_AF_gnomad,
)

In [12]:
# Distribution of the regression residuals; the title lets the figure stand
# alone when the notebook is skimmed.
p = hl.plot.histogram(
    fargen_mt.residuals_af_linreg,
    legend='Residuals',
    title='Residuals of the gnomAD AF vs. FarGen AF regression',
)
p.plot_width = 800
p.plot_height = 500
show(p)