In [1]:
import hail as hl
import hail.expr.aggregators as agg

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
from math import log, isnan
from pprint import pprint
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.models import Span
output_notebook()

In [2]:
hl.init()

Running on Apache Spark version 2.2.0
SparkUI available at http://10.1.7.107:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version devel-fb0887f9585c
NOTE: This is a beta version. Interfaces may change
  during the beta period. We recommend pulling
  the latest changes weekly.


In [3]:
hl.utils.get_1kg('data/')
ds = hl.read_matrix_table('data/1kg.mt')

table = (hl.import_table('data/1kg_annotations.txt', impute=True)
         .key_by('Sample'))

ds = ds.annotate_cols(**table[ds.s])
ds = hl.sample_qc(ds)
ds = ds.filter_cols((ds.sample_qc.dp_mean >= 4) & (ds.sample_qc.call_rate >= 0.97))

ab = ds.AD[1] / hl.sum(ds.AD)
filter_condition_ab = ((ds.GT.is_hom_ref() & (ab <= 0.1)) |
                        (ds.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                        (ds.GT.is_hom_var() & (ab >= 0.9)))
ds = ds.filter_entries(filter_condition_ab)

ds = hl.variant_qc(ds)

2018-06-14 13:35:28 Hail: INFO: downloading 1KG VCF ...
  Source: https://storage.googleapis.com/hail-tutorial/1kg.vcf.bgz
2018-06-14 13:35:37 Hail: INFO: importing VCF and writing to matrix table...
2018-06-14 13:35:39 Hail: INFO: Coerced sorted dataset
2018-06-14 13:35:44 Hail: INFO: wrote 10961 items in 16 partitions to data/1kg.mt
2018-06-14 13:35:44 Hail: INFO: downloading 1KG annotations ...
  Source: https://storage.googleapis.com/hail-tutorial/1kg_annotations.txt
2018-06-14 13:35:44 Hail: INFO: Done!
2018-06-14 13:35:45 Hail: INFO: Reading table to impute column types
2018-06-14 13:35:45 Hail: INFO: Finished type imputation
  Loading column 'Sample' as type 'str' (imputed)
  Loading column 'Population' as type 'str' (imputed)
  Loading column 'SuperPopulation' as type 'str' (imputed)
  Loading column 'isFemale' as type 'bool' (imputed)
  Loading column 'PurpleHair' as type 'bool' (imputed)
  Loading column 'CaffeineConsumption' as type 'float64' (imputed)
2018-06-14 13:35:45 Ha

In [4]:
# GWAS
common_ds = ds.filter_rows(ds.variant_qc.AF[1] > 0.01)
gwas = hl.linear_regression(y=common_ds.CaffeineConsumption, x=common_ds.GT.n_alt_alleles())

2018-06-14 13:36:09 Hail: INFO: linear_regression: running on 250 samples for 1 response variable y,
    with input variable x, intercept, and 0 additional covariates...


In [5]:
# address confounding
pca_eigenvalues, pca_scores, _ = hl.hwe_normalized_pca(common_ds.GT)

cds = common_ds.annotate_cols(pca = pca_scores[common_ds.s])

linear_regression_results = hl.linear_regression(
    y=cds.CaffeineConsumption, x=cds.GT.n_alt_alleles(),
    covariates=[cds.isFemale, cds.pca.scores[0], cds.pca.scores[1], cds.pca.scores[2]])

2018-06-14 13:36:21 Hail: INFO: hwe_normalized_pca: running PCA using 9169 variants.
2018-06-14 13:36:25 Hail: INFO: pca: running PCA with 10 components...
2018-06-14 13:36:39 Hail: INFO: linear_regression: running on 250 samples for 1 response variable y,
    with input variable x, intercept, and 4 additional covariates...


In [6]:
# p-values for caffeine consumption
caffeine_pval_tbl = (linear_regression_results
       .select_rows(linear_regression_results.rsid, 
                    linear_regression_results.qual, 
                    linear_regression_results.linreg.p_value)
       .select_entries()
       .rows())
caffeine_pval_tbl.describe()
caffeine_pval_tbl.show(10)

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh37> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'p_value': float64 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------
+---------------+------------+------+-------------+-------------+
| locus         | alleles    | rsid |        qual |     p_value |
+---------------+------------+------+-------------+-------------+
| locus<GRCh37> | array<str> | str  |     float64 |     float64 |
+---------------+------------+------+-------------+-------------+
| 1:904165      | ["G","A"]  | NA   | 5.23464e+04 | 1.80882e-01 |
| 1:1563691     | ["T","G"]  | NA   | 1.09075e+03 | 4.63369e-01 |
| 1:1707740     | ["T","G"]  | NA   | 9.35178e+04 | 8.80793e-01 |
| 1:2284195     | ["T","C"]  | NA   | 1.42481e+05 | 4.33107e-01 |
| 1:2779043     | ["T","C"]  | NA   | 2.88792e+05 | 3.57754e-02

In [8]:
# position stats (max, min, etc...)
pos_info = (caffeine_pval_tbl
            .group_by(caffeine_pval_tbl.locus.contig)
            .aggregate(pos_stats = agg.stats(caffeine_pval_tbl.locus.position)))
pos_info.show(23, width=150)

+--------+----------------+-----------------+---------------+---------------+-------------+---------------+
| contig | pos_stats.mean | pos_stats.stdev | pos_stats.min | pos_stats.max | pos_stats.n | pos_stats.sum |
+--------+----------------+-----------------+---------------+---------------+-------------+---------------+
| str    |        float64 |         float64 |       float64 |       float64 |       int64 |       float64 |
+--------+----------------+-----------------+---------------+---------------+-------------+---------------+
| 4      |    9.46420e+07 |     5.66664e+07 |   7.56986e+05 |   1.90459e+08 |         561 |   5.30942e+10 |
| 3      |    9.51058e+07 |     6.00178e+07 |   1.44668e+05 |   1.97495e+08 |         628 |   5.97264e+10 |
| 16     |    4.81540e+07 |     3.17956e+07 |   1.05444e+05 |   9.00113e+07 |         315 |   1.51685e+10 |
| 10     |    6.79743e+07 |     4.21169e+07 |   3.23283e+05 |   1.35370e+08 |         460 |   3.12682e+10 |
| 22     |    3.54867e+07 | 

In [23]:
# convert locus to global positions for manhattan plotting
# get negative log of p_values for manhattan plotting
caffeine_pval_tbl.describe()

# caffeine_pval_tbl.count()      --> 9177

expr = (hl.Locus(
    caffeine_pval_tbl.locus.contig.value(), 
    caffeine_pval_tbl.locus.position.value())._rg._jrep.locusToGlobalPos(
    hl.Locus(caffeine_pval_tbl.locus.contig.value(), caffeine_pval_tbl.locus.position.value())._jrep))
expr
#manhattan = (caffeine_pval_tbl
#             .annotate(
#                 global_position = expr))

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh37> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'p_value': float64 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


2018-06-14 14:36:14 Hail: ERROR: scope violation: 'eval_expr_typed' expects an expression indexed by []
    Found indices ['row'], with unexpected indices ['row']. Invalid fields:
        'locus' (indices ['row'])


ExpressionException: scope violation: 'eval_expr_typed' expects an expression indexed by []
    Found indices ['row'], with unexpected indices ['row']. Invalid fields:
        'locus' (indices ['row'])