# Do GWAS on random split

## Part 1

Import packages, create HailContext

In [2]:
import numpy as np
import random
# NOTE(review): wildcard import. In the cells visible here, HailContext and TFloat are
# not imported anywhere else, so they are expected to come from this line; consider
# importing them by name so their origin is explicit.
from hail import *
# Single HailContext handle; every read/import call later in the notebook goes through `hc`.
hc = HailContext()

KeyboardInterrupt: 

In [None]:
# Load the variant dataset (VDS) that the rest of the notebook annotates and splits.
vds_path = 'gs://nbaya/split/ukb31063.hm3_variants.gwas_samples.vds'
vds = hc.read(vds_path)

## Part 2

Import the both-sexes phenotype table and keep only the user ID and height phenotype (code: 50) columns, renamed to `s` and `height` and keyed by `s`.

In [None]:
# Build the height key table in one chain:
#   import (phenotype "50" typed as float, empty string treated as missing)
#   -> keep the user ID and "50" columns -> rename to s / height -> key by s.
# The column names include literal double quotes, so they are spelled that way here.
phenotypes_tsv = 'gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz'
kt_both_sexes = (
    hc.import_table(phenotypes_tsv, types={'"50"': TFloat()}, missing="")
    .select(['"userId"', '"50"'])
    .rename({'"userId"': 's', '"50"': 'height'})
    .key_by('s')
)

Write out the filtered key table.

In [None]:
# Persist the height key table; an existing table at this path is overwritten.
height_kt_path = "gs://nbaya/split/height.kt"
kt_both_sexes.write(height_kt_path, overwrite=True)

Read the filtered key table back from storage.

In [3]:
# Reload the height key table from the path it was written to.
height_kt_path = "gs://nbaya/split/height.kt"
kt_height = hc.read_table(height_kt_path)

Read covariates

In [4]:
# Key table of GWAS covariates (both sexes).
covariates_kt_path = "gs://phenotype_31063/hail/0.1/ukb31063.gwas_covariates.both_sexes.kt"
cov = hc.read_table(covariates_kt_path)

Annotate vds with covariates & height phenotype

In [5]:
# Attach both tables to the samples: covariates under sa.covariates, then height under sa.height.
vds = (
    vds
    .annotate_samples_table(cov, root='sa.covariates')
    .annotate_samples_table(kt_height, root='sa.height')
)

In [9]:
# NOTE(review): this import sits mid-notebook; it would normally live in the first import cell.
from pprint import pprint

# Pretty-print the variant schema to see which variant-level fields the dataset carries.
pprint(vds.variant_schema)

Struct{
     rsid: String,
     varid: String,
     AF: Double,
     info: Float
 }


In [18]:
# Export the sample table (after the annotations above) as a flattened TSV.
samples_kt = vds.samples_table()
samples_flat_kt = samples_kt.flatten()
samples_flat_kt.export('gs://nbaya/split/phen_tables/ukb31063.hm3_variants.gwas_samples_height_v2.tsv')

2018-08-16 17:54:21 Hail: INFO: while writing:
    gs://nbaya/ukb31063.hm3_variants.gwas_samples_height_v2.tsv
  merge time: 3.615s


In [None]:
# vds.export_vcf('gs://nbaya/ukb31063.hm3_variants.gwas_samples_height_gp.vcf.bgz', export_pp=True)


Define covariates list

In [None]:
# Covariate expressions passed to the regression: the isFemale / age terms and their
# interactions, followed by PC1..PC{N_PCS}. All live under the 'sa.covariates' root.
N_PCS = 20  # number of principal components included as covariates

cov_list = [
    'sa.covariates.isFemale',
    'sa.covariates.age',
    'sa.covariates.age_squared',
    'sa.covariates.age_isFemale',
    'sa.covariates.age_squared_isFemale',
]
# range (not xrange): same values, and the cell runs unchanged on Python 2 and Python 3.
cov_list += ['sa.covariates.PC{:}'.format(i) for i in range(1, N_PCS + 1)]

Split vds randomly into two groups of (nearly) equal size; if the number of samples is odd, the second group gets one extra sample.

In [36]:
# Randomly partition the sample IDs into two disjoint halves.
# The RNG is seeded so that re-running this cell reproduces the same split (and therefore
# the same downstream GWAS results); change SPLIT_SEED to draw a different split.
SPLIT_SEED = 0

all_ID = vds.sample_ids    # list of all sample IDs
n_samples = len(all_ID)
half = n_samples // 2      # floor division: an int on both Python 2 and Python 3

# Seeded, self-contained generator (does not touch global random state);
# permutation(n) returns the indices 0..n-1 in shuffled order.
all_i = np.random.RandomState(SPLIT_SEED).permutation(n_samples)

vds1_i = all_i[:half]   # first half of the shuffled indices
vds2_i = all_i[half:]   # second half (one longer than the first when n_samples is odd)

vds1_ID = [all_ID[i] for i in vds1_i]  # list of sample IDs for first half
vds2_ID = [all_ID[i] for i in vds2_i]  # list of sample IDs for second half

In [37]:
# One dataset per half, each filtered by its list of sample IDs.
vds1, vds2 = (
    vds.filter_samples_list(vds1_ID),
    vds.filter_samples_list(vds2_ID),
)

Run linear regression on each group separately

In [None]:
# Regress height on dosages for the first half, with the covariates in cov_list;
# results are written under va.linreg.
linreg_kwargs = dict(
    ys=['sa.height'],
    covariates=cov_list,
    root='va.linreg',
    use_dosages=True,
)
vds1_result = vds1.linreg3(**linreg_kwargs)

In [None]:
# Same regression as for the first half, run on the second half.
linreg_kwargs = dict(
    ys=['sa.height'],
    covariates=cov_list,
    root='va.linreg',
    use_dosages=True,
)
vds2_result = vds2.linreg3(**linreg_kwargs)

In [None]:
# Export the variant table of the first half's regression result.
vds1_variants_kt = vds1_result.variants_table()
vds1_variants_kt.export('gs://nbaya/split/vds1.tsv.bgz')

In [None]:
# Export the variant table of the second half's regression result.
vds2_variants_kt = vds2_result.variants_table()
vds2_variants_kt.export('gs://nbaya/split/vds2.tsv.bgz')