In [None]:
from firecloud import fiss
import pandas as pd
# Allow up to 10,000 rows when a DataFrame/Series is displayed. The option name
# is written out in full: abbreviated names are only resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_rows', 10000)
import io
import numpy as np
from pprint import pprint

In [None]:
# Fetch the access-control list of the topmed-shared workspace and show which
# top-level fields the JSON response contains.
acl_response = fiss.fapi.get_workspace_acl("topmed-shared", "topmed-shared")
control = acl_response.json()  # ['acl']
print(control.keys())

In [None]:
## start hail context
# `sc` is not defined anywhere in the cells shown; it is assumed to be the
# SparkContext provided by the notebook kernel -- TODO confirm.
# NOTE(review): the star import brings every public hail name into the
# namespace; in the cells shown only HailContext and KeyTable are used.
from hail import *
hc = HailContext(sc)

In [4]:
# List the sample sets registered in the workspace and report on the first one.
samples_sets = fiss.fapi.get_entities('topmed-shared', 'topmed-shared', 'sample_set').json()
first_set = samples_sets[0]
set_attributes = first_set['attributes']
print('Sample set name: %s' % first_set['name'])
print('Sample set fields: %s' % ', '.join(set_attributes.keys()))
# The 'vcf' attribute keeps its file paths under an 'items' list.
vcf_files = set_attributes['vcf']['items']
print('# of vcf files: %d' % len(vcf_files))

Sample set name: freeze5b_minDP10
Sample set fields: wgsa_subset, gds, vcf, bcf, tbi, cov_grm, samples, wgsa_raw, cor_grm
# of vcf files: 23


For analysis, we will want to combine genotype data in several different ways -- and these will be specific to each working group. One of the first operations will be to subset the data to a set of cohorts in the T2D analysis.

In [6]:
# Download the workspace's sample table as TSV and keep only the identifier,
# center, study, project and consent columns, renaming the two ID columns.
data_model = fiss.fapi.get_entities_tsv("topmed-shared", "topmed-shared", "sample")
kept_columns = ['entity:sample_id', 'participant', 'CENTER', 'study', 'topmed_project', 'consent']
text_columns = ['study', 'topmed_project', 'consent']
data_model_text = (
    pd.read_csv(io.StringIO(data_model.text), sep='\t')[kept_columns]
    .rename(columns={'entity:sample_id': 'ent_sample_id', 'participant': 'sample_id'})
)
# Cast to str so these columns can be concatenated later; missing values end up
# as the literal text 'nan' (visible in saved outputs such as 'SAFS.nan').
data_model_text[text_columns] = data_model_text[text_columns].astype(str)
data_model_text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56436 entries, 0 to 56435
Data columns (total 6 columns):
ent_sample_id     56436 non-null object
sample_id         56436 non-null object
CENTER            56436 non-null object
study             56436 non-null object
topmed_project    56436 non-null object
consent           56436 non-null object
dtypes: object(6)

Taking a look at the top of this data frame:

In [7]:
# Preview the first rows of the trimmed sample table.
data_model_text.head()

Unnamed: 0,ent_sample_id,sample_id,CENTER,study,topmed_project,consent
0,freeze5a_NWD100014,NWD100014,uw,JHS,JHS,HMB-IRB-NPU
1,freeze5a_NWD100018,NWD100018,broad,COPDGene,COPD,HMB
2,freeze5a_NWD100027,NWD100027,macrogen,GeneSTAR,GeneSTAR,DS-CVD-IRB-NPU-MDS
3,freeze5a_NWD100047,NWD100047,uw,EOCOPD,COPD,DS-CS-RD
4,freeze5a_NWD100048,NWD100048,broad,VU_AF,AFGen,GRU-IRB


We are going to loop through each chromosome, and create vds subsets per cohort/consent group.

In [8]:
# Pick one VCF by its position in the list (index 13 is the chr21 shard in the
# saved output below).
i = 13
vcf_file = vcf_files[i]
print(vcf_file)

# Build a short label: take the file name without its directory, then drop the
# last two dot-separated suffixes.
file_name = vcf_file.split('/')[-1]
name_parts = file_name.split('.')
label = ".".join(name_parts[:-2])
print(label)

gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/25c6cc4a-9d07-49bc-a548-9166f7a5f5ad/w/e111b73f-4bff-4079-8c73-f9951d8dcb7a/call-runVcf/shard-13/freeze.5b.chr21.pass_and_fail.gtonly.minDP10.vcf.bgz
freeze.5b.chr21.pass_and_fail.gtonly.minDP10


In [9]:
# Import the VCF data into a Hail dataset (at least 1200 partitions requested)
# and run split_multi on it.
# NOTE(review): this passes the whole `vcf_files` list, not the single
# `vcf_file` selected in the previous cell, while the saved summary further
# down reports contigs=['chr21'] only -- confirm which input is intended.
vds = hc.import_vcf(vcf_files,min_partitions = 1200).split_multi()

2017-11-27 20:15:47 Hail: INFO: No multiallelics detected.
2017-11-27 20:15:48 Hail: INFO: Coerced sorted dataset
2017-11-27 20:15:48 Hail: WARN: called redundant split on an already split VDS


Annotate the samples

In [10]:
# Turn the pandas sample table into a Hail KeyTable keyed by sample_id (going
# through a Spark DataFrame) and attach it to the dataset's samples.
from pyspark.sql import SQLContext
sqlctx = SQLContext(hc.sc)
spark_df = sqlctx.createDataFrame(data_model_text)
kt = KeyTable.from_dataframe(spark_df,key='sample_id') 
# root='sa' puts the table's fields directly under the sample annotations;
# a later cell reads them back as e.g. sa.study.
vds = vds.annotate_samples_table(kt, root='sa')

Extract samples for each cohort/consent combination, excluding samples whose consent is "NA".

In [11]:
# Build a "<study>.<consent>" label for every sample.
# NOTE(review): the saved outputs below show labels such as 'COPD.HMB' for a
# sample whose study is COPDGene and whose topmed_project is COPD, so they look
# like they were generated from 'topmed_project' -- confirm the intended column.
data_model_text['cohort_consent'] = data_model_text['study'].map(str) + "." + data_model_text['consent']

In [12]:
# Collect the distinct cohort/consent labels and show them.
cohort_consents = data_model_text['cohort_consent'].unique()
print(cohort_consents)

['JHS.HMB-IRB-NPU' 'COPD.HMB' 'GeneSTAR.DS-CVD-IRB-NPU-MDS' 'COPD.DS-CS-RD'
 'AFGen.GRU-IRB' 'WHI.HMB-IRB' 'CFS.DS-HLBS-IRB-NPU' 'FHS.HMB-IRB-MDS'
 'HyperGEN_GENOA.GRU-IRB' 'VTE.HMB-IRB' 'THRV.DS-CVD-IRB-COL-NPU-RD'
 'HyperGEN_GENOA.DS-ASC-RF-NPU' 'GenSalt.GRU-IRB' 'SAFS.nan' 'MESA.HMB'
 'BAGS.GRU-IRB' 'GOLDN.DS-CVD-IRB' 'SAS.GRU-IRB-PUB-COL-NPU-GSO'
 'CRA.DS-ASTHMA-IRB-MDS-RD' 'WHI.HMB-IRB-NPU' 'JHS.HMB-IRB'
 'PGX_Asthma.DS-LD-RD' 'THRV.nan' 'VTE.GRU' 'FHS.HMB-IRB-NPU-MDS'
 'PGX_Asthma.DS-LD' 'JHS.DS-FDO-IRB-NPU' 'AA_CAC.HMB-IRB-COL-NPU'
 'Sarcoidosis.DS-SAR-IRB' 'JHS.DS-FDO-IRB' 'MESA.nan' 'VTE.nan'
 'AFGen.DS-AF-IRB-RD' 'Amish.HMB-IRB-MDS' 'AFGen.HMB-IRB-NPU-MDS'
 'AFGen.HMB-IRB' 'AFGen.HMB' 'AA_CAC.HMB' 'AFGen.nan'
 'HyperGEN_GENOA.DS-CVD-IRB-RD' 'AA_CAC.DS-ASC-RF-NPU'
 'AA_CAC.DS-CVD-IRB-NPU-MDS' 'AFGen.HMB-IRB-MDS' 'VTE.HMB-NPU-MDS'
 'nan.NRUP' 'BAGS.nan' 'Amish.nan' 'HyperGEN_GENOA.nan' 'AA_CAC.nan'
 'JHS.nan' 'CFS.nan' 'FHS.nan' 'AA_CAC.DS-DHD-IRB-COL-NPU' 'PGX_Asthma.nan'
 'CR

In [13]:
# Display the per-sample cohort/consent labels.
data_model_text['cohort_consent']

0                 JHS.HMB-IRB-NPU
1                        COPD.HMB
2     GeneSTAR.DS-CVD-IRB-NPU-MDS
3                   COPD.DS-CS-RD
4                   AFGen.GRU-IRB
5                   AFGen.GRU-IRB
6                        COPD.HMB
7                     WHI.HMB-IRB
8             CFS.DS-HLBS-IRB-NPU
9                        COPD.HMB
10                FHS.HMB-IRB-MDS
11         HyperGEN_GENOA.GRU-IRB
12    GeneSTAR.DS-CVD-IRB-NPU-MDS
13                  AFGen.GRU-IRB
14                    VTE.HMB-IRB
...
56421               WHI.HMB-IRB-NPU
56422                      COPD.HMB
56423                      COPD.HMB
56424                      COPD.HMB
56425                   WHI.HMB-IRB
56426                 AFGen.GRU-IRB
56427                      SAFS.nan
56428    THRV.DS-CVD-IRB-COL-NPU-RD
56429               WHI.HMB-IRB-NPU
56430                      COPD.HMB
56431      CRA.DS-ASTHMA-IRB-MDS-RD
56432                 AFGen.GRU-IRB
56433                      COPD.HMB
56434             

In [None]:
# Cohort/consent groups that take part in the T2D analysis.
cohorts_for_t2d = ['JHS.HMB-IRB-NPU', 'JHS.HMB-IRB', 'JHS.DS-FDO-IRB-NPU', 'JHS.DS-FDO-IRB',
                   'FHS.HMB-IRB-MDS', 'FHS.HMB-IRB-NPU-MDS', 'MESA.HMB']
# Flag each sample with 1 when its cohort/consent label is in the list above,
# otherwise 0. `pd.data_frame` does not exist (AttributeError); Series.isin
# yields the same 0/1 flags and keeps the frame's own index, so the new column
# lines up row for row.
data_model_text['t2d_analysis'] = data_model_text['cohort_consent'].isin(cohorts_for_t2d).astype(int)

In [14]:
# Group the sample table by cohort/consent label so individual groups can be
# pulled out by name.
consent_grouped = data_model_text.groupby('cohort_consent')

In [15]:
# Select the rows of a single cohort/consent group as the working example.
one_group = consent_grouped.get_group('JHS.HMB-IRB-NPU')

In [16]:
# Summarize the group's sample IDs (count / number of unique values).
one_group["sample_id"].describe()

count           697
unique          697
top       NWD589399
freq              1
dtype: object

In [17]:
# Show the group's sample IDs as an array.
one_group["sample_id"].values

array(['NWD100014', 'NWD100597', 'NWD100900', 'NWD102903', 'NWD103804',
       'NWD107594', 'NWD108366', 'NWD109345', 'NWD112373', 'NWD113503',
       'NWD114901', 'NWD115576', 'NWD117670', 'NWD119675', 'NWD120808',
       'NWD121168', 'NWD122406', 'NWD123492', 'NWD125918', 'NWD126673',
       'NWD127039', 'NWD127105', 'NWD128382', 'NWD128980', 'NWD130049',
       'NWD131190', 'NWD131470', 'NWD132536', 'NWD133474', 'NWD136646',
       'NWD140186', 'NWD140972', 'NWD141914', 'NWD143467', 'NWD144171',
       'NWD147037', 'NWD148009', 'NWD149390', 'NWD153049', 'NWD155095',
       'NWD155490', 'NWD155709', 'NWD155738', 'NWD158614', 'NWD158978',
       'NWD162478', 'NWD163306', 'NWD164010', 'NWD164654', 'NWD165668',
       'NWD166048', 'NWD169202', 'NWD170185', 'NWD170530', 'NWD171177',
       'NWD173120', 'NWD173675', 'NWD174851', 'NWD176480', 'NWD178783',
       'NWD179190', 'NWD179781', 'NWD180145', 'NWD181046', 'NWD182275',
       'NWD183109', 'NWD183770', 'NWD184736', 'NWD184773', 'NWD1

In [18]:
# Keep only the selected group's samples in the dataset.
group_sample_ids = one_group["sample_id"].tolist()
vds_subset_one_group = vds.filter_samples_list(group_sample_ids, keep=True)

In [79]:
## this takes a very long time, so the call is left commented out;
## the summary shown below is saved output from a previous execution
#vds_subset_one_group.summarize()

Summary(samples=697, variants=7762483, call_rate=0.991701, contigs=['chr21'], multiallelics=0, snps=6949524, mnps=0, insertions=287547, deletions=525412, complex=0, star=0, max_alleles=2)

In [19]:
# Run variant QC on the subset (the next cell reads the result as va.qc.AC)
# and cache the dataset.
vds_subset_one_group = vds_subset_one_group.variant_qc().cache()

Filter to variants that have an allele count greater than zero (`va.qc.AC > 0`) in this sample subset.

In [20]:
# Keep only variants whose va.qc.AC is greater than zero, then repartition the
# dataset into 120 partitions.
vds_subset_one_group = vds_subset_one_group.filter_variants_expr('va.qc.AC>0').repartition(120)

In [None]:
# Summarize the filtered subset and print the report.
vds_subset_one_group.summarize().report()

LD prune - make new data set

In [21]:
# Build the LD-pruned dataset: run variant QC again, keep variants whose
# va.qc.AF lies between 0.05 and 0.95 (inclusive), then call ld_prune with its
# default arguments.
vds_result = (vds_subset_one_group.variant_qc()
              .filter_variants_expr("va.qc.AF >= 0.05 && va.qc.AF <= 0.95")
              .ld_prune())

2017-11-27 21:16:14 Hail: INFO: Running LD prune with nSamples=697, nVariants=228050, nPartitions=120, and maxQueueSize=298262.
2017-11-27 21:16:23 Hail: INFO: LD prune step 1 of 3: nVariantsKept=52621, nPartitions=120, time=9.836s
2017-11-27 21:17:36 Hail: INFO: LD prune step 2 of 3: nVariantsKept=50623, nPartitions=3, time=1m12.1s
2017-11-27 21:18:05 Hail: INFO: Coerced sorted dataset
2017-11-27 21:18:16 Hail: INFO: LD prune step 3 of 3: nVariantsKept=50567, time=40.835s


## Running PCA on Freeze 5b Genotypes

In [None]:
# do pca
# Request k=5 components on the LD-pruned dataset; results are directed to
# 'sa.pca' (a later cell reads sa.pca.PC1 / sa.pca.PC2) and the eigenvalues to
# 'global.eigen'.
pca = vds_result.pca('sa.pca', k=5, eigenvalues='global.eigen')

### Visualize the differences in families within the studies by principal components.

In [None]:
# define some colors
#c = ["aec7e8","ff7f0e","ffbb78","2ca02c","98df8a","d62728","ff9896","9467bd","c5b0d5","8c564b","c49c94"]

# show the pca results
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import Counter
from math import log, isnan
from pprint import pprint
%matplotlib inline

cz = zip(vds_result.query_samples('samples.map(s => sa.study).counter()').keys(),plt.cm.plasma(1))
colors = {t[0]: t[1] for t in cz}

pca_table = pca.samples_table().to_pandas()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"],
            c = pca_table["sa.study"].map(colors),
            alpha = .5)
plt.xlim(-0.6, 0.6)
plt.xlabel("PC1")
plt.ylabel("PC2")
legend_entries = [mpatches.Patch(color=c, label=x) for x, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()