In [1]:
from firecloud import fiss
import pandas as pd
# Allow up to 10,000 rows when a DataFrame is displayed. The option key is
# written out in full: abbreviated keys are resolved by pattern matching and
# can become ambiguous if pandas adds a similarly named option.
pd.set_option('display.max_rows', 10000)
import io
import numpy as np
from pprint import pprint

In [2]:
control = fiss.fapi.get_workspace_acl("topmed-shared", "topmed-shared").json()#['acl']
print control.keys()

[u'acl']


In [3]:
## start hail context
# Import only the Hail names this notebook uses rather than `from hail import *`,
# so it stays obvious where each name comes from.
from hail import HailContext, KeyTable

# `sc` is not defined anywhere in this notebook; it is presumably the
# SparkContext injected by the notebook kernel -- confirm before running in a
# different environment.
hc = HailContext(sc)

Running on Apache Spark version 2.0.2
SparkUI available at http://10.128.0.13:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.1-6f85985


In [4]:
samples_sets = fiss.fapi.get_entities('topmed-shared','topmed-shared', 'sample_set').json()
print 'Sample set name:', samples_sets[0]['name']
print 'Sample set fields:', ', '.join(samples_sets[0]['attributes'].keys())
vcf_files = samples_sets[0]['attributes']['vcf']['items']
print '# of vcf files:', len(vcf_files)

Sample set name: freeze5b_minDP10
Sample set fields: wgsa_subset, gds, vcf, bcf, tbi, cov_grm, samples, wgsa_raw, cor_grm
# of vcf files: 23


In [5]:
# Read the chr10 freeze 5b variant dataset (VDS) from Google Cloud Storage.
chr10_vds_path = "gs://fc-adaae650-a458-4c56-8a55-d96fa463a5c6/vds_topmed/freeze.5b.chr10.pass_and_fail.gtonly.minDP10.vds"
vds = hc.read(chr10_vds_path)

In [6]:
# Download the workspace's sample table as TSV and keep only the columns that
# are joined onto the VDS.
sample_columns = ['entity:sample_id', 'participant', 'CENTER', 'study', 'topmed_project']
data_model = fiss.fapi.get_entities_tsv("topmed-shared", "topmed-shared", "sample")
sample_tsv = io.StringIO(data_model.text)
data_model_text = pd.read_csv(sample_tsv, sep='\t')[sample_columns]

# The participant column is renamed to sample_id because it is the join key
# used below; the original entity id is kept as ent_sample_id.
data_model_text = data_model_text.rename(
    columns={'entity:sample_id': 'ent_sample_id', 'participant': 'sample_id'})

# Cast the two cohort columns to str. Missing values become the literal
# string 'nan', which is presumably why a 'nan' project appears in the
# per-project counts further down.
for cohort_column in ('study', 'topmed_project'):
    data_model_text[cohort_column] = data_model_text[cohort_column].astype(str)
data_model_text.info()

# pandas -> Spark DataFrame -> Hail KeyTable keyed by sample_id, then attach
# the table's fields to the VDS sample annotations under `sa`.
from pyspark.sql import SQLContext
sqlctx = SQLContext(hc.sc)
spark_df = sqlctx.createDataFrame(data_model_text)
kt = KeyTable.from_dataframe(spark_df, key='sample_id')
vds = vds.annotate_samples_table(kt, root='sa')

data_model_text.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56436 entries, 0 to 56435
Data columns (total 5 columns):
ent_sample_id     56436 non-null object
sample_id         56436 non-null object
CENTER            56436 non-null object
study             56436 non-null object
topmed_project    56436 non-null object
dtypes: object(5)

Unnamed: 0,ent_sample_id,sample_id,CENTER,study,topmed_project
0,freeze5a_NWD100014,NWD100014,uw,JHS,JHS
1,freeze5a_NWD100018,NWD100018,broad,COPDGene,COPD
2,freeze5a_NWD100027,NWD100027,macrogen,GeneSTAR,GeneSTAR
3,freeze5a_NWD100047,NWD100047,uw,EOCOPD,COPD
4,freeze5a_NWD100048,NWD100048,broad,VU_AF,AFGen


In [7]:
# Tally samples per TOPMed project using the annotations joined onto the VDS.
project_counts = vds.query_samples('samples.map(s => sa.topmed_project).counter()')
pprint(project_counts)

{u'AA_CAC': 1234L,
 u'AFGen': 2875L,
 u'Amish': 1030L,
 u'BAGS': 968L,
 u'CFS': 923L,
 u'COPD': 8808L,
 u'CRA': 1043L,
 u'FHS': 3660L,
 u'GOLDN': 904L,
 u'GenSalt': 1695L,
 u'GeneSTAR': 1545L,
 u'HyperGEN_GENOA': 2822L,
 u'JHS': 3136L,
 u'MESA': 4178L,
 u'PGX_Asthma': 1366L,
 u'SAFS': 1509L,
 u'SAS': 1208L,
 u'Sarcoidosis': 608L,
 u'VTE': 4864L,
 u'WHI': 10047L,
 u'nan': 76L}


In [8]:
# Run variant-level QC, cache the result, then run sample-level QC.
# variant_qc supplies the va.qc fields (AC/AF) used by the later annotation
# cell; sample_qc supplies the sa.qc struct shown in the sample schema.
# cache() sits between the two QC steps -- presumably so the variant QC
# results are not recomputed; confirm.
vds = (
    vds
    .variant_qc()
    .cache()
    .sample_qc()
)

In [9]:
# Show the sample annotation schema: the joined workspace fields plus sa.qc.
sample_schema = vds.sample_schema
pprint(sample_schema)

Struct{
     ent_sample_id: String,
     CENTER: String,
     study: String,
     topmed_project: String,
     qc: Struct{
         callRate: Double,
         nCalled: Int,
         nNotCalled: Int,
         nHomRef: Int,
         nHet: Int,
         nHomVar: Int,
         nSNP: Int,
         nInsertion: Int,
         nDeletion: Int,
         nSingleton: Int,
         nTransition: Int,
         nTransversion: Int,
         dpMean: Double,
         dpStDev: Double,
         gqMean: Double,
         gqStDev: Double,
         nNonRef: Int,
         rTiTv: Double,
         rHetHomVar: Double,
         rInsertionDeletion: Double
     }
 }


In [None]:
## actually want to look at proportion per cohort of singletons/doubles/etc

# Per-sample counts of heterozygous calls, binned by the variant's alternate
# allele count (va.qc.AC) or frequency (va.qc.AF) from variant_qc:
#   sa.nDoubles    AC == 2
#   sa.nTri_to_one AC >= 3 and AF <= 0.1%
#   sa.nOne        0.1% < AF <= 1%
#   sa.nTen        1%   < AF <= 10%
#   sa.nTen_above  AF > 10%
# The AF bins share inclusive upper bounds so no frequency falls between two
# bins, and the "tripletons to 0.1%" bin covers the whole range rather than
# AC == 3 only (otherwise variants with AC > 3 and AF <= 0.1% are counted in
# no bin at all).
# Only heterozygous calls are counted; homozygous-alternate carriers are not.
# All annotations are submitted in a single annotate_samples_expr call.
vds = vds.annotate_samples_expr([
    'sa.nDoubles = gs.filter(g => g.isHet() && va.qc.AC == 2).count()',
    'sa.nTri_to_one = gs.filter(g => g.isHet() && va.qc.AC >= 3 && va.qc.AF <= 0.001).count()',
    'sa.nOne = gs.filter(g => g.isHet() && va.qc.AF > 0.001 && va.qc.AF <= 0.01).count()',
    'sa.nTen = gs.filter(g => g.isHet() && va.qc.AF > 0.01 && va.qc.AF <= 0.1).count()',
    'sa.nTen_above = gs.filter(g => g.isHet() && va.qc.AF > 0.1).count()',
])

In [None]:
# Sum each per-sample count within every TOPMed project and show the result
# as a pandas DataFrame (one row per project).
project_key = ['Pop = sa.topmed_project']
count_sums = [
    'Singletons = sa.map(s => sa.qc.nSingleton).stats().sum',
    'Doubletons = sa.map(s => sa.nDoubles).stats().sum',
    'Tripletons_to_01 = sa.map(sa => sa.nTri_to_one).stats().sum',
    'Zero_1_to_1 = sa.map(sa => sa.nOne).stats().sum',
    'One_to_10 = sa.map(sa => sa.nTen).stats().sum',
    'Ten_above = sa.map(sa => sa.nTen_above).stats().sum',
]
vds.samples_table().aggregate_by_key(key_expr=project_key, agg_expr=count_sums).to_pandas()

## Running PCA on Freeze 5b Genotypes

In [None]:
# Run PCA on the genotypes, keeping the first 5 components. Scores are
# written to sa.pca and eigenvalues to global.eigen.
pca = vds.pca(
    'sa.pca',
    k=5,
    eigenvalues='global.eigen',
)

### Visualize the differences in families within the studies by principal components.

In [None]:
# define some colors
c = ["aec7e8","ff7f0e","ffbb78","2ca02c","98df8a","d62728","ff9896","9467bd","c5b0d5","8c564b","c49c94"]
cz = zip(vds.query_samples('samples.map(s => sa.topmed_project).counter()').keys(),c)
colors = {t[0]: t[1] for t in cz}
# show the pca results
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import Counter
from math import log, isnan
from pprint import pprint
%matplotlib inline
pca_table = pca.samples_table().to_pandas()
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"],
            c = pca_table["sa.metadata.SuperPopulation"].map(colors),
            alpha = .5)
plt.xlim(-0.6, 0.6)
plt.xlabel("PC1")
plt.ylabel("PC2")
legend_entries = [mpatches.Patch(color=c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()